Context Navigation

plinkbinJZ.py @ 3

リビジョン 2, 35.6 KB (コミッタ: hatakeyama, 15 年前)
import galaxy-central

行番号
1	#!/usr/bin/env python2.4
2	"""
3	"""
4
5	import optparse,os,subprocess,gzip,struct,time,commands
6	from array import array
7
8	#from AIMS import util
9	#from pga import util as pgautil
10
__FILE_ID__ = '$Id: plinkbinJZ.py,v 1.14 2009/07/13 20:16:50 rejpz Exp $'

# Gates the progress messages printed by the parsers in this module.
VERBOSE = True

# Allele symbols that are treated as "no call" when reading .ped genotypes.
MISSING_ALLELES = set(['N', '0', '.', '-', ''])

# Chromosome labels 1..22, both as ints and as strings, so a membership test
# succeeds whichever type the caller holds.  (list(...) keeps the
# concatenation valid on interpreters where range() is not a list.)
AUTOSOMES = set(list(range(1, 23)) + [str(c) for c in range(1, 23)])

# The three .bed header bytes spelled as bit strings.  LPed.writebed reverses
# each string before converting it to an integer.
MAGIC_BYTE1 = '00110110'
MAGIC_BYTE2 = '11011000'
FORMAT_SNP_MAJOR_BYTE = '10000000'
FORMAT_IND_MAJOR_BYTE = '00000000'

# The same header bytes expressed as the 4-tuples produced by INT_TO_GCODE;
# Bed.parse compares the decoded header against MAGIC1 / MAGIC2.
MAGIC1 = (0, 3, 1, 2)
MAGIC2 = (3, 1, 2, 0)
FORMAT_SNP_MAJOR = (2, 0, 0, 0)
FORMAT_IND_MAJOR = (0, 0, 0, 0)

# Number of header bytes that precede the genotype data in a .bed file.
HEADER_LENGTH = 3

# Integer genotype codes ("gcodes") stored in the per-marker genotype arrays.
HOM0 = 3
HOM1 = 0
MISS = 2
HET = 1

# Genotypes as pairs of allele indices; -9 marks a missing allele.
HOM0_GENO = (0, 0)
HOM1_GENO = (1, 1)
HET_GENO = (0, 1)
MISS_GENO = (-9, -9)

# Allele-index pair -> gcode.
GENO_TO_GCODE = {
    HOM0_GENO: HOM0,
    HET_GENO: HET,
    HOM1_GENO: HOM1,
    MISS_GENO: MISS,
}

# Non-numeric chromosome labels and the numeric label each is replaced by.
CHROM_REPLACE = {
    'X': '23',
    'Y': '24',
    'XY': '25',
    'MT': '26',
    'M': '26',
}

# Message template for MapLineTooShort; takes exactly one argument, the
# offending line.
MAP_LINE_EXCEPTION_TEXT = """
One or more lines in the *.map file has only three fields.
The line was:

%s

If you are running rgGRR through EPMP, this is usually a
sign that you are using an old version of the map file.
You can correct the problem by re-running Subject QC. If
you have already tried this, please contact the developers,
or file a bug.
"""
65
def _build_int_to_gcode():
    """ Build the byte-value -> four-gcode lookup table

    Each byte holds four 2-bit fields.  Field j (j = 0..3) occupies bits
    2j and 2j+1 of the byte, and its two bits are swapped to give the
    gcode, so the raw field values 0, 1, 2, 3 become gcodes 0, 2, 1, 3.
    For example byte 27 (binary 00011011) maps to (3, 1, 2, 0).

    Returns a dict with keys 0..255 whose values are array('i') objects
    of length 4.
    """
    table = {}
    for value in range(256):
        codes = []
        for slot in range(4):
            pair = (value >> (2 * slot)) & 3
            # Swap the low and high bit of the 2-bit field.
            codes.append(((pair & 1) << 1) | (pair >> 1))
        table[value] = array('i', codes)
    return table


# Byte value (0..255) -> array('i') of the four gcodes packed in that byte.
INT_TO_GCODE = _build_int_to_gcode()

# Inverse mapping: 4-tuple of gcodes -> byte value.
GCODE_TO_INT = dict([(tuple(v), k) for (k, v) in INT_TO_GCODE.items()])
134
### Exceptions
class DuplicateMarkerInMapFile(Exception):
    """ Raised when a marker name occurs more than once in a map file
    """
    pass


class MapLineTooShort(Exception):
    """ Raised when a map file line has fewer than four fields
    """
    pass


class ThirdAllele(Exception):
    """ Raised when a ped file shows more than two distinct alleles for a marker
    """
    pass


class PedError(Exception):
    """ Raised when a ped genotype cannot be assigned a genotype code
    """
    pass


class BadMagic(Exception):
    """ Raised when one of the MAGIC bytes in a bed file does not match
    """
    pass


class BedError(Exception):
    """ Raised when parsing a bed file runs into problems
    """
    pass


class UnknownGenocode(Exception):
    """ Raised when we get a 2-bit genotype that is undecipherable (is it possible?)
    """
    pass


class UnknownGeno(Exception):
    """ Raised when an allele-index pair has no entry in GENO_TO_GCODE
    """
    pass
153
154	### Utility functions
155
def timenow():
    """return current local time as a 'dd/mm/YYYY HH:MM:SS' string
    """
    # time.localtime() with no argument already uses the current time.
    return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime())
160
def ceiling(n, k):
    ''' Return the least multiple of k which is greater than or equal to n

    n is returned unchanged when it is already a multiple of k.
    '''
    remainder = n % k
    if remainder == 0:
        return n
    return n + k - remainder
169
def nbytes(n):
    ''' Return the number of bytes required for n subjects

    Each subject takes 2 bits, so one byte holds four subjects and the
    count is rounded up to a whole byte.  This equals the original
    2*ceiling(n, 4)/8, written with explicit floor division so the result
    stays an integer on every interpreter.
    '''
    return (n + 3) // 4
174
175	### Primary module functionality
class LPed:
    """ The uber-class for processing the Linkage-format .ped/.map files

    Wraps a Ped built from '<base>.ped' and a Map built from '<base>.map'.
    After parse() the marker, subject and genotype tables of those two
    objects are also available on this instance.
    """
    def __init__(self, base):
        self.base = base
        self._ped = Ped('%s.ped' % (self.base))
        self._map = Map('%s.map' % (self.base))

        self._markers = {}
        self._ordered_markers = []
        self._marker_allele_lookup = {}
        self._autosomal_indices = set()

        self._subjects = {}
        self._ordered_subjects = []

        self._genotypes = []

    def parse(self):
        """ Parse the map file, then the ped file, then record each marker's alleles
        """
        if VERBOSE: print('plinkbinJZ: Analysis started: %s' % (timenow()))
        self._map.parse()
        self._markers = self._map._markers
        self._ordered_markers = self._map._ordered_markers
        self._autosomal_indices = self._map._autosomal_indices

        self._ped.parse(self._ordered_markers)
        self._subjects = self._ped._subjects
        self._ordered_subjects = self._ped._ordered_subjects
        self._genotypes = self._ped._genotypes
        self._marker_allele_lookup = self._ped._marker_allele_lookup

        ### Adjust self._markers based on the allele information
        ### we got from parsing the ped file
        for m, name in enumerate(self._ordered_markers):
            a1, a2 = self._marker_allele_lookup[m][HET]
            self._markers[name][-2] = a1
            self._markers[name][-1] = a2
        if VERBOSE: print('plinkbinJZ: Analysis finished: %s' % (timenow()))

    def getSubjectInfo(self, fid, oiid):
        """ Return the subject record stored under the key (fid, oiid)

        Raises KeyError when no such subject was parsed.
        """
        # The records live in self._subjects; no _subject_info attribute
        # is ever created on this class.
        return self._subjects[(fid, oiid)]

    def getSubjectInfoByLine(self, line):
        """ Return the subject record for the line-th subject in file order
        """
        return self._subjects[self._ordered_subjects[line]]

    def getGenotypesByIndices(self, s, mlist, format):
        """ needed for grr if lped - deprecated but..

        Return the genotypes of subject index s for the marker indices in
        mlist.  Results are ordered by ascending marker index with
        duplicates removed; indices that have no genotype row are ignored.

        format == 'raw': array('i') of gcodes.
        format == 'ref': array('i') with HOM0 -> 3, HET -> 2, HOM1 -> 1
                         and anything else -> 0.
        otherwise:       list of allele pairs from _marker_allele_lookup.
        """
        wanted = set(mlist)
        # Remember which marker each result slot belongs to, so the allele
        # lookup below uses the marker index and not the slot position.
        selected = [m for m, _row in enumerate(self._genotypes) if m in wanted]
        raw_array = array('i', [self._genotypes[m][s] for m in selected])
        if format == 'raw':
            return raw_array
        elif format == 'ref':
            ref_counts = {HOM0: 3, HET: 2, HOM1: 1}
            return array('i', [ref_counts.get(gcode, 0) for gcode in raw_array])
        else:
            lookup = self._marker_allele_lookup
            return [lookup[m][gcode] for m, gcode in zip(selected, raw_array)]

    def writebed(self, base):
        """ Write the parsed data as '<base>.fam', '<base>.bim' and '<base>.bed'

        The genotype file is written SNP-major, one padded row of bytes per
        marker.  The in-memory genotype arrays are left unmodified, so the
        method can be called more than once.
        """
        dst_name = '%s.fam' % (base)
        print('Writing pedigree information to [ %s ]' % (dst_name))
        dst = open(dst_name, 'w')
        try:
            for skey in self._ordered_subjects:
                (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid) = self._subjects[skey]
                dst.write('%s %s %s %s %s %s\n' % (fid, iid, did, mid, sex, phe))
        finally:
            dst.close()

        dst_name = '%s.bim' % (base)
        print('Writing map (extended format) information to [ %s ]' % (dst_name))
        dst = open(dst_name, 'w')
        try:
            for marker in self._ordered_markers:
                chrom, name, genpos, abspos, a1, a2 = self._markers[marker]
                dst.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (chrom, name, genpos, abspos, a1, a2))
        finally:
            dst.close()

        bed_name = '%s.bed' % (base)
        print('Writing genotype bitfile to [ %s ]' % (bed_name))
        print('Using (default) SNP-major mode')
        # Binary mode: the payload is raw bytes, not text.
        bed = open(bed_name, 'wb')
        try:
            ### Write the 3 header bytes
            bed.write(struct.pack('B', int(''.join(reversed(MAGIC_BYTE1)), 2)))
            bed.write(struct.pack('B', int(''.join(reversed(MAGIC_BYTE2)), 2)))
            bed.write(struct.pack('B', int(''.join(reversed(FORMAT_SNP_MAJOR_BYTE)), 2)))

            ### Calculate how many "pad bits" we should add after the last subject
            nsubjects = len(self._ordered_subjects)
            total_bytes = nbytes(nsubjects)
            nbits = nsubjects * 2
            pad_nibbles = ((total_bytes * 8) - nbits) // 2
            pad = array('i', [0]*pad_nibbles)

            ### And now write genotypes to the file
            for m, _name in enumerate(self._ordered_markers):
                # Concatenation builds a new array; extending the stored
                # array in place would grow it on every call.
                geno = self._genotypes[m] + pad
                row_bytes = array('B')
                for b in range(len(geno) // 4):
                    idx = b*4
                    gcode = tuple(geno[idx:idx+4])
                    try:
                        row_bytes.append(GCODE_TO_INT[gcode])
                    except KeyError:
                        # Report where the unencodable quartet was found.
                        print('%s %s %s' % (m, b, gcode))
                        raise
                row_bytes.tofile(bed)
        finally:
            bed.close()

    def autosomal_indices(self):
        """ Return the indices of markers in this ped/map that are autosomal.
        This is used by rgGRR so that it can select a random set of markers
        from the autosomes (sex chroms screw up the plot)
        """
        return self._autosomal_indices
312
class Ped:
    """ Parser for a Linkage-format .ped file (optionally gzip-compressed)
    """
    def __init__(self, path):
        self.path = path
        self._subjects = {}
        self._ordered_subjects = []
        self._genotypes = []
        self._marker_allele_lookup = {}

    def lineCount(self, infile):
        """ Count the lines in infile, including blank lines

        The file is read directly rather than through an external 'wc -l',
        so a '.gz' file is counted after decompression, a final line
        without a trailing newline is included, and the path is never
        passed through a shell.
        """
        if infile.endswith('.gz'):
            fh = gzip.open(infile, 'rb')
        else:
            fh = open(infile, 'rb')
        try:
            count = 0
            for _line in fh:
                count += 1
        finally:
            fh.close()
        return count

    def parse(self, markers):
        """ Parse a given file -- this needs to be memory-efficient so that large
        files can be parsed (~1 million markers on ~5000 subjects?). It
        should also be fast, if possible.

        markers is the ordered list of marker names; the m-th marker is read
        from allele columns 2m and 2m+1 of every subject line.  Fills
        self._subjects, self._ordered_subjects, self._genotypes (one
        array('i') of gcodes per marker, indexed by subject order) and
        self._marker_allele_lookup.

        Raises ThirdAllele when a marker shows more than two distinct
        alleles, and PedError / UnknownGeno when a genotype cannot be coded.
        """
        ### Find out how many lines are in the file so we can ...
        nsubjects = self.lineCount(self.path)
        ### ... Pre-allocate the genotype arrays
        _marker_alleles = [['0', '0'] for _ in markers]
        self._genotypes = [array('i', [-1]*nsubjects) for _ in markers]
        genotype_rows = self._genotypes

        if self.path.endswith('.gz'):
            pfile = gzip.open(self.path, 'r')
        else:
            pfile = open(self.path, 'r')

        nread = 0   # subjects stored so far; also the next subject index
        try:
            for line in pfile:
                line = line.strip()
                if not line:
                    continue

                fid, iid, did, mid, sex, phe, genos = line.split(None, 6)
                sid = iid.split('.')[0]
                d_sid = did.split('.')[0]
                m_sid = mid.split('.')[0]

                skey = (fid, iid)
                self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid)
                self._ordered_subjects.append(skey)
                # Index by subject order, not by file line, so that blank
                # lines do not leave holes in the genotype arrays.
                s = nread
                nread += 1

                genotypes = genos.split()

                for m, seen in enumerate(_marker_alleles):
                    idx = m*2
                    a1, a2 = genotypes[idx:idx+2] # Alleles for subject s, marker m
                    s1, s2 = seen                 # Alleles seen for marker m

                    ### FIXME: I think this can still be faster, and simpler to read
                    # Two pieces of logic intertwined here: first, we need to code
                    # this genotype as HOM0, HOM1, HET or MISS. Second, we need to
                    # keep an ongoing record of the genotypes seen for this marker
                    if a1 == a2:
                        if a1 in MISSING_ALLELES:
                            geno = MISS_GENO
                        else:
                            if s1 == '0':
                                seen[0] = a1
                            elif s1 == a1 or s2 == a2:
                                pass
                            elif s2 == '0':
                                seen[1] = a1
                            else:
                                raise ThirdAllele('a1=a2=%s, seen=%s?' % (a1, str(seen)))

                            if a1 == seen[0]:
                                geno = HOM0_GENO
                            elif a1 == seen[1]:
                                geno = HOM1_GENO
                            else:
                                raise PedError('Cannot assign geno for a1=a2=%s from seen=%s' % (a1, str(seen)))
                    elif a1 in MISSING_ALLELES or a2 in MISSING_ALLELES:
                        geno = MISS_GENO
                    else:
                        geno = HET_GENO
                        if s1 == '0':
                            seen[0] = a1
                            seen[1] = a2
                        elif s2 == '0':
                            if s1 == a1:
                                seen[1] = a2
                            elif s1 == a2:
                                seen[1] = a1
                            else:
                                raise ThirdAllele('a1=%s, a2=%s, seen=%s?' % (a1, a2, str(seen)))
                        else:
                            if sorted(seen) != sorted((a1, a2)):
                                raise ThirdAllele('a1=%s, a2=%s, seen=%s?' % (a1, a2, str(seen)))

                    gcode = GENO_TO_GCODE.get(geno, None)
                    if gcode is None:
                        raise UnknownGeno(str(geno))
                    genotype_rows[m][s] = gcode
        finally:
            pfile.close()

        # Blank lines were counted when pre-allocating; drop the unused tail
        # so every array has exactly one entry per subject.
        if nread < nsubjects:
            for row in genotype_rows:
                del row[nread:]

        # Build the _marker_allele_lookup table
        for m, alleles in enumerate(_marker_alleles):
            # Above, a homozygote for the first allele seen is coded HOM0 and
            # a homozygote for the second is coded HOM1.  The table decodes
            # HOM0 as (a2, a2) and HOM1 as (a1, a1), so the first-seen allele
            # has to be a2; the reverse assignment would decode every
            # homozygote as the opposite allele.
            a2, a1 = alleles

            self._marker_allele_lookup[m] = {
                HOM0: (a2, a2),
                HOM1: (a1, a1),
                HET : (a1, a2),
                MISS: ('0','0'),
                }

        if VERBOSE: print('%s(%s) individuals read from [ %s ]' % (len(self._subjects), nsubjects, self.path))
431
class Map:
    """ Parser for a Linkage-format .map file (optionally gzip-compressed)
    """
    def __init__(self, path=None):
        self.path = path
        self._markers = {}
        self._ordered_markers = []
        self._autosomal_indices = set()

    def __len__(self):
        return len(self._markers)

    def parse(self):
        """ Parse a Linkage-format map file

        Every non-blank line must supply at least four whitespace-separated
        fields: chromosome, marker name, genetic position and absolute
        position; extra fields are ignored.  Lines whose absolute position
        is negative are skipped.

        Fills self._markers (name -> [chrom, name, genpos, abspos, None,
        None]), self._ordered_markers and self._autosomal_indices (indices
        into self._ordered_markers).

        Raises MapLineTooShort for a line with fewer than four fields and
        DuplicateMarkerInMapFile for a repeated marker name.

        NOTE(review): Ped.parse reads allele columns by position in
        _ordered_markers, so a skipped negative-position line presumably
        shifts the columns of every later marker -- confirm against real
        data before relying on that feature.
        """
        if self.path.endswith('.gz'):
            fh = gzip.open(self.path, 'r')
        else:
            fh = open(self.path, 'r')

        total = 0   # non-blank marker lines seen, skipped ones included
        try:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                total += 1

                fields = line.split()
                if len(fields) < 4:
                    # The template has a single %s placeholder.
                    raise MapLineTooShort(MAP_LINE_EXCEPTION_TEXT % (str(line),))
                chrom, name, genpos, abspos = fields[:4]
                if name in self._markers:
                    raise DuplicateMarkerInMapFile('Marker %s was found twice in map file %s' % (name, self.path))
                abspos = int(abspos)
                if abspos < 0:
                    continue
                if chrom in AUTOSOMES:
                    # Index of this marker in _ordered_markers; this is the
                    # index the genotype rows are addressed by, and it
                    # differs from the file line number once a line has
                    # been skipped.
                    self._autosomal_indices.add(len(self._ordered_markers))
                chrom = CHROM_REPLACE.get(chrom, chrom)
                self._markers[name] = [chrom, name, genpos, abspos, None, None]
                self._ordered_markers.append(name)
        finally:
            fh.close()
        if VERBOSE: print('%s (of %s) markers to be included from [ %s ]' % (len(self._ordered_markers), total, self.path))
472
class BPed:
    """ The uber-class for processing Plink's Binary Ped file format .bed/.bim/*.fam

    Wraps a Bed, a Bim and a Fam object built from '<base>.bed',
    '<base>.bim' and '<base>.fam'.  In quick mode genotypes are read from
    the open bed file on demand; otherwise they are held in memory.
    """
    def __init__(self, base):
        self.base = base
        self._bed = Bed('%s.bed' % (self.base))
        self._bim = Bim('%s.bim' % (self.base))
        self._fam = Fam('%s.fam' % (self.base))

        self._markers = {}
        self._ordered_markers = []
        self._marker_allele_lookup = {}
        self._autosomal_indices = set()

        self._subjects = {}
        self._ordered_subjects = []

        self._genotypes = []
        self._quick = False

    def parse(self, quick=False):
        """ Parse the bim, fam and bed files, in that order

        With quick=True the bed file is only opened and checked; genotypes
        are then fetched by seeking into it.
        """
        self._quick = quick

        self._bim.parse()
        self._markers = self._bim._markers
        self._ordered_markers = self._bim._ordered_markers
        self._marker_allele_lookup = self._bim._marker_allele_lookup
        self._autosomal_indices = self._bim._autosomal_indices

        self._fam.parse()
        self._subjects = self._fam._subjects
        self._ordered_subjects = self._fam._ordered_subjects

        self._bed.parse(self._ordered_subjects, self._ordered_markers, quick=quick)
        self._bedf = self._bed._fh
        self._genotypes = self._bed._genotypes
        self.nsubjects = len(self._ordered_subjects)
        self.nmarkers = len(self._ordered_markers)
        self._bytes_per_marker = nbytes(self.nsubjects)

    def _read_gcode(self, s, m):
        """ Quick mode: read the gcode of subject s at marker m from the bed file
        """
        # Rows are SNP-major: skip the header, then m whole rows, then the
        # bytes holding earlier subjects (four subjects per byte).
        seek_byte = m * self._bytes_per_marker + s // 4 + HEADER_LENGTH
        self._bedf.seek(seek_byte)
        geno = struct.unpack('B', self._bedf.read(1))[0]
        return INT_TO_GCODE[geno][s % 4]

    def writeped(self, path=None):
        """ Write the data as a Linkage-format .map / .ped pair

        path names the output; a trailing '.bed' is stripped and '.map' /
        '.ped' are appended to what remains.  When path is omitted, the
        path of a previous call is reused, falling back to '<base>.bed'.
        """
        path = path or getattr(self, 'path', None) or ('%s.bed' % (self.base))
        self.path = path
        # Derive both names from a common stem.  Plain str.replace would
        # leave a path without '.bed' unchanged, so the ped file would
        # overwrite the map file.
        if path.endswith('.bed'):
            stem = path[:-len('.bed')]
        else:
            stem = path

        map_name = '%s.map' % (stem)
        print('Writing map file [ %s ]' % (map_name))
        dst = open(map_name, 'w')
        try:
            for m in self._ordered_markers:
                chrom, snp, genpos, abspos, a1, a2 = self._markers[m]
                dst.write('%s\t%s\t%s\t%s\n' % (chrom, snp, genpos, abspos))
        finally:
            dst.close()

        ped_name = '%s.ped' % (stem)
        print('Writing ped file [ %s ]' % (ped_name))
        ped = open(ped_name, 'w')
        try:
            for s, skey in enumerate(self._ordered_subjects):
                (fid, iid, did, mid, sex, phe, oiid, odid, omid) = self._subjects[skey]
                # NOTE(review): the parent columns are written from the
                # '.'-stripped ids while iid is written in full; kept as in
                # the original -- confirm this is intended.
                ped.write('%s %s %s %s %s %s' % (fid, iid, odid, omid, sex, phe))
                for a1, a2 in self.getGenotypesForSubject(s):
                    ped.write(' %s %s' % (a1, a2))
                ped.write('\n')
        finally:
            ped.close()

    def getGenotype(self, subject, marker):
        """ Retrieve a genotype for a particular subject/marker pair

        subject is a (fid, iid) key and marker a marker name; ValueError is
        raised when either is unknown.
        """
        m = self._ordered_markers.index(marker)
        s = self._ordered_subjects.index(subject)
        return self.getGenotypeByIndices(s, m)

    def getGenotypesForSubject(self, s, raw=False):
        """ Returns list of genotypes for all m markers
        for subject s. If raw==True, then an array
        of raw integer gcodes is returned instead
        """
        if self._quick:
            # One entry per bim line, matching the index space of
            # _marker_allele_lookup.
            raw_array = array('i', [self._read_gcode(s, m)
                                    for m, _snp in enumerate(self._ordered_markers)])
        else:
            raw_array = array('i', [row[s] for row in self._genotypes])

        if raw:
            return raw_array
        lookup = self._marker_allele_lookup
        return [lookup[m][gcode] for m, gcode in enumerate(raw_array)]

    def getGenotypeByIndices(self, s, m):
        """ Return the allele pair of subject index s at marker index m
        """
        if self._quick:
            gcode = self._read_gcode(s, m)
        else:
            # Otherwise, just grab the genotypes from the
            # list of arrays
            gcode = self._genotypes[m][s]

        return self._marker_allele_lookup[m][gcode]

    def getGenotypesByIndices(self, s, mlist, format):
        """ Return the genotypes of subject index s for the marker indices in mlist

        In quick mode the result follows the order of mlist; otherwise it is
        ordered by ascending marker index with duplicates removed.

        format == 'raw': array('i') of gcodes.
        format == 'ref': array('i') with HOM0 -> 3, HET -> 2, HOM1 -> 1
                         and anything else -> 0.
        otherwise:       list of allele pairs from _marker_allele_lookup.
        """
        # 'selected' records which marker each result slot belongs to, so
        # the allele lookup uses the marker index, not the slot position.
        if self._quick:
            selected = list(mlist)
            raw_array = array('i', [self._read_gcode(s, m) for m in selected])
        else:
            wanted = set(mlist)
            selected = [m for m, _row in enumerate(self._genotypes) if m in wanted]
            raw_array = array('i', [self._genotypes[m][s] for m in selected])

        if format == 'raw':
            return raw_array
        elif format == 'ref':
            ref_counts = {HOM0: 3, HET: 2, HOM1: 1}
            return array('i', [ref_counts.get(gcode, 0) for gcode in raw_array])
        else:
            lookup = self._marker_allele_lookup
            return [lookup[m][gcode] for m, gcode in zip(selected, raw_array)]

    def getSubject(self, s):
        """ Return the subject record of the s-th subject in file order
        """
        skey = self._ordered_subjects[s]
        return self._subjects[skey]

    def autosomal_indices(self):
        """ Return the indices of markers in this ped/map that are autosomal.
        This is used by rgGRR so that it can select a random set of markers
        from the autosomes (sex chroms screw up the plot)
        """
        return self._autosomal_indices
648
class Bed:
    """ Reader for the genotype bitfile (.bed) of a binary ped file set
    """

    def __init__(self, path):
        self.path = path
        self._genotypes = []
        self._fh = None

    def parse(self, subjects, markers, quick=False):
        """ Open and validate the bed file at self.path and, unless quick is
        True, load every genotype into self._genotypes.

        subjects and markers are the ordered subject keys and marker names;
        only their lengths are used here.  If quick is True the method stops
        after the header check, and genotypes are looked up dynamically by
        indices through the file handle left open in self._fh.

        Raises BadMagic when the first two header bytes are wrong, and
        BedError when the file is shorter than expected or the format byte
        is zero (i.e. not SNP-major).
        """
        self._quick = quick
        # Start from an empty list so that parsing twice does not append
        # a second copy of every marker row.
        self._genotypes = []

        nsubjects = len(subjects)
        nmarkers = len(markers)

        bed = open(self.path, 'rb')
        self._fh = bed
        try:
            header = bed.read(HEADER_LENGTH)
            if len(header) != HEADER_LENGTH:
                raise BedError('%s is too short to hold the %s header bytes' % (self.path, HEADER_LENGTH))
            byte1, byte2, format_flag = struct.unpack('BBB', header)

            h1 = tuple(INT_TO_GCODE[byte1])
            h2 = tuple(INT_TO_GCODE[byte2])
            h3 = tuple(INT_TO_GCODE[format_flag])

            if h1 != MAGIC1 or h2 != MAGIC2:
                raise BadMagic('One or both MAGIC bytes is wrong: %s==%s or %s==%s' % (h1, MAGIC1, h2, MAGIC2))
            if format_flag:
                print('Detected that binary PED file is v1.00 SNP-major mode (%s, "%s")\n' % (format_flag, h3))
            else:
                # A plain string cannot be raised; use the module's own
                # exception type for bed-file problems.
                raise BedError('BAD_FORMAT_FLAG? (%s, "%s")\n' % (format_flag, h3))

            print('Parsing binary ped file for %s markers and %s subjects' % (nmarkers, nsubjects))

            ### If quick mode was specified, we're done ...
            if quick:
                return

            ### ... Otherwise, parse genotypes into an array, and append that
            ### array to self._genotypes
            bytes_per_marker = nbytes(nsubjects)
            for m, _marker in enumerate(markers):
                # Read one whole marker row at a time instead of byte by byte.
                chunk = bed.read(bytes_per_marker)
                if len(chunk) != bytes_per_marker:
                    raise BedError('%s ends inside the genotypes of marker %s' % (self.path, m))
                genotype_array = array('i')
                for intval in array('B', chunk):
                    # Each byte expands to four gcodes, so the finished array
                    # has ceiling(nsubjects, 4) entries, padding included.
                    genotype_array.extend(INT_TO_GCODE[intval])
                self._genotypes.append(genotype_array)
        except Exception:
            # Do not leave a handle open on a file that failed to parse.
            bed.close()
            self._fh = None
            raise
706
class Bim:
    """ Parser for the extended map file (.bim) of a binary ped file set
    """
    def __init__(self, path):
        """ Remember the path; nothing is read until parse() is called
        """
        self.path = path
        self._markers = {}
        self._ordered_markers = []
        self._marker_allele_lookup = {}
        self._autosomal_indices = set()

    def parse(self):
        """ Read the bim file at self.path

        Every non-blank line must hold exactly six whitespace-separated
        fields: chromosome, marker name, genetic position, absolute
        position, allele 1 and allele 2.  Fills self._markers,
        self._ordered_markers, self._marker_allele_lookup (marker index ->
        gcode -> allele pair) and self._autosomal_indices.
        """
        print('Reading map (extended format) from [ %s ]' % (self.path))
        bim = open(self.path, 'r')
        try:
            for line in bim:
                fields = line.split()
                if not fields:
                    continue    # tolerate blank lines
                chrom, snp, gpos, apos, a1, a2 = fields
                # Marker index = position among the markers stored so far.
                m = len(self._ordered_markers)
                self._markers[snp] = (chrom, snp, gpos, apos, a1, a2)
                self._marker_allele_lookup[m] = {
                    HOM0: (a2, a2),
                    HOM1: (a1, a1),
                    HET : (a1, a2),
                    MISS: ('0','0'),
                    }
                self._ordered_markers.append(snp)
                if chrom in AUTOSOMES:
                    self._autosomal_indices.add(m)
        finally:
            bim.close()
        # Count from the list so that an empty file reports 0 instead of
        # failing on an unset loop variable.
        print('%s markers to be included from [ %s ]' % (len(self._ordered_markers), self.path))
736
class Fam:
    """ Parser for the pedigree file (.fam) of a binary ped file set
    """
    def __init__(self, path):
        """ Remember the path; nothing is read until parse() is called
        """
        self.path = path
        self._subjects = {}
        self._ordered_subjects = []

    def parse(self):
        """ Read the fam file at self.path

        Every non-blank line must hold exactly six whitespace-separated
        fields: family id, individual id, father id, mother id, sex and
        phenotype.  Each subject is stored under the key (fid, iid) as a
        9-tuple: the six fields followed by the individual, father and
        mother ids truncated at their first '.'.
        """
        print('Reading pedigree information from [ %s ]' % (self.path))
        fam = open(self.path, 'r')
        try:
            for line in fam:
                fields = line.split()
                if not fields:
                    continue    # tolerate blank lines
                fid, iid, did, mid, sex, phe = fields
                sid = iid.split('.')[0]
                d_sid = did.split('.')[0]
                m_sid = mid.split('.')[0]
                skey = (fid, iid)
                self._ordered_subjects.append(skey)
                self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid)
        finally:
            fam.close()
        # Count from the list so that an empty file reports 0 instead of
        # failing on an unset loop variable.
        print('%s individuals read from [ %s ]' % (len(self._ordered_subjects), self.path))
760
761	### Command-line functionality and testing
def test(arg):
    ''' Time the binary and Linkage parsers and writers on one file set

    arg is the base name of the file set; a trailing '.ped' is accepted
    and removed.  The genotype spot checks only run for 'CAMP_AFFY.ped',
    whose expected values are hard-coded below.
    '''

    import time

    # BPed and LPed both take a base name and append the extensions
    # themselves.
    base = arg
    if base.endswith('.ped'):
        base = base[:-len('.ped')]

    if arg == 'CAMP_AFFY.ped':
        print('Testing bed.parse(quick=True)')
        s = time.time()
        # getGenotype lives on BPed (Bed has no such method), and
        # BPed.parse is what supplies subjects and markers to Bed.parse.
        bed = BPed(base)
        bed.parse(quick=True)
        print(bed.getGenotype(('400118', '10300283'), 'rs2000467'))
        print(bed.getGenotype(('400118', '10101384'), 'rs2294019'))
        print(bed.getGenotype(('400121', '10101149'), 'rs2294019'))
        print(bed.getGenotype(('400123', '10200290'), 'rs2294019'))
        assert bed.getGenotype(('400118', '10101384'), 'rs2294019') == ('4','4')
        e = time.time()
        print('e-s = %s\n' % (e-s))

    print('Testing bed.parse')
    s = time.time()
    bed = BPed(base)
    bed.parse(quick=False)
    e = time.time()
    print('e-s = %s\n' % (e-s))

    print('Testing bed.writeped')
    s = time.time()
    # writeped derives the .map and .ped names from a path ending in '.bed'.
    outname = '%s_BEDTEST.bed' % (base)
    bed.writeped(outname)
    e = time.time()
    print('e-s = %s\n' % (e-s))
    del(bed)

    print('Testing ped.parse')
    s = time.time()
    ped = LPed(base)
    ped.parse()
    e = time.time()
    print('e-s = %s\n' % (e-s))

    print('Testing ped.writebed')
    s = time.time()
    outname = '%s_PEDTEST' % (base)
    ped.writebed(outname)
    e = time.time()
    print('e-s = %s\n' % (e-s))
    del(ped)
810
def profile_bed(arg):
    """ Profiling workload: load the binary file set named by arg in full,
    then write it back out as '<arg>_BEDPROFILE'
    """
    source = BPed(arg)
    source.parse(quick=False)
    source.writeped('%s_BEDPROFILE' % (arg))
818
def profile_ped(arg):
    """ Profiling workload: parse the Linkage file set named by arg, then
    write it back out in binary form as '<arg>_PEDPROFILE'
    """
    source = LPed(arg)
    source.parse()
    source.writebed('%s_PEDPROFILE' % (arg))
826
if __name__ == '__main__':
    # Run as a command-line, this script should get one or more arguments,
    # each one a ped file to be parsed with the PedParser (unit tests?)
    op = optparse.OptionParser()
    op.add_option('--profile-bed', action='store_true', default=False)
    op.add_option('--profile-ped', action='store_true', default=False)
    opts, args = op.parse_args()

    if opts.profile_bed or opts.profile_ped:
        # The profiled statement indexes args[0]; fail with a usage message
        # rather than an IndexError from inside the profiler.
        if not args:
            op.error('--profile-bed / --profile-ped need a file base name')
        import profile
        import pstats
        # profile.run evaluates the statement in the __main__ namespace,
        # which is why 'args' has to stay a module-level name here.
        if opts.profile_bed:
            statement = 'profile_bed(args[0])'
        else:
            statement = 'profile_ped(args[0])'
        profile.run(statement, 'fooprof')
        p = pstats.Stats('fooprof')
        p.sort_stats('cumulative').print_stats(10)
    else:
        for arg in args:
            test(arg)
851
852	### Code used to generate the INT_TO_GCODE dictionary
853	#print '{\n ',
854	#for i in range(256):
855	# b = INT2BIN[i]
856	# ints = []
857	# s = str(i).rjust(3)
858	# #print b
859	# for j in range(4):
860	# idx = j*2
861	# #print i, j, idx, b[idx:idx+2], int(b[idx:idx+2], 2)
862	# ints.append(int(b[idx:idx+2], 2))
863	# print '%s: array(\'i\', %s),' % (s,tuple(ints)),
864	# if i > 0 and (i+1) % 4 == 0:
865	# print '\n ',
866	#print '}'
867
868

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/rgenetics/plinkbinJZ.py @ 3

異なるフォーマットでダウンロード: