Context Navigation

plinkbinJZ.py @ 2

リビジョン 2, 35.6 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central

Rev	行番号
[2]	1	#!/usr/bin/env python2.4
	2	"""
	3	"""
	4
	5	import optparse,os,subprocess,gzip,struct,time,commands
	6	from array import array
	7
	8	#from AIMS import util
	9	#from pga import util as pgautil
	10
	11	__FILE_ID__ = '$Id: plinkbinJZ.py,v 1.14 2009/07/13 20:16:50 rejpz Exp $'
	12
	13	VERBOSE = True
	14
	15	MISSING_ALLELES = set(['N', '0', '.', '-',''])
	16
	17	AUTOSOMES = set(range(1, 23) + [str(c) for c in range(1, 23)])
	18
	19	MAGIC_BYTE1 = '00110110'
	20	MAGIC_BYTE2 = '11011000'
	21	FORMAT_SNP_MAJOR_BYTE = '10000000'
	22	FORMAT_IND_MAJOR_BYTE = '00000000'
	23	MAGIC1 = (0, 3, 1, 2)
	24	MAGIC2 = (3, 1, 2, 0)
	25	FORMAT_SNP_MAJOR = (2, 0, 0, 0)
	26	FORMAT_IND_MAJOR = (0, 0, 0, 0)
	27	HEADER_LENGTH = 3
	28
	29	HOM0 = 3
	30	HOM1 = 0
	31	MISS = 2
	32	HET = 1
	33	HOM0_GENO = (0, 0)
	34	HOM1_GENO = (1, 1)
	35	HET_GENO = (0, 1)
	36	MISS_GENO = (-9, -9)
	37
	38	GENO_TO_GCODE = {
	39	HOM0_GENO: HOM0,
	40	HET_GENO: HET,
	41	HOM1_GENO: HOM1,
	42	MISS_GENO: MISS,
	43	}
	44
	45	CHROM_REPLACE = {
	46	'X': '23',
	47	'Y': '24',
	48	'XY': '25',
	49	'MT': '26',
	50	'M': '26',
	51	}
	52
	53	MAP_LINE_EXCEPTION_TEXT = """
	54	One or more lines in the *.map file has only three fields.
	55	The line was:
	56
	57	%s
	58
	59	If you are running rgGRR through EPMP, this is usually a
	60	sign that you are using an old version of the map file.
	61	You can correct the problem by re-running Subject QC. If
	62	you have already tried this, please contact the developers,
	63	or file a bug.
	64	"""
	65
	66	INT_TO_GCODE = {
	67	0: array('i', (0, 0, 0, 0)), 1: array('i', (2, 0, 0, 0)), 2: array('i', (1, 0, 0, 0)), 3: array('i', (3, 0, 0, 0)),
	68	4: array('i', (0, 2, 0, 0)), 5: array('i', (2, 2, 0, 0)), 6: array('i', (1, 2, 0, 0)), 7: array('i', (3, 2, 0, 0)),
	69	8: array('i', (0, 1, 0, 0)), 9: array('i', (2, 1, 0, 0)), 10: array('i', (1, 1, 0, 0)), 11: array('i', (3, 1, 0, 0)),
	70	12: array('i', (0, 3, 0, 0)), 13: array('i', (2, 3, 0, 0)), 14: array('i', (1, 3, 0, 0)), 15: array('i', (3, 3, 0, 0)),
	71	16: array('i', (0, 0, 2, 0)), 17: array('i', (2, 0, 2, 0)), 18: array('i', (1, 0, 2, 0)), 19: array('i', (3, 0, 2, 0)),
	72	20: array('i', (0, 2, 2, 0)), 21: array('i', (2, 2, 2, 0)), 22: array('i', (1, 2, 2, 0)), 23: array('i', (3, 2, 2, 0)),
	73	24: array('i', (0, 1, 2, 0)), 25: array('i', (2, 1, 2, 0)), 26: array('i', (1, 1, 2, 0)), 27: array('i', (3, 1, 2, 0)),
	74	28: array('i', (0, 3, 2, 0)), 29: array('i', (2, 3, 2, 0)), 30: array('i', (1, 3, 2, 0)), 31: array('i', (3, 3, 2, 0)),
	75	32: array('i', (0, 0, 1, 0)), 33: array('i', (2, 0, 1, 0)), 34: array('i', (1, 0, 1, 0)), 35: array('i', (3, 0, 1, 0)),
	76	36: array('i', (0, 2, 1, 0)), 37: array('i', (2, 2, 1, 0)), 38: array('i', (1, 2, 1, 0)), 39: array('i', (3, 2, 1, 0)),
	77	40: array('i', (0, 1, 1, 0)), 41: array('i', (2, 1, 1, 0)), 42: array('i', (1, 1, 1, 0)), 43: array('i', (3, 1, 1, 0)),
	78	44: array('i', (0, 3, 1, 0)), 45: array('i', (2, 3, 1, 0)), 46: array('i', (1, 3, 1, 0)), 47: array('i', (3, 3, 1, 0)),
	79	48: array('i', (0, 0, 3, 0)), 49: array('i', (2, 0, 3, 0)), 50: array('i', (1, 0, 3, 0)), 51: array('i', (3, 0, 3, 0)),
	80	52: array('i', (0, 2, 3, 0)), 53: array('i', (2, 2, 3, 0)), 54: array('i', (1, 2, 3, 0)), 55: array('i', (3, 2, 3, 0)),
	81	56: array('i', (0, 1, 3, 0)), 57: array('i', (2, 1, 3, 0)), 58: array('i', (1, 1, 3, 0)), 59: array('i', (3, 1, 3, 0)),
	82	60: array('i', (0, 3, 3, 0)), 61: array('i', (2, 3, 3, 0)), 62: array('i', (1, 3, 3, 0)), 63: array('i', (3, 3, 3, 0)),
	83	64: array('i', (0, 0, 0, 2)), 65: array('i', (2, 0, 0, 2)), 66: array('i', (1, 0, 0, 2)), 67: array('i', (3, 0, 0, 2)),
	84	68: array('i', (0, 2, 0, 2)), 69: array('i', (2, 2, 0, 2)), 70: array('i', (1, 2, 0, 2)), 71: array('i', (3, 2, 0, 2)),
	85	72: array('i', (0, 1, 0, 2)), 73: array('i', (2, 1, 0, 2)), 74: array('i', (1, 1, 0, 2)), 75: array('i', (3, 1, 0, 2)),
	86	76: array('i', (0, 3, 0, 2)), 77: array('i', (2, 3, 0, 2)), 78: array('i', (1, 3, 0, 2)), 79: array('i', (3, 3, 0, 2)),
	87	80: array('i', (0, 0, 2, 2)), 81: array('i', (2, 0, 2, 2)), 82: array('i', (1, 0, 2, 2)), 83: array('i', (3, 0, 2, 2)),
	88	84: array('i', (0, 2, 2, 2)), 85: array('i', (2, 2, 2, 2)), 86: array('i', (1, 2, 2, 2)), 87: array('i', (3, 2, 2, 2)),
	89	88: array('i', (0, 1, 2, 2)), 89: array('i', (2, 1, 2, 2)), 90: array('i', (1, 1, 2, 2)), 91: array('i', (3, 1, 2, 2)),
	90	92: array('i', (0, 3, 2, 2)), 93: array('i', (2, 3, 2, 2)), 94: array('i', (1, 3, 2, 2)), 95: array('i', (3, 3, 2, 2)),
	91	96: array('i', (0, 0, 1, 2)), 97: array('i', (2, 0, 1, 2)), 98: array('i', (1, 0, 1, 2)), 99: array('i', (3, 0, 1, 2)),
	92	100: array('i', (0, 2, 1, 2)), 101: array('i', (2, 2, 1, 2)), 102: array('i', (1, 2, 1, 2)), 103: array('i', (3, 2, 1, 2)),
	93	104: array('i', (0, 1, 1, 2)), 105: array('i', (2, 1, 1, 2)), 106: array('i', (1, 1, 1, 2)), 107: array('i', (3, 1, 1, 2)),
	94	108: array('i', (0, 3, 1, 2)), 109: array('i', (2, 3, 1, 2)), 110: array('i', (1, 3, 1, 2)), 111: array('i', (3, 3, 1, 2)),
	95	112: array('i', (0, 0, 3, 2)), 113: array('i', (2, 0, 3, 2)), 114: array('i', (1, 0, 3, 2)), 115: array('i', (3, 0, 3, 2)),
	96	116: array('i', (0, 2, 3, 2)), 117: array('i', (2, 2, 3, 2)), 118: array('i', (1, 2, 3, 2)), 119: array('i', (3, 2, 3, 2)),
	97	120: array('i', (0, 1, 3, 2)), 121: array('i', (2, 1, 3, 2)), 122: array('i', (1, 1, 3, 2)), 123: array('i', (3, 1, 3, 2)),
	98	124: array('i', (0, 3, 3, 2)), 125: array('i', (2, 3, 3, 2)), 126: array('i', (1, 3, 3, 2)), 127: array('i', (3, 3, 3, 2)),
	99	128: array('i', (0, 0, 0, 1)), 129: array('i', (2, 0, 0, 1)), 130: array('i', (1, 0, 0, 1)), 131: array('i', (3, 0, 0, 1)),
	100	132: array('i', (0, 2, 0, 1)), 133: array('i', (2, 2, 0, 1)), 134: array('i', (1, 2, 0, 1)), 135: array('i', (3, 2, 0, 1)),
	101	136: array('i', (0, 1, 0, 1)), 137: array('i', (2, 1, 0, 1)), 138: array('i', (1, 1, 0, 1)), 139: array('i', (3, 1, 0, 1)),
	102	140: array('i', (0, 3, 0, 1)), 141: array('i', (2, 3, 0, 1)), 142: array('i', (1, 3, 0, 1)), 143: array('i', (3, 3, 0, 1)),
	103	144: array('i', (0, 0, 2, 1)), 145: array('i', (2, 0, 2, 1)), 146: array('i', (1, 0, 2, 1)), 147: array('i', (3, 0, 2, 1)),
	104	148: array('i', (0, 2, 2, 1)), 149: array('i', (2, 2, 2, 1)), 150: array('i', (1, 2, 2, 1)), 151: array('i', (3, 2, 2, 1)),
	105	152: array('i', (0, 1, 2, 1)), 153: array('i', (2, 1, 2, 1)), 154: array('i', (1, 1, 2, 1)), 155: array('i', (3, 1, 2, 1)),
	106	156: array('i', (0, 3, 2, 1)), 157: array('i', (2, 3, 2, 1)), 158: array('i', (1, 3, 2, 1)), 159: array('i', (3, 3, 2, 1)),
	107	160: array('i', (0, 0, 1, 1)), 161: array('i', (2, 0, 1, 1)), 162: array('i', (1, 0, 1, 1)), 163: array('i', (3, 0, 1, 1)),
	108	164: array('i', (0, 2, 1, 1)), 165: array('i', (2, 2, 1, 1)), 166: array('i', (1, 2, 1, 1)), 167: array('i', (3, 2, 1, 1)),
	109	168: array('i', (0, 1, 1, 1)), 169: array('i', (2, 1, 1, 1)), 170: array('i', (1, 1, 1, 1)), 171: array('i', (3, 1, 1, 1)),
	110	172: array('i', (0, 3, 1, 1)), 173: array('i', (2, 3, 1, 1)), 174: array('i', (1, 3, 1, 1)), 175: array('i', (3, 3, 1, 1)),
	111	176: array('i', (0, 0, 3, 1)), 177: array('i', (2, 0, 3, 1)), 178: array('i', (1, 0, 3, 1)), 179: array('i', (3, 0, 3, 1)),
	112	180: array('i', (0, 2, 3, 1)), 181: array('i', (2, 2, 3, 1)), 182: array('i', (1, 2, 3, 1)), 183: array('i', (3, 2, 3, 1)),
	113	184: array('i', (0, 1, 3, 1)), 185: array('i', (2, 1, 3, 1)), 186: array('i', (1, 1, 3, 1)), 187: array('i', (3, 1, 3, 1)),
	114	188: array('i', (0, 3, 3, 1)), 189: array('i', (2, 3, 3, 1)), 190: array('i', (1, 3, 3, 1)), 191: array('i', (3, 3, 3, 1)),
	115	192: array('i', (0, 0, 0, 3)), 193: array('i', (2, 0, 0, 3)), 194: array('i', (1, 0, 0, 3)), 195: array('i', (3, 0, 0, 3)),
	116	196: array('i', (0, 2, 0, 3)), 197: array('i', (2, 2, 0, 3)), 198: array('i', (1, 2, 0, 3)), 199: array('i', (3, 2, 0, 3)),
	117	200: array('i', (0, 1, 0, 3)), 201: array('i', (2, 1, 0, 3)), 202: array('i', (1, 1, 0, 3)), 203: array('i', (3, 1, 0, 3)),
	118	204: array('i', (0, 3, 0, 3)), 205: array('i', (2, 3, 0, 3)), 206: array('i', (1, 3, 0, 3)), 207: array('i', (3, 3, 0, 3)),
	119	208: array('i', (0, 0, 2, 3)), 209: array('i', (2, 0, 2, 3)), 210: array('i', (1, 0, 2, 3)), 211: array('i', (3, 0, 2, 3)),
	120	212: array('i', (0, 2, 2, 3)), 213: array('i', (2, 2, 2, 3)), 214: array('i', (1, 2, 2, 3)), 215: array('i', (3, 2, 2, 3)),
	121	216: array('i', (0, 1, 2, 3)), 217: array('i', (2, 1, 2, 3)), 218: array('i', (1, 1, 2, 3)), 219: array('i', (3, 1, 2, 3)),
	122	220: array('i', (0, 3, 2, 3)), 221: array('i', (2, 3, 2, 3)), 222: array('i', (1, 3, 2, 3)), 223: array('i', (3, 3, 2, 3)),
	123	224: array('i', (0, 0, 1, 3)), 225: array('i', (2, 0, 1, 3)), 226: array('i', (1, 0, 1, 3)), 227: array('i', (3, 0, 1, 3)),
	124	228: array('i', (0, 2, 1, 3)), 229: array('i', (2, 2, 1, 3)), 230: array('i', (1, 2, 1, 3)), 231: array('i', (3, 2, 1, 3)),
	125	232: array('i', (0, 1, 1, 3)), 233: array('i', (2, 1, 1, 3)), 234: array('i', (1, 1, 1, 3)), 235: array('i', (3, 1, 1, 3)),
	126	236: array('i', (0, 3, 1, 3)), 237: array('i', (2, 3, 1, 3)), 238: array('i', (1, 3, 1, 3)), 239: array('i', (3, 3, 1, 3)),
	127	240: array('i', (0, 0, 3, 3)), 241: array('i', (2, 0, 3, 3)), 242: array('i', (1, 0, 3, 3)), 243: array('i', (3, 0, 3, 3)),
	128	244: array('i', (0, 2, 3, 3)), 245: array('i', (2, 2, 3, 3)), 246: array('i', (1, 2, 3, 3)), 247: array('i', (3, 2, 3, 3)),
	129	248: array('i', (0, 1, 3, 3)), 249: array('i', (2, 1, 3, 3)), 250: array('i', (1, 1, 3, 3)), 251: array('i', (3, 1, 3, 3)),
	130	252: array('i', (0, 3, 3, 3)), 253: array('i', (2, 3, 3, 3)), 254: array('i', (1, 3, 3, 3)), 255: array('i', (3, 3, 3, 3)),
	131	}
	132
	133	GCODE_TO_INT = dict([(tuple(v),k) for (k,v) in INT_TO_GCODE.items()])
	134
	135	### Exceptions
	136	class DuplicateMarkerInMapFile(Exception): pass
	137	class MapLineTooShort(Exception): pass
	138	class ThirdAllele(Exception): pass
	139	class PedError(Exception): pass
	140	class BadMagic(Exception):
	141	""" Raised when one of the MAGIC bytes in a bed file does not match
	142	"""
	143	pass
	144	class BedError(Exception):
	145	""" Raised when parsing a bed file runs into problems
	146	"""
	147	pass
	148	class UnknownGenocode(Exception):
	149	""" Raised when we get a 2-bit genotype that is undecipherable (is it possible?)
	150	"""
	151	pass
	152	class UnknownGeno(Exception): pass
	153
	154	### Utility functions
	155
	156	def timenow():
	157	"""return current time as a string
	158	"""
	159	return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
	160
	161	def ceiling(n, k):
	162	''' Return the least multiple of k which is greater than n
	163	'''
	164	m = n % k
	165	if m == 0:
	166	return n
	167	else:
	168	return n + k - m
	169
	170	def nbytes(n):
	171	''' Return the number of bytes required for n subjects
	172	'''
	173	return 2*ceiling(n, 4)/8
	174
	175	### Primary module functionality
	176	class LPed:
	177	""" The uber-class for processing the Linkage-format .ped/.map files
	178	"""
	179	def __init__(self, base):
	180	self.base = base
	181	self._ped = Ped('%s.ped' % (self.base))
	182	self._map = Map('%s.map' % (self.base))
	183
	184	self._markers = {}
	185	self._ordered_markers = []
	186	self._marker_allele_lookup = {}
	187	self._autosomal_indices = set()
	188
	189	self._subjects = {}
	190	self._ordered_subjects = []
	191
	192	self._genotypes = []
	193
	194	def parse(self):
	195	"""
	196	"""
	197	if VERBOSE: print 'plinkbinJZ: Analysis started: %s' % (timenow())
	198	self._map.parse()
	199	self._markers = self._map._markers
	200	self._ordered_markers = self._map._ordered_markers
	201	self._autosomal_indices = self._map._autosomal_indices
	202
	203	self._ped.parse(self._ordered_markers)
	204	self._subjects = self._ped._subjects
	205	self._ordered_subjects = self._ped._ordered_subjects
	206	self._genotypes = self._ped._genotypes
	207	self._marker_allele_lookup = self._ped._marker_allele_lookup
	208
	209	### Adjust self._markers based on the allele information
	210	### we got from parsing the ped file
	211	for m, name in enumerate(self._ordered_markers):
	212	a1, a2 = self._marker_allele_lookup[m][HET]
	213	self._markers[name][-2] = a1
	214	self._markers[name][-1] = a2
	215	if VERBOSE: print 'plinkbinJZ: Analysis finished: %s' % (timenow())
	216
	217	def getSubjectInfo(self, fid, oiid):
	218	"""
	219	"""
	220	return self._subject_info[(fid, oiid)]
	221
	222	def getSubjectInfoByLine(self, line):
	223	"""
	224	"""
	225	return self._subject_info[self._ordered_subjects[line]]
	226
	227	def getGenotypesByIndices(self, s, mlist, format):
	228	""" needed for grr if lped - deprecated but..
	229	"""
	230	mlist = dict(zip(mlist,[True,]*len(mlist))) # hash quicker than 'in' ?
	231	raw_array = array('i', [row[s] for m,row in enumerate(self._genotypes) if mlist.get(m,None)])
	232	if format == 'raw':
	233	return raw_array
	234	elif format == 'ref':
	235	result = array('i', [0]*len(mlist))
	236	for m, gcode in enumerate(raw_array):
	237	if gcode == HOM0:
	238	nref = 3
	239	elif gcode == HET:
	240	nref = 2
	241	elif gcode == HOM1:
	242	nref = 1
	243	else:
	244	nref = 0
	245	result[m] = nref
	246	return result
	247	else:
	248	result = []
	249	for m, gcode in enumerate(raw_array):
	250	result.append(self._marker_allele_lookup[m][gcode])
	251	return result
	252
	253	def writebed(self, base):
	254	"""
	255	"""
	256	dst_name = '%s.fam' % (base)
	257	print 'Writing pedigree information to [ %s ]' % (dst_name)
	258	dst = open(dst_name, 'w')
	259	for skey in self._ordered_subjects:
	260	(fid, iid, did, mid, sex, phe, sid, d_sid, m_sid) = self._subjects[skey]
	261	dst.write('%s %s %s %s %s %s\n' % (fid, iid, did, mid, sex, phe))
	262	dst.close()
	263
	264	dst_name = '%s.bim' % (base)
	265	print 'Writing map (extended format) information to [ %s ]' % (dst_name)
	266	dst = open(dst_name, 'w')
	267	for m, marker in enumerate(self._ordered_markers):
	268	chrom, name, genpos, abspos, a1, a2 = self._markers[marker]
	269	dst.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (chrom, name, genpos, abspos, a1, a2))
	270	dst.close()
	271
	272	bed_name = '%s.bed' % (base)
	273	print 'Writing genotype bitfile to [ %s ]' % (bed_name)
	274	print 'Using (default) SNP-major mode'
	275	bed = open(bed_name, 'w')
	276
	277	### Write the 3 header bytes
	278	bed.write(struct.pack('B', int(''.join(reversed(MAGIC_BYTE1)), 2)))
	279	bed.write(struct.pack('B', int(''.join(reversed(MAGIC_BYTE2)), 2)))
	280	bed.write(struct.pack('B', int(''.join(reversed(FORMAT_SNP_MAJOR_BYTE)), 2)))
	281
	282	### Calculate how many "pad bits" we should add after the last subject
	283	nsubjects = len(self._ordered_subjects)
	284	nmarkers = len(self._ordered_markers)
	285	total_bytes = nbytes(nsubjects)
	286	nbits = nsubjects * 2
	287	pad_nibbles = ((total_bytes * 8) - nbits)/2
	288	pad = array('i', [0]*pad_nibbles)
	289
	290	### And now write genotypes to the file
	291	for m in xrange(nmarkers):
	292	geno = self._genotypes[m]
	293	geno.extend(pad)
	294	bytes = len(geno)/4
	295	for b in range(bytes):
	296	idx = b*4
	297	gcode = tuple(geno[idx:idx+4])
	298	try:
	299	byte = struct.pack('B', GCODE_TO_INT[gcode])
	300	except KeyError:
	301	print m, b, gcode
	302	raise
	303	bed.write(byte)
	304	bed.close()
	305
	306	def autosomal_indices(self):
	307	""" Return the indices of markers in this ped/map that are autosomal.
	308	This is used by rgGRR so that it can select a random set of markers
	309	from the autosomes (sex chroms screw up the plot)
	310	"""
	311	return self._autosomal_indices
	312
	313	class Ped:
	314	def __init__(self, path):
	315	self.path = path
	316	self._subjects = {}
	317	self._ordered_subjects = []
	318	self._genotypes = []
	319	self._marker_allele_lookup = {}
	320
	321	def lineCount(self,infile):
	322	""" count the number of lines in a file - efficiently using wget
	323	"""
	324	return int(commands.getoutput('wc -l %s' % (infile)).split()[0])
	325
	326
	327	def parse(self, markers):
	328	""" Parse a given file -- this needs to be memory-efficient so that large
	329	files can be parsed (~1 million markers on ~5000 subjects?). It
	330	should also be fast, if possible.
	331	"""
	332
	333	### Find out how many lines are in the file so we can ...
	334	nsubjects = self.lineCount(self.path)
	335	### ... Pre-allocate the genotype arrays
	336	nmarkers = len(markers)
	337	_marker_alleles = [['0', '0'] for _ in xrange(nmarkers)]
	338	self._genotypes = [array('i', [-1]*nsubjects) for _ in xrange(nmarkers)]
	339
	340	if self.path.endswith('.gz'):
	341	pfile = gzip.open(self.path, 'r')
	342	else:
	343	pfile = open(self.path, 'r')
	344
	345	for s, line in enumerate(pfile):
	346	line = line.strip()
	347	if not line:
	348	continue
	349
	350	fid, iid, did, mid, sex, phe, genos = line.split(None, 6)
	351	sid = iid.split('.')[0]
	352	d_sid = did.split('.')[0]
	353	m_sid = mid.split('.')[0]
	354
	355	skey = (fid, iid)
	356	self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid)
	357	self._ordered_subjects.append(skey)
	358
	359	genotypes = genos.split()
	360
	361	for m, marker in enumerate(markers):
	362	idx = m*2
	363	a1, a2 = genotypes[idx:idx+2] # Alleles for subject s, marker m
	364	s1, s2 = seen = _marker_alleles[m] # Alleles seen for marker m
	365
	366	### FIXME: I think this can still be faster, and simpler to read
	367	# Two pieces of logic intertwined here: first, we need to code
	368	# this genotype as HOM0, HOM1, HET or MISS. Second, we need to
	369	# keep an ongoing record of the genotypes seen for this marker
	370	if a1 == a2:
	371	if a1 in MISSING_ALLELES:
	372	geno = MISS_GENO
	373	else:
	374	if s1 == '0':
	375	seen[0] = a1
	376	elif s1 == a1 or s2 == a2:
	377	pass
	378	elif s2 == '0':
	379	seen[1] = a1
	380	else:
	381	raise ThirdAllele('a1=a2=%s, seen=%s?' % (a1, str(seen)))
	382
	383	if a1 == seen[0]:
	384	geno = HOM0_GENO
	385	elif a1 == seen[1]:
	386	geno = HOM1_GENO
	387	else:
	388	raise PedError('Cannot assign geno for a1=a2=%s from seen=%s' % (a1, str(seen)))
	389	elif a1 in MISSING_ALLELES or a2 in MISSING_ALLELES:
	390	geno = MISS_GENO
	391	else:
	392	geno = HET_GENO
	393	if s1 == '0':
	394	seen[0] = a1
	395	seen[1] = a2
	396	elif s2 == '0':
	397	if s1 == a1:
	398	seen[1] = a2
	399	elif s1 == a2:
	400	seen[1] = a1
	401	else:
	402	raise ThirdAllele('a1=%s, a2=%s, seen=%s?' % (a1, a2, str(seen)))
	403	else:
	404	if sorted(seen) != sorted((a1, a2)):
	405	raise ThirdAllele('a1=%s, a2=%s, seen=%s?' % (a1, a2, str(seen)))
	406
	407	gcode = GENO_TO_GCODE.get(geno, None)
	408	if gcode is None:
	409	raise UnknownGeno(str(geno))
	410	self._genotypes[m][s] = gcode
	411
	412	# Build the _marker_allele_lookup table
	413	for m, alleles in enumerate(_marker_alleles):
	414	if len(alleles) == 2:
	415	a1, a2 = alleles
	416	elif len(alleles) == 1:
	417	a1 = alleles[0]
	418	a2 = '0'
	419	else:
	420	print 'All alleles blank for %s: %s' % (m, str(alleles))
	421	raise
	422
	423	self._marker_allele_lookup[m] = {
	424	HOM0: (a2, a2),
	425	HOM1: (a1, a1),
	426	HET : (a1, a2),
	427	MISS: ('0','0'),
	428	}
	429
	430	if VERBOSE: print '%s(%s) individuals read from [ %s ]' % (len(self._subjects), nsubjects, self.path)
	431
	432	class Map:
	433	def __init__(self, path=None):
	434	self.path = path
	435	self._markers = {}
	436	self._ordered_markers = []
	437	self._autosomal_indices = set()
	438
	439	def __len__(self):
	440	return len(self._markers)
	441
	442	def parse(self):
	443	""" Parse a Linkage-format map file
	444	"""
	445	if self.path.endswith('.gz'):
	446	fh = gzip.open(self.path, 'r')
	447	else:
	448	fh = open(self.path, 'r')
	449
	450	for i, line in enumerate(fh):
	451	line = line.strip()
	452	if not line:
	453	continue
	454
	455	fields = line.split()
	456	if len(fields) < 4:
	457	raise MapLineTooShort(MAP_LINE_EXCEPTION_TEXT % (str(line), len(fields)))
	458	else:
	459	chrom, name, genpos, abspos = fields
	460	if name in self._markers:
	461	raise DuplicateMarkerInMapFile('Marker %s was found twice in map file %s' % (name, self.path))
	462	abspos = int(abspos)
	463	if abspos < 0:
	464	continue
	465	if chrom in AUTOSOMES:
	466	self._autosomal_indices.add(i)
	467	chrom = CHROM_REPLACE.get(chrom, chrom)
	468	self._markers[name] = [chrom, name, genpos, abspos, None, None]
	469	self._ordered_markers.append(name)
	470	fh.close()
	471	if VERBOSE: print '%s (of %s) markers to be included from [ %s ]' % (len(self._ordered_markers), i, self.path)
	472
	473	class BPed:
	474	""" The uber-class for processing Plink's Binary Ped file format .bed/.bim/*.fam
	475	"""
	476	def __init__(self, base):
	477	self.base = base
	478	self._bed = Bed('%s.bed' % (self.base))
	479	self._bim = Bim('%s.bim' % (self.base))
	480	self._fam = Fam('%s.fam' % (self.base))
	481
	482	self._markers = {}
	483	self._ordered_markers = []
	484	self._marker_allele_lookup = {}
	485	self._autosomal_indices = set()
	486
	487	self._subjects = {}
	488	self._ordered_subjects = []
	489
	490	self._genotypes = []
	491
	492	def parse(self, quick=False):
	493	"""
	494	"""
	495	self._quick = quick
	496
	497	self._bim.parse()
	498	self._markers = self._bim._markers
	499	self._ordered_markers = self._bim._ordered_markers
	500	self._marker_allele_lookup = self._bim._marker_allele_lookup
	501	self._autosomal_indices = self._bim._autosomal_indices
	502
	503	self._fam.parse()
	504	self._subjects = self._fam._subjects
	505	self._ordered_subjects = self._fam._ordered_subjects
	506
	507	self._bed.parse(self._ordered_subjects, self._ordered_markers, quick=quick)
	508	self._bedf = self._bed._fh
	509	self._genotypes = self._bed._genotypes
	510	self.nsubjects = len(self._ordered_subjects)
	511	self.nmarkers = len(self._ordered_markers)
	512	self._bytes_per_marker = nbytes(self.nsubjects)
	513
	514	def writeped(self, path=None):
	515	"""
	516	"""
	517	path = self.path = path or self.path
	518
	519	map_name = self.path.replace('.bed', '.map')
	520	print 'Writing map file [ %s ]' % (map_name)
	521	dst = open(map_name, 'w')
	522	for m in self._ordered_markers:
	523	chrom, snp, genpos, abspos, a1, a2 = self._markers[m]
	524	dst.write('%s\t%s\t%s\t%s\n' % (chrom, snp, genpos, abspos))
	525	dst.close()
	526
	527	ped_name = self.path.replace('.bed', '.ped')
	528	print 'Writing ped file [ %s ]' % (ped_name)
	529	ped = open(ped_name, 'w')
	530	firstyikes = False
	531	for s, skey in enumerate(self._ordered_subjects):
	532	idx = s*2
	533	(fid, iid, did, mid, sex, phe, oiid, odid, omid) = self._subjects[skey]
	534	ped.write('%s %s %s %s %s %s' % (fid, iid, odid, omid, sex, phe))
	535	genotypes_for_subject = self.getGenotypesForSubject(s)
	536	for m, snp in enumerate(self._ordered_markers):
	537	#a1, a2 = self.getGenotypeByIndices(s, m)
	538	a1,a2 = genotypes_for_subject[m]
	539	ped.write(' %s %s' % (a1, a2))
	540	ped.write('\n')
	541	ped.close()
	542
	543	def getGenotype(self, subject, marker):
	544	""" Retrieve a genotype for a particular subject/marker pair
	545	"""
	546	m = self._ordered_markers.index(marker)
	547	s = self._ordered_subjects.index(subject)
	548	return self.getGenotypeByIndices(s, m)
	549
	550	def getGenotypesForSubject(self, s, raw=False):
	551	""" Returns list of genotypes for all m markers
	552	for subject s. If raw==True, then an array
	553	of raw integer gcodes is returned instead
	554	"""
	555	if self._quick:
	556	nmarkers = len(self._markers)
	557	raw_array = array('i', [0]*nmarkers)
	558	seek_nibble = s % 4
	559	for m in xrange(nmarkers):
	560	seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH
	561	self._bedf.seek(seek_byte)
	562	geno = struct.unpack('B', self._bedf.read(1))[0]
	563	quartet = INT_TO_GCODE[geno]
	564	gcode = quartet[seek_nibble]
	565	raw_array[m] = gcode
	566	else:
	567	raw_array = array('i', [row[s] for row in self._genotypes])
	568
	569	if raw:
	570	return raw_array
	571	else:
	572	result = []
	573	for m, gcode in enumerate(raw_array):
	574	result.append(self._marker_allele_lookup[m][gcode])
	575	return result
	576
	577	def getGenotypeByIndices(self, s, m):
	578	"""
	579	"""
	580	if self._quick:
	581	# Determine which byte we need to seek to, and
	582	# which nibble within the byte we need
	583	seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH
	584	seek_nibble = s % 4
	585	self._bedf.seek(seek_byte)
	586	geno = struct.unpack('B', self._bedf.read(1))[0]
	587	quartet = INT_TO_GCODE[geno]
	588	gcode = quartet[seek_nibble]
	589	else:
	590	# Otherwise, just grab the genotypes from the
	591	# list of arrays
	592	genos_for_marker = self._genotypes[m]
	593	gcode = genos_for_marker[s]
	594
	595	return self._marker_allele_lookup[m][gcode]
	596
	597	def getGenotypesByIndices(self, s, mlist, format):
	598	"""
	599	"""
	600	if self._quick:
	601	raw_array = array('i', [0]*len(mlist))
	602	seek_nibble = s % 4
	603	for i,m in enumerate(mlist):
	604	seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH
	605	self._bedf.seek(seek_byte)
	606	geno = struct.unpack('B', self._bedf.read(1))[0]
	607	quartet = INT_TO_GCODE[geno]
	608	gcode = quartet[seek_nibble]
	609	raw_array[i] = gcode
	610	mlist = set(mlist)
	611	else:
	612	mlist = set(mlist)
	613	raw_array = array('i', [row[s] for m,row in enumerate(self._genotypes) if m in mlist])
	614
	615	if format == 'raw':
	616	return raw_array
	617	elif format == 'ref':
	618	result = array('i', [0]*len(mlist))
	619	for m, gcode in enumerate(raw_array):
	620	if gcode == HOM0:
	621	nref = 3
	622	elif gcode == HET:
	623	nref = 2
	624	elif gcode == HOM1:
	625	nref = 1
	626	else:
	627	nref = 0
	628	result[m] = nref
	629	return result
	630	else:
	631	result = []
	632	for m, gcode in enumerate(raw_array):
	633	result.append(self._marker_allele_lookup[m][gcode])
	634	return result
	635
	636	def getSubject(self, s):
	637	"""
	638	"""
	639	skey = self._ordered_subjects[s]
	640	return self._subjects[skey]
	641
	642	def autosomal_indices(self):
	643	""" Return the indices of markers in this ped/map that are autosomal.
	644	This is used by rgGRR so that it can select a random set of markers
	645	from the autosomes (sex chroms screw up the plot)
	646	"""
	647	return self._autosomal_indices
	648
	649	class Bed:
	650
	651	def __init__(self, path):
	652	self.path = path
	653	self._genotypes = []
	654	self._fh = None
	655
	656	def parse(self, subjects, markers, quick=False):
	657	""" Parse the bed file, indicated either by the path parameter,
	658	or as the self.path indicated in __init__. If quick is
	659	True, then just parse the bim and fam, then genotypes will
	660	be looked up dynamically by indices
	661	"""
	662	self._quick = quick
	663
	664	ordered_markers = markers
	665	ordered_subjects = subjects
	666	nsubjects = len(ordered_subjects)
	667	nmarkers = len(ordered_markers)
	668
	669	bed = open(self.path, 'rb')
	670	self._fh = bed
	671
	672	byte1 = bed.read(1)
	673	byte2 = bed.read(1)
	674	byte3 = bed.read(1)
	675	format_flag = struct.unpack('B', byte3)[0]
	676
	677	h1 = tuple(INT_TO_GCODE[struct.unpack('B', byte1)[0]])
	678	h2 = tuple(INT_TO_GCODE[struct.unpack('B', byte2)[0]])
	679	h3 = tuple(INT_TO_GCODE[format_flag])
	680
	681	if h1 != MAGIC1 or h2 != MAGIC2:
	682	raise BadMagic('One or both MAGIC bytes is wrong: %s==%s or %s==%s' % (h1, MAGIC1, h2, MAGIC2))
	683	if format_flag:
	684	print 'Detected that binary PED file is v1.00 SNP-major mode (%s, "%s")\n' % (format_flag, h3)
	685	else:
	686	raise 'BAD_FORMAT_FLAG? (%s, "%s")\n' % (format_flag, h3)
	687
	688	print 'Parsing binary ped file for %s markers and %s subjects' % (nmarkers, nsubjects)
	689
	690	### If quick mode was specified, we're done ...
	691	self._quick = quick
	692	if quick:
	693	return
	694
	695	### ... Otherwise, parse genotypes into an array, and append that
	696	### array to self._genotypes
	697	ngcodes = ceiling(nsubjects, 4)
	698	bytes_per_marker = nbytes(nsubjects)
	699	for m in xrange(nmarkers):
	700	genotype_array = array('i', [-1]*(ngcodes))
	701	for byte in xrange(bytes_per_marker):
	702	intval = struct.unpack('B', bed.read(1))[0]
	703	idx = byte*4
	704	genotype_array[idx:idx+4] = INT_TO_GCODE[intval]
	705	self._genotypes.append(genotype_array)
	706
	707	class Bim:
	708	def __init__(self, path):
	709	"""
	710	"""
	711	self.path = path
	712	self._markers = {}
	713	self._ordered_markers = []
	714	self._marker_allele_lookup = {}
	715	self._autosomal_indices = set()
	716
	717	def parse(self):
	718	"""
	719	"""
	720	print 'Reading map (extended format) from [ %s ]' % (self.path)
	721	bim = open(self.path, 'r')
	722	for m, line in enumerate(bim):
	723	chrom, snp, gpos, apos, a1, a2 = line.strip().split()
	724	self._markers[snp] = (chrom, snp, gpos, apos, a1, a2)
	725	self._marker_allele_lookup[m] = {
	726	HOM0: (a2, a2),
	727	HOM1: (a1, a1),
	728	HET : (a1, a2),
	729	MISS: ('0','0'),
	730	}
	731	self._ordered_markers.append(snp)
	732	if chrom in AUTOSOMES:
	733	self._autosomal_indices.add(m)
	734	bim.close()
	735	print '%s markers to be included from [ %s ]' % (m+1, self.path)
	736
	737	class Fam:
	738	def __init__(self, path):
	739	"""
	740	"""
	741	self.path = path
	742	self._subjects = {}
	743	self._ordered_subjects = []
	744
	745	def parse(self):
	746	"""
	747	"""
	748	print 'Reading pedigree information from [ %s ]' % (self.path)
	749	fam = open(self.path, 'r')
	750	for s, line in enumerate(fam):
	751	fid, iid, did, mid, sex, phe = line.strip().split()
	752	sid = iid.split('.')[0]
	753	d_sid = did.split('.')[0]
	754	m_sid = mid.split('.')[0]
	755	skey = (fid, iid)
	756	self._ordered_subjects.append(skey)
	757	self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid)
	758	fam.close()
	759	print '%s individuals read from [ %s ]' % (s+1, self.path)
	760
	761	### Command-line functionality and testing
	762	def test(arg):
	763	'''
	764	'''
	765
	766	import time
	767
	768	if arg == 'CAMP_AFFY.ped':
	769	print 'Testing bed.parse(quick=True)'
	770	s = time.time()
	771	bed = Bed(arg.replace('.ped', '.bed'))
	772	bed.parse(quick=True)
	773	print bed.getGenotype(('400118', '10300283'), 'rs2000467')
	774	print bed.getGenotype(('400118', '10101384'), 'rs2294019')
	775	print bed.getGenotype(('400121', '10101149'), 'rs2294019')
	776	print bed.getGenotype(('400123', '10200290'), 'rs2294019')
	777	assert bed.getGenotype(('400118', '10101384'), 'rs2294019') == ('4','4')
	778	e = time.time()
	779	print 'e-s = %s\n' % (e-s)
	780
	781	print 'Testing bed.parse'
	782	s = time.time()
	783	bed = BPed(arg)
	784	bed.parse(quick=False)
	785	e = time.time()
	786	print 'e-s = %s\n' % (e-s)
	787
	788	print 'Testing bed.writeped'
	789	s = time.time()
	790	outname = '%s_BEDTEST' % (arg)
	791	bed.writeped(outname)
	792	e = time.time()
	793	print 'e-s = %s\n' % (e-s)
	794	del(bed)
	795
	796	print 'Testing ped.parse'
	797	s = time.time()
	798	ped = LPed(arg)
	799	ped.parse()
	800	e = time.time()
	801	print 'e-s = %s\n' % (e-s)
	802
	803	print 'Testing ped.writebed'
	804	s = time.time()
	805	outname = '%s_PEDTEST' % (arg)
	806	ped.writebed(outname)
	807	e = time.time()
	808	print 'e-s = %s\n' % (e-s)
	809	del(ped)
	810
	811	def profile_bed(arg):
	812	"""
	813	"""
	814	bed = BPed(arg)
	815	bed.parse(quick=False)
	816	outname = '%s_BEDPROFILE' % (arg)
	817	bed.writeped(outname)
	818
	819	def profile_ped(arg):
	820	"""
	821	"""
	822	ped = LPed(arg)
	823	ped.parse()
	824	outname = '%s_PEDPROFILE' % (arg)
	825	ped.writebed(outname)
	826
	827	if __name__ == '__main__':
	828	""" Run as a command-line, this script should get one or more arguments,
	829	each one a ped file to be parsed with the PedParser (unit tests?)
	830	"""
	831	op = optparse.OptionParser()
	832	op.add_option('--profile-bed', action='store_true', default=False)
	833	op.add_option('--profile-ped', action='store_true', default=False)
	834	opts, args = op.parse_args()
	835
	836	if opts.profile_bed:
	837	import profile
	838	import pstats
	839	profile.run('profile_bed(args[0])', 'fooprof')
	840	p = pstats.Stats('fooprof')
	841	p.sort_stats('cumulative').print_stats(10)
	842	elif opts.profile_ped:
	843	import profile
	844	import pstats
	845	profile.run('profile_ped(args[0])', 'fooprof')
	846	p = pstats.Stats('fooprof')
	847	p.sort_stats('cumulative').print_stats(10)
	848	else:
	849	for arg in args:
	850	test(arg)
	851
	852	### Code used to generate the INT_TO_GCODE dictionary
	853	#print '{\n ',
	854	#for i in range(256):
	855	# b = INT2BIN[i]
	856	# ints = []
	857	# s = str(i).rjust(3)
	858	# #print b
	859	# for j in range(4):
	860	# idx = j*2
	861	# #print i, j, idx, b[idx:idx+2], int(b[idx:idx+2], 2)
	862	# ints.append(int(b[idx:idx+2], 2))
	863	# print '%s: array(\'i\', %s),' % (s,tuple(ints)),
	864	# if i > 0 and (i+1) % 4 == 0:
	865	# print '\n ',
	866	#print '}'
	867
	868

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/rgenetics/plinkbinJZ.py @ 2

異なるフォーマットでダウンロード: