Context Navigation

pagetag.py @ 2

リビジョン 2, 9.1 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central
属性 svn:executable の設定値 ``*

Rev	行番号
[2]	1	#!/usr/bin/env python
	2
	3	"""
	4	This accepts as input a file of the following format:
	5
	6	Site Sample Allele1 Allele2
	7
	8	for example:
	9
	10	000834 D001 G G
	11	000834 D002 G G
	12	000834 D003 G G
	13	000834 D004 G G
	14	000834 D005 N N
	15	000834 E001 G G
	16	000834 E002 G G
	17	000834 E003 G G
	18	000834 E004 G G
	19	000834 E005 G G
	20	000963 D001 T T
	21	000963 D002 T T
	22	000963 D003 T T
	23	000963 D004 T T
	24	000963 D005 N N
	25	000963 E001 T T
	26	000963 E002 N N
	27	000963 E003 G T
	28	000963 E004 G G
	29	000963 E005 G T
	30
	31	and a rsquare threshold and outputs two files:
	32
	33	a) a file of input snps (one on each line). A SNP is identified by the "Site"
	34	column in the input file
	35
	36	b) a file where each line has the following:
	37	SNP list
	38	where SNP is one of the SNPs and the "list" is a comma separated list of SNPs
	39	that exceed the rsquare threshold with the first SNP.
	40	"""
	41
	42	from sys import argv, stderr, exit
	43	from getopt import getopt, GetoptError
	44
	45	__author__ = "Aakrosh Ratan"
	46	__email__ = "ratan@bx.psu.edu"
	47
	48	# do we want the debug information to be printed?
	49	debug_flag = False
	50
	51	# denote different combos of alleles in code
	52	HOMC = str(1)
	53	HOMR = str(2)
	54	HETE = str(3)
	55	OTHER = str(4)
	56
	57	indexcalculator = {(HOMC,HOMC) : 0,
	58	(HOMC,HOMR) : 1,
	59	(HOMC,HETE) : 2,
	60	(HOMR,HOMC) : 3,
	61	(HOMR,HOMR) : 4,
	62	(HOMR,HETE) : 5,
	63	(HETE,HOMC) : 6,
	64	(HETE,HOMR) : 7,
	65	(HETE,HETE) : 8}
	66
	67	def read_inputfile(filename, samples):
	68	input = {}
	69
	70	file = open(filename, "r")
	71
	72	for line in file:
	73	position,sample,allele1,allele2 = line.split()
	74
	75	# if the user specified a list of samples, then only use those samples
	76	if samples != None and sample not in samples: continue
	77
	78	if position in input:
	79	v = input[position]
	80	v[sample] = (allele1,allele2)
	81	else:
	82	v = {sample : (allele1, allele2)}
	83	input[position] = v
	84
	85	file.close()
	86	return input
	87
	88	def annotate_locus(input, minorallelefrequency, snpsfile):
	89	locus = {}
	90	for k,v in input.items():
	91	genotypes = [x for x in v.values()]
	92	alleles = [y for x in genotypes for y in x]
	93	alleleset = list(set(alleles))
	94	alleleset = list(set(alleles) - set(["N","X"]))
	95
	96	if len(alleleset) == 2:
	97	genotypevec = ""
	98	num1 = len([x for x in alleles if x == alleleset[0]])
	99	num2 = len([x for x in alleles if x == alleleset[1]])
	100
	101	if num1 > num2:
	102	major = alleleset[0]
	103	minor = alleleset[1]
	104	minorfreq = (num2 * 1.0)/(num1 + num2)
	105	else:
	106	major = alleleset[1]
	107	minor = alleleset[0]
	108	minorfreq = (num1 * 1.0)/(num1 + num2)
	109
	110	if minorfreq < minorallelefrequency: continue
	111
	112	for gen in genotypes:
	113	if gen == (major,major):
	114	genotypevec += HOMC
	115	elif gen == (minor,minor):
	116	genotypevec += HOMR
	117	elif gen == (major, minor) or gen == (minor, major):
	118	genotypevec += HETE
	119	else:
	120	genotypevec += OTHER
	121
	122	locus[k] = genotypevec,minorfreq
	123	elif len(alleleset) > 2:
	124	print >> snpsfile, k
	125	return locus
	126
	127	def calculateLD(loci, rsqthreshold):
	128	snps = list(loci)
	129	rsquare = {}
	130
	131	for index,loc1 in enumerate(snps):
	132	for loc2 in snps[index + 1:]:
	133	matrix = [0]*9
	134
	135	vec1 = loci[loc1][0]
	136	vec2 = loci[loc2][0]
	137
	138	for gen in zip(vec1,vec2):
	139	if gen[0] == OTHER or gen[1] == OTHER: continue
	140	matrix[indexcalculator[gen]] += 1
	141
	142	n = sum(matrix)
	143	x11 = 2*matrix[0] + matrix[2] + matrix[6]
	144	x12 = 2*matrix[1] + matrix[2] + matrix[7]
	145	x21 = 2*matrix[3] + matrix[6] + matrix[5]
	146	x22 = 2*matrix[4] + matrix[6] + matrix[5]
	147
	148	p = (x11 + x12 + matrix[8] * 1.0) / (2 * n)
	149	q = (x11 + x21 + matrix[8] * 1.0) / (2 * n)
	150
	151	p11 = p * q
	152
	153	oldp11 = p11
	154	range = 0.0
	155	converged = False
	156	convergentcounter = 0
	157	if p11 > 0.0:
	158	while converged == False and convergentcounter < 100:
	159	if (1.0 - p - q + p11) != 0.0 and oldp11 != 0.0:
	160	num = matrix[8] * p11 * (1.0 - p - q + p11)
	161	den = p11 * (1.0 - p - q + p11) + (p - p11)*(q - p11)
	162	p11 = (x11 + (num/den))/(2.0*n)
	163	range = p11/oldp11
	164	if range >= 0.9999 and range <= 1.001:
	165	converged = True
	166	oldp11 = p11
	167	convergentcounter += 1
	168	else:
	169	converged = True
	170
	171	dvalue = 0.0
	172	if converged == True:
	173	dvalue = p11 - (p * q)
	174
	175	if dvalue != 0.0:
	176	rsq = (dvalue*2)/(pq(1-p)(1-q))
	177	if rsq >= rsqthreshold:
	178	rsquare["%s %s" % (loc1,loc2)] = rsq
	179
	180	return rsquare
	181
	182	def main(inputfile, snpsfile, neigborhoodfile, \
	183	rsquare, minorallelefrequency, samples):
	184	# read the input file
	185	input = read_inputfile(inputfile, samples)
	186	print >> stderr, "Read %d locations" % len(input)
	187
	188	# open the snpsfile to print
	189	file = open(snpsfile, "w")
	190
	191	# annotate the inputs, remove the abnormal loci (which do not have 2 alleles
	192	# and add the major and minor allele to each loci
	193	loci = annotate_locus(input, minorallelefrequency, file)
	194	print >> stderr, "Read %d interesting locations" % len(loci)
	195
	196	# print all the interesting loci as candidate snps
	197	for k in loci.keys(): print >> file, k
	198	file.close()
	199	print >> stderr, "Finished creating the snpsfile"
	200
	201	# calculate the LD values and store it if it exceeds the threshold
	202	lds = calculateLD(loci, rsquare)
	203	print >> stderr, "Calculated all the LD values"
	204
	205	# create a list of SNPs
	206	snps = {}
	207	ldvals = {}
	208	for k,v in lds.items():
	209	s1,s2 = k.split()
	210	if s1 in snps: snps[s1].append(s2)
	211	else : snps[s1] = [s2]
	212	if s2 in snps: snps[s2].append(s1)
	213	else : snps[s2] = [s1]
	214
	215	if s1 in ldvals: ldvals[s1].append(str(v))
	216	else : ldvals[s1] = [str(v)]
	217	if s2 in ldvals: ldvals[s2].append(str(v))
	218	else : ldvals[s2] = [str(v)]
	219
	220	# print the snps to the output file
	221	file = open(neigborhoodfile, "w")
	222
	223	for k,v in snps.items():
	224	ldv = ldvals[k]
	225	if debug_flag == True:
	226	print >> file, "%s\t%s\t%s" % (k, ",".join(v), ",".join(ldv))
	227	else:
	228	print >> file, "%s\t%s" % (k, ",".join(v))
	229
	230	file.close()
	231
	232
	233	def read_list(filename):
	234	file = open(filename, "r")
	235	list = {}
	236
	237	for line in file:
	238	list[line.strip()] = 1
	239
	240	file.close()
	241	return list
	242
	243	def usage():
	244	f = stderr
	245	print >> f, "usage:"
	246	print >> f, "pagetag [options] input.txt snps.txt neighborhood.txt"
	247	print >> f, "where input.txt is the prettybase file"
	248	print >> f, "where snps.txt is the first output file with the snps"
	249	print >> f, "where neighborhood.txt is the output neighborhood file"
	250	print >> f, "where the options are:"
	251	print >> f, "-h,--help : print usage and quit"
	252	print >> f, "-d,--debug: print debug information"
	253	print >> f, "-r,--rsquare: the rsquare threshold (default : 0.64)"
	254	print >> f, "-f,--freq : the minimum MAF required (default: 0.0)"
	255	print >> f, "-s,--sample : a list of samples to be clustered"
	256
	257	if __name__ == "__main__":
	258	try:
	259	opts, args = getopt(argv[1:], "hds:r:f:",\
	260	["help", "debug", "rsquare=","freq=", "sample="])
	261	except GetoptError, err:
	262	print str(err)
	263	usage()
	264	exit(2)
	265
	266	rsquare = 0.64
	267	minorallelefrequency = 0.0
	268	samples = None
	269
	270	for o, a in opts:
	271	if o in ("-h", "--help"):
	272	usage()
	273	exit()
	274	elif o in ("-d", "--debug"):
	275	debug_flag = True
	276	elif o in ("-r", "--rsquare"):
	277	rsquare = float(a)
	278	elif o in ("-f", "--freq"):
	279	minorallelefrequency = float(a)
	280	elif o in ("-s", "--sample"):
	281	samples = read_list(a)
	282	else:
	283	assert False, "unhandled option"
	284
	285	if rsquare < 0.00 or rsquare > 1.00:
	286	print >> stderr, "input value of rsquare should be in [0.00, 1.00]"
	287	exit(3)
	288
	289	if minorallelefrequency < 0.0 or minorallelefrequency > 0.5:
	290	print >> stderr, "input value of MAF should be (0.00,0.50]"
	291	exit(4)
	292
	293	if len(args) != 3:
	294	usage()
	295	exit(5)
	296
	297	main(args[0], args[1], args[2], rsquare, minorallelefrequency, samples)

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/human_genome_variation/pagetag.py @ 2

異なるフォーマットでダウンロード: