Context Navigation

pagetag.py

リビジョン 2, 9.1 KB (コミッタ: hatakeyama, 15 年前)
import galaxy-central
属性 svn:executable の設定値 ``*

行番号
1	#!/usr/bin/env python
2
3	"""
4	This accepts as input a file of the following format:
5
6	Site Sample Allele1 Allele2
7
8	for example:
9
10	000834 D001 G G
11	000834 D002 G G
12	000834 D003 G G
13	000834 D004 G G
14	000834 D005 N N
15	000834 E001 G G
16	000834 E002 G G
17	000834 E003 G G
18	000834 E004 G G
19	000834 E005 G G
20	000963 D001 T T
21	000963 D002 T T
22	000963 D003 T T
23	000963 D004 T T
24	000963 D005 N N
25	000963 E001 T T
26	000963 E002 N N
27	000963 E003 G T
28	000963 E004 G G
29	000963 E005 G T
30
31	and a rsquare threshold and outputs two files:
32
33	a) a file of input snps (one on each line). A SNP is identified by the "Site"
34	column in the input file
35
36	b) a file where each line has the following:
37	SNP list
38	where SNP is one of the SNPs and the "list" is a comma separated list of SNPs
39	that exceed the rsquare threshold with the first SNP.
40	"""
41
42	from sys import argv, stderr, exit
43	from getopt import getopt, GetoptError
44
45	__author__ = "Aakrosh Ratan"
46	__email__ = "ratan@bx.psu.edu"
47
48	# do we want the debug information to be printed?
49	debug_flag = False
50
51	# denote different combos of alleles in code
52	HOMC = str(1)
53	HOMR = str(2)
54	HETE = str(3)
55	OTHER = str(4)
56
57	indexcalculator = {(HOMC,HOMC) : 0,
58	(HOMC,HOMR) : 1,
59	(HOMC,HETE) : 2,
60	(HOMR,HOMC) : 3,
61	(HOMR,HOMR) : 4,
62	(HOMR,HETE) : 5,
63	(HETE,HOMC) : 6,
64	(HETE,HOMR) : 7,
65	(HETE,HETE) : 8}
66
67	def read_inputfile(filename, samples):
68	input = {}
69
70	file = open(filename, "r")
71
72	for line in file:
73	position,sample,allele1,allele2 = line.split()
74
75	# if the user specified a list of samples, then only use those samples
76	if samples != None and sample not in samples: continue
77
78	if position in input:
79	v = input[position]
80	v[sample] = (allele1,allele2)
81	else:
82	v = {sample : (allele1, allele2)}
83	input[position] = v
84
85	file.close()
86	return input
87
88	def annotate_locus(input, minorallelefrequency, snpsfile):
89	locus = {}
90	for k,v in input.items():
91	genotypes = [x for x in v.values()]
92	alleles = [y for x in genotypes for y in x]
93	alleleset = list(set(alleles))
94	alleleset = list(set(alleles) - set(["N","X"]))
95
96	if len(alleleset) == 2:
97	genotypevec = ""
98	num1 = len([x for x in alleles if x == alleleset[0]])
99	num2 = len([x for x in alleles if x == alleleset[1]])
100
101	if num1 > num2:
102	major = alleleset[0]
103	minor = alleleset[1]
104	minorfreq = (num2 * 1.0)/(num1 + num2)
105	else:
106	major = alleleset[1]
107	minor = alleleset[0]
108	minorfreq = (num1 * 1.0)/(num1 + num2)
109
110	if minorfreq < minorallelefrequency: continue
111
112	for gen in genotypes:
113	if gen == (major,major):
114	genotypevec += HOMC
115	elif gen == (minor,minor):
116	genotypevec += HOMR
117	elif gen == (major, minor) or gen == (minor, major):
118	genotypevec += HETE
119	else:
120	genotypevec += OTHER
121
122	locus[k] = genotypevec,minorfreq
123	elif len(alleleset) > 2:
124	print >> snpsfile, k
125	return locus
126
127	def calculateLD(loci, rsqthreshold):
128	snps = list(loci)
129	rsquare = {}
130
131	for index,loc1 in enumerate(snps):
132	for loc2 in snps[index + 1:]:
133	matrix = [0]*9
134
135	vec1 = loci[loc1][0]
136	vec2 = loci[loc2][0]
137
138	for gen in zip(vec1,vec2):
139	if gen[0] == OTHER or gen[1] == OTHER: continue
140	matrix[indexcalculator[gen]] += 1
141
142	n = sum(matrix)
143	x11 = 2*matrix[0] + matrix[2] + matrix[6]
144	x12 = 2*matrix[1] + matrix[2] + matrix[7]
145	x21 = 2*matrix[3] + matrix[6] + matrix[5]
146	x22 = 2*matrix[4] + matrix[6] + matrix[5]
147
148	p = (x11 + x12 + matrix[8] * 1.0) / (2 * n)
149	q = (x11 + x21 + matrix[8] * 1.0) / (2 * n)
150
151	p11 = p * q
152
153	oldp11 = p11
154	range = 0.0
155	converged = False
156	convergentcounter = 0
157	if p11 > 0.0:
158	while converged == False and convergentcounter < 100:
159	if (1.0 - p - q + p11) != 0.0 and oldp11 != 0.0:
160	num = matrix[8] * p11 * (1.0 - p - q + p11)
161	den = p11 * (1.0 - p - q + p11) + (p - p11)*(q - p11)
162	p11 = (x11 + (num/den))/(2.0*n)
163	range = p11/oldp11
164	if range >= 0.9999 and range <= 1.001:
165	converged = True
166	oldp11 = p11
167	convergentcounter += 1
168	else:
169	converged = True
170
171	dvalue = 0.0
172	if converged == True:
173	dvalue = p11 - (p * q)
174
175	if dvalue != 0.0:
176	rsq = (dvalue*2)/(pq(1-p)(1-q))
177	if rsq >= rsqthreshold:
178	rsquare["%s %s" % (loc1,loc2)] = rsq
179
180	return rsquare
181
182	def main(inputfile, snpsfile, neigborhoodfile, \
183	rsquare, minorallelefrequency, samples):
184	# read the input file
185	input = read_inputfile(inputfile, samples)
186	print >> stderr, "Read %d locations" % len(input)
187
188	# open the snpsfile to print
189	file = open(snpsfile, "w")
190
191	# annotate the inputs, remove the abnormal loci (which do not have 2 alleles
192	# and add the major and minor allele to each loci
193	loci = annotate_locus(input, minorallelefrequency, file)
194	print >> stderr, "Read %d interesting locations" % len(loci)
195
196	# print all the interesting loci as candidate snps
197	for k in loci.keys(): print >> file, k
198	file.close()
199	print >> stderr, "Finished creating the snpsfile"
200
201	# calculate the LD values and store it if it exceeds the threshold
202	lds = calculateLD(loci, rsquare)
203	print >> stderr, "Calculated all the LD values"
204
205	# create a list of SNPs
206	snps = {}
207	ldvals = {}
208	for k,v in lds.items():
209	s1,s2 = k.split()
210	if s1 in snps: snps[s1].append(s2)
211	else : snps[s1] = [s2]
212	if s2 in snps: snps[s2].append(s1)
213	else : snps[s2] = [s1]
214
215	if s1 in ldvals: ldvals[s1].append(str(v))
216	else : ldvals[s1] = [str(v)]
217	if s2 in ldvals: ldvals[s2].append(str(v))
218	else : ldvals[s2] = [str(v)]
219
220	# print the snps to the output file
221	file = open(neigborhoodfile, "w")
222
223	for k,v in snps.items():
224	ldv = ldvals[k]
225	if debug_flag == True:
226	print >> file, "%s\t%s\t%s" % (k, ",".join(v), ",".join(ldv))
227	else:
228	print >> file, "%s\t%s" % (k, ",".join(v))
229
230	file.close()
231
232
233	def read_list(filename):
234	file = open(filename, "r")
235	list = {}
236
237	for line in file:
238	list[line.strip()] = 1
239
240	file.close()
241	return list
242
243	def usage():
244	f = stderr
245	print >> f, "usage:"
246	print >> f, "pagetag [options] input.txt snps.txt neighborhood.txt"
247	print >> f, "where input.txt is the prettybase file"
248	print >> f, "where snps.txt is the first output file with the snps"
249	print >> f, "where neighborhood.txt is the output neighborhood file"
250	print >> f, "where the options are:"
251	print >> f, "-h,--help : print usage and quit"
252	print >> f, "-d,--debug: print debug information"
253	print >> f, "-r,--rsquare: the rsquare threshold (default : 0.64)"
254	print >> f, "-f,--freq : the minimum MAF required (default: 0.0)"
255	print >> f, "-s,--sample : a list of samples to be clustered"
256
257	if __name__ == "__main__":
258	try:
259	opts, args = getopt(argv[1:], "hds:r:f:",\
260	["help", "debug", "rsquare=","freq=", "sample="])
261	except GetoptError, err:
262	print str(err)
263	usage()
264	exit(2)
265
266	rsquare = 0.64
267	minorallelefrequency = 0.0
268	samples = None
269
270	for o, a in opts:
271	if o in ("-h", "--help"):
272	usage()
273	exit()
274	elif o in ("-d", "--debug"):
275	debug_flag = True
276	elif o in ("-r", "--rsquare"):
277	rsquare = float(a)
278	elif o in ("-f", "--freq"):
279	minorallelefrequency = float(a)
280	elif o in ("-s", "--sample"):
281	samples = read_list(a)
282	else:
283	assert False, "unhandled option"
284
285	if rsquare < 0.00 or rsquare > 1.00:
286	print >> stderr, "input value of rsquare should be in [0.00, 1.00]"
287	exit(3)
288
289	if minorallelefrequency < 0.0 or minorallelefrequency > 0.5:
290	print >> stderr, "input value of MAF should be (0.00,0.50]"
291	exit(4)
292
293	if len(args) != 3:
294	usage()
295	exit(5)
296
297	main(args[0], args[1], args[2], rsquare, minorallelefrequency, samples)

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/human_genome_variation/pagetag.py

異なるフォーマットでダウンロード: