Context Navigation

microsats_mutability.py @ 2

リビジョン 2, 22.4 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central

行番号
1	#!/usr/bin/env python
2	#Guruprasad Ananda
3	"""
4	This tool computes microsatellite mutability for the orthologous microsatellites fetched from 'Extract Orthologous Microsatellites from pair-wise alignments' tool.
5	"""
6	from galaxy import eggs
7	import sys, string, re, commands, tempfile, os, fileinput
8	from galaxy.tools.util.galaxyops import *
9	from bx.intervals.io import *
10	from bx.intervals.operations import quicksect
11
12	fout = open(sys.argv[2],'w')
13	p_group = int(sys.argv[3]) #primary "group-by" feature
14	p_bin_size = int(sys.argv[4])
15	s_group = int(sys.argv[5]) #sub-group by feature
16	s_bin_size = int(sys.argv[6])
17	mono_threshold = 9
18	non_mono_threshold = 4
19	p_group_cols = [p_group, p_group+7]
20	s_group_cols = [s_group, s_group+7]
21	num_generations = int(sys.argv[7])
22	region = sys.argv[8]
23	int_file = sys.argv[9]
24	if int_file != "None": #User has specified an interval file
25	try:
26	fint = open(int_file, 'r')
27	dbkey_i = sys.argv[10]
28	chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg( sys.argv[11] )
29	except:
30	stop_err("Unable to open input Interval file")
31
32	def stop_err(msg):
33	sys.stderr.write(msg)
34	sys.exit()
35
36	def reverse_complement(text):
37	DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" )
38	comp = [ch for ch in text.translate(DNA_COMP)]
39	comp.reverse()
40	return "".join(comp)
41
42	def get_unique_elems(elems):
43	seen=set()
44	return[x for x in elems if x not in seen and not seen.add(x)]
45
46	def get_binned_lists(uniqlist, binsize):
47	binnedlist=[]
48	uniqlist.sort()
49	start = int(uniqlist[0])
50	bin_ind=0
51	l_ind=0
52	binnedlist.append([])
53	while l_ind < len(uniqlist):
54	elem = int(uniqlist[l_ind])
55	if elem in range(start,start+binsize):
56	binnedlist[bin_ind].append(elem)
57	else:
58	start += binsize
59	bin_ind += 1
60	binnedlist.append([])
61	binnedlist[bin_ind].append(elem)
62	l_ind += 1
63	return binnedlist
64
65	def fetch_weight(H,C,t):
66	if (H-(C-H)) < t:
67	return 2.0
68	else:
69	return 1.0
70
71	def mutabilityEstimator(repeats1,repeats2,thresholds):
72	mut_num = 0.0 #Mutability Numerator
73	mut_den = 0.0 #Mutability denominator
74	for ind,H in enumerate(repeats1):
75	C = repeats2[ind]
76	t = thresholds[ind]
77	w = fetch_weight(H,C,t)
78	mut_num += ((H-C)(H-C)w)
79	mut_den += w
80	return [mut_num, mut_den]
81
82	def output_writer(blk, blk_lines):
83	global winspecies, speciesind
84	all_elems_1=[]
85	all_elems_2=[]
86	all_s_elems_1=[]
87	all_s_elems_2=[]
88	for bline in blk_lines:
89	if not(bline):
90	continue
91	items = bline.split('\t')
92	seq1 = items[1]
93	start1 = items[2]
94	end1 = items[3]
95	seq2 = items[8]
96	start2 = items[9]
97	end2 = items[10]
98	if p_group_cols[0] == 6:
99	items[p_group_cols[0]] = int(items[p_group_cols[0]])
100	items[p_group_cols[1]] = int(items[p_group_cols[1]])
101	if s_group_cols[0] == 6:
102	items[s_group_cols[0]] = int(items[s_group_cols[0]])
103	items[s_group_cols[1]] = int(items[s_group_cols[1]])
104	all_elems_1.append(items[p_group_cols[0]]) #primary col elements for species 1
105	all_elems_2.append(items[p_group_cols[1]]) #primary col elements for species 2
106	if s_group_cols[0] != -1: #sub-group is not None
107	all_s_elems_1.append(items[s_group_cols[0]]) #secondary col elements for species 1
108	all_s_elems_2.append(items[s_group_cols[1]]) #secondary col elements for species 2
109	uniq_elems_1 = get_unique_elems(all_elems_1)
110	uniq_elems_2 = get_unique_elems(all_elems_2)
111	if s_group_cols[0] != -1:
112	uniq_s_elems_1 = get_unique_elems(all_s_elems_1)
113	uniq_s_elems_2 = get_unique_elems(all_s_elems_2)
114	mut1={}
115	mut2={}
116	count1 = {}
117	count2 = {}
118	"""
119	if p_group_cols[0] == 7: #i.e. the option chosen is group-by unit(AG, GTC, etc)
120	uniq_elems_1 = get_unique_units(j.sort(lambda x, y: len(x)-len(y)))
121	"""
122	if p_group_cols[0] == 6: #i.e. the option chosen is group-by repeat number.
123	uniq_elems_1 = get_binned_lists(uniq_elems_1,p_bin_size)
124	uniq_elems_2 = get_binned_lists(uniq_elems_2,p_bin_size)
125
126	if s_group_cols[0] == 6: #i.e. the option chosen is subgroup-by repeat number.
127	uniq_s_elems_1 = get_binned_lists(uniq_s_elems_1,s_bin_size)
128	uniq_s_elems_2 = get_binned_lists(uniq_s_elems_2,s_bin_size)
129
130	for pitem1 in uniq_elems_1:
131	#repeats1 = []
132	#repeats2 = []
133	thresholds = []
134	if s_group_cols[0] != -1: #Sub-group by feature is not None
135	for sitem1 in uniq_s_elems_1:
136	repeats1 = []
137	repeats2 = []
138	if type(sitem1) == type(''):
139	sitem1 = sitem1.strip()
140	for bline in blk_lines:
141	belems = bline.split('\t')
142	if type(pitem1) == list:
143	if p_group_cols[0] == 6:
144	belems[p_group_cols[0]] = int(belems[p_group_cols[0]])
145	if belems[p_group_cols[0]] in pitem1:
146	if belems[s_group_cols[0]]==sitem1:
147	repeats1.append(int(belems[6]))
148	repeats2.append(int(belems[13]))
149	if belems[4] == 'mononucleotide':
150	thresholds.append(mono_threshold)
151	else:
152	thresholds.append(non_mono_threshold)
153	mut1[str(pitem1)+'\t'+str(sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
154	if region == 'align':
155	count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2))
156	else:
157	if winspecies == 1:
158	count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1)
159	elif winspecies == 2:
160	count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2)
161	else:
162	if type(sitem1) == list:
163	if s_group_cols[0] == 6:
164	belems[s_group_cols[0]] = int(belems[s_group_cols[0]])
165	if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]] in sitem1:
166	repeats1.append(int(belems[6]))
167	repeats2.append(int(belems[13]))
168	if belems[4] == 'mononucleotide':
169	thresholds.append(mono_threshold)
170	else:
171	thresholds.append(non_mono_threshold)
172	mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
173	if region == 'align':
174	count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2))
175	else:
176	if winspecies == 1:
177	count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats1)
178	elif winspecies == 2:
179	count1[str(pitem1)+'\t'+str(sitem1)]=sum(repeats2)
180	else:
181	if belems[p_group_cols[0]]==pitem1 and belems[s_group_cols[0]]==sitem1:
182	repeats1.append(int(belems[6]))
183	repeats2.append(int(belems[13]))
184	if belems[4] == 'mononucleotide':
185	thresholds.append(mono_threshold)
186	else:
187	thresholds.append(non_mono_threshold)
188	mut1["%s\t%s" %(pitem1,sitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
189	if region == 'align':
190	count1[str(pitem1)+'\t'+str(sitem1)]=min(sum(repeats1),sum(repeats2))
191	else:
192	if winspecies == 1:
193	count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats1)
194	elif winspecies == 2:
195	count1["%s\t%s" %(pitem1,sitem1)]=sum(repeats2)
196	else: #Sub-group by feature is None
197	for bline in blk_lines:
198	belems = bline.split('\t')
199	if type(pitem1) == list:
200	#print >>sys.stderr, "item: " + str(item1)
201	if p_group_cols[0] == 6:
202	belems[p_group_cols[0]] = int(belems[p_group_cols[0]])
203	if belems[p_group_cols[0]] in pitem1:
204	repeats1.append(int(belems[6]))
205	repeats2.append(int(belems[13]))
206	if belems[4] == 'mononucleotide':
207	thresholds.append(mono_threshold)
208	else:
209	thresholds.append(non_mono_threshold)
210	else:
211	if belems[p_group_cols[0]]==pitem1:
212	repeats1.append(int(belems[6]))
213	repeats2.append(int(belems[13]))
214	if belems[4] == 'mononucleotide':
215	thresholds.append(mono_threshold)
216	else:
217	thresholds.append(non_mono_threshold)
218	mut1["%s" %(pitem1)]=mutabilityEstimator(repeats1,repeats2,thresholds)
219	if region == 'align':
220	count1["%s" %(pitem1)]=min(sum(repeats1),sum(repeats2))
221	else:
222	if winspecies == 1:
223	count1[str(pitem1)]=sum(repeats1)
224	elif winspecies == 2:
225	count1[str(pitem1)]=sum(repeats2)
226
227	for pitem2 in uniq_elems_2:
228	#repeats1 = []
229	#repeats2 = []
230	thresholds = []
231	if s_group_cols[0] != -1: #Sub-group by feature is not None
232	for sitem2 in uniq_s_elems_2:
233	repeats1 = []
234	repeats2 = []
235	if type(sitem2)==type(''):
236	sitem2 = sitem2.strip()
237	for bline in blk_lines:
238	belems = bline.split('\t')
239	if type(pitem2) == list:
240	if p_group_cols[0] == 6:
241	belems[p_group_cols[1]] = int(belems[p_group_cols[1]])
242	if belems[p_group_cols[1]] in pitem2 and belems[s_group_cols[1]]==sitem2:
243	repeats2.append(int(belems[13]))
244	repeats1.append(int(belems[6]))
245	if belems[4] == 'mononucleotide':
246	thresholds.append(mono_threshold)
247	else:
248	thresholds.append(non_mono_threshold)
249	mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
250	#count2[str(pitem2)+'\t'+str(sitem2)]=len(repeats2)
251	if region == 'align':
252	count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2))
253	else:
254	if winspecies == 1:
255	count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2)
256	elif winspecies == 2:
257	count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1)
258	else:
259	if type(sitem2) == list:
260	if s_group_cols[0] == 6:
261	belems[s_group_cols[1]] = int(belems[s_group_cols[1]])
262	if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]] in sitem2:
263	repeats2.append(int(belems[13]))
264	repeats1.append(int(belems[6]))
265	if belems[4] == 'mononucleotide':
266	thresholds.append(mono_threshold)
267	else:
268	thresholds.append(non_mono_threshold)
269	mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
270	if region == 'align':
271	count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2))
272	else:
273	if winspecies == 1:
274	count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2)
275	elif winspecies == 2:
276	count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1)
277	else:
278	if belems[p_group_cols[1]]==pitem2 and belems[s_group_cols[1]]==sitem2:
279	repeats1.append(int(belems[13]))
280	repeats2.append(int(belems[6]))
281	if belems[4] == 'mononucleotide':
282	thresholds.append(mono_threshold)
283	else:
284	thresholds.append(non_mono_threshold)
285	mut2["%s\t%s" %(pitem2,sitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
286	if region == 'align':
287	count2["%s\t%s" %(pitem2,sitem2)]=min(sum(repeats1),sum(repeats2))
288	else:
289	if winspecies == 1:
290	count2["%s\t%s" %(pitem2,sitem2)]=len(repeats2)
291	elif winspecies == 2:
292	count2["%s\t%s" %(pitem2,sitem2)]=len(repeats1)
293	else: #Sub-group by feature is None
294	for bline in blk_lines:
295	belems = bline.split('\t')
296	if type(pitem2) == list:
297	if p_group_cols[0] == 6:
298	belems[p_group_cols[1]] = int(belems[p_group_cols[1]])
299	if belems[p_group_cols[1]] in pitem2:
300	repeats2.append(int(belems[13]))
301	repeats1.append(int(belems[6]))
302	if belems[4] == 'mononucleotide':
303	thresholds.append(mono_threshold)
304	else:
305	thresholds.append(non_mono_threshold)
306	else:
307	if belems[p_group_cols[1]]==pitem2:
308	repeats2.append(int(belems[13]))
309	repeats1.append(int(belems[6]))
310	if belems[4] == 'mononucleotide':
311	thresholds.append(mono_threshold)
312	else:
313	thresholds.append(non_mono_threshold)
314	mut2["%s" %(pitem2)]=mutabilityEstimator(repeats2,repeats1,thresholds)
315	if region == 'align':
316	count2["%s" %(pitem2)]=min(sum(repeats1),sum(repeats2))
317	else:
318	if winspecies == 1:
319	count2["%s" %(pitem2)]=sum(repeats2)
320	elif winspecies == 2:
321	count2["%s" %(pitem2)]=sum(repeats1)
322	for key in mut1.keys():
323	if key in mut2.keys():
324	mut = (mut1[key][0]+mut2[key][0])/(mut1[key][1]+mut2[key][1])
325	count = count1[key]
326	del mut2[key]
327	else:
328	unit_found = False
329	if p_group_cols[0] == 7 or s_group_cols[0] == 7: #if it is Repeat Unit (AG, GCT etc.) check for reverse-complements too
330	if p_group_cols[0] == 7:
331	this,other = 0,1
332	else:
333	this,other = 1,0
334	groups1 = key.split('\t')
335	mutn = mut1[key][0]
336	mutd = mut1[key][1]
337	count = 0
338	for key2 in mut2.keys():
339	groups2 = key2.split('\t')
340	if groups1[other] == groups2[other]:
341	if groups1[this] in groups2[this]2 or reverse_complement(groups1[this]) in groups2[this]2:
342	#mut = (mut1[key][0]+mut2[key2][0])/(mut1[key][1]+mut2[key2][1])
343	mutn += mut2[key2][0]
344	mutd += mut2[key2][1]
345	count += int(count2[key2])
346	unit_found = True
347	del mut2[key2]
348	#break
349	if unit_found:
350	mut = mutn/mutd
351	else:
352	mut = mut1[key][0]/mut1[key][1]
353	count = count1[key]
354	mut = "%.2e" %(mut/num_generations)
355	if region == 'align':
356	print >>fout, str(blk) + '\t'+seq1 + '\t' + seq2 + '\t' +key.strip()+ '\t'+str(mut) + '\t'+ str(count)
357	elif region == 'win':
358	fout.write("%s\t%s\t%s\t%s\n" %(blk,key.strip(),mut,count))
359	fout.flush()
360
361	#catch any remaining repeats, for instance if the orthologous position contained different repeat units
362	for remaining_key in mut2.keys():
363	mut = mut2[remaining_key][0]/mut2[remaining_key][1]
364	mut = "%.2e" %(mut/num_generations)
365	count = count2[remaining_key]
366	if region == 'align':
367	print >>fout, str(blk) + '\t'+seq1 + '\t'+seq2 + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count)
368	elif region == 'win':
369	fout.write("%s\t%s\t%s\t%s\n" %(blk,remaining_key.strip(),mut,count))
370	fout.flush()
371	#print >>fout, blk + '\t'+remaining_key.strip()+ '\t'+str(mut)+ '\t'+ str(count)
372
373	def counter(node, start, end, report_func):
374	if start <= node.start < end and start < node.end <= end:
375	report_func(node)
376	if node.right:
377	counter(node.right, start, end, report_func)
378	if node.left:
379	counter(node.left, start, end, report_func)
380	elif node.start < start and node.right:
381	counter(node.right, start, end, report_func)
382	elif node.start >= end and node.left and node.left.maxend > start:
383	counter(node.left, start, end, report_func)
384
385
386	def main():
387	infile = sys.argv[1]
388
389	for i, line in enumerate( file ( infile )):
390	line = line.rstrip('\r\n')
391	if len( line )>0 and not line.startswith( '#' ):
392	elems = line.split( '\t' )
393	break
394	if i == 30:
395	break # Hopefully we'll never get here...
396
397	if len( elems ) != 15:
398	stop_err( "This tool only works on tabular data output by 'Extract Orthologous Microsatellites from pair-wise alignments' tool. The data in your input dataset is either missing or not formatted properly." )
399	global winspecies, speciesind
400	if region == 'win':
401	if dbkey_i in elems[1]:
402	winspecies = 1
403	speciesind = 1
404	elif dbkey_i in elems[8]:
405	winspecies = 2
406	speciesind = 8
407	else:
408	stop_err("The species build corresponding to your interval file is not present in the Microsatellite file.")
409
410	fin = open(infile, 'r')
411	skipped = 0
412	blk=0
413	win=0
414	linestr=""
415
416	if region == 'win':
417
418	msats = NiceReaderWrapper( fileinput.FileInput( infile ),
419	chrom_col = speciesind,
420	start_col = speciesind+1,
421	end_col = speciesind+2,
422	strand_col = -1,
423	fix_strand = True)
424	msatTree = quicksect.IntervalTree()
425	for item in msats:
426	if type( item ) is GenomicInterval:
427	msatTree.insert( item, msats.linenum, item.fields )
428
429	for iline in fint:
430	try:
431	iline = iline.rstrip('\r\n')
432	if not(iline) or iline == "":
433	continue
434	ielems = iline.strip("\r\n").split('\t')
435	ichr = ielems[chr_col_i]
436	istart = int(ielems[start_col_i])
437	iend = int(ielems[end_col_i])
438	isrc = "%s.%s" %(dbkey_i,ichr)
439	if isrc not in msatTree.chroms:
440	continue
441	result = []
442	root = msatTree.chroms[isrc] #root node for the chrom
443	counter(root, istart, iend, lambda node: result.append( node ))
444	if not(result):
445	continue
446	tmpfile1 = tempfile.NamedTemporaryFile('wb+')
447	for node in result:
448	tmpfile1.write("%s\n" % "\t".join( node.other ))
449
450	tmpfile1.seek(0)
451	output_writer(iline, tmpfile1.readlines())
452	except:
453	skipped+=1
454	if skipped:
455	print "Skipped %d intervals as invalid." %(skipped)
456	elif region == 'align':
457	if s_group_cols[0] != -1:
458	print >>fout, "#Window\tSpecies_1\tSpecies_2\tGroupby_Feature\tSubGroupby_Feature\tMutability\tCount"
459	else:
460	print >>fout, "#Window\tSpecies_1\tWindow_Start\tWindow_End\tSpecies_2\tGroupby_Feature\tMutability\tCount"
461	prev_bnum = -1
462	try:
463	for line in fin:
464	line = line.strip("\r\n")
465	if not(line) or line == "":
466	continue
467	elems = line.split('\t')
468	try:
469	assert int(elems[0])
470	assert len(elems) == 15
471	except:
472	continue
473	new_bnum = int(elems[0])
474	if new_bnum != prev_bnum:
475	if prev_bnum != -1:
476	output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n'))
477	linestr = line + "\n"
478	else:
479	linestr += line
480	linestr += "\n"
481	prev_bnum = new_bnum
482	output_writer(prev_bnum, linestr.strip().replace('\r','\n').split('\n'))
483	except Exception, ea:
484	print >>sys.stderr, ea
485	skipped += 1
486	if skipped:
487	print "Skipped %d lines as invalid." %(skipped)
488	if __name__ == "__main__":
489	main()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/regVariation/microsats_mutability.py @ 2

異なるフォーマットでダウンロード: