Context Navigation

lca.py @ 3

リビジョン 2, 6.8 KB (コミッタ: hatakeyama, 16 年前)
import galaxy-central

行番号
1	#!/usr/bin/env python
2	#Guruprasad Ananda
3	"""
4	Least Common Ancestor tool.
5	"""
6	import sys, string, re, commands, tempfile, random
7
8	def stop_err(msg):
9	sys.stderr.write(msg)
10	sys.exit()
11
12	def main():
13	try:
14	inputfile = sys.argv[1]
15	outfile = sys.argv[2]
16	rank_bound = int( sys.argv[3] )
17	"""
18	Mapping of ranks:
19	root :2,
20	superkingdom:3,
21	kingdom :4,
22	subkingdom :5,
23	superphylum :6,
24	phylum :7,
25	subphylum :8,
26	superclass :9,
27	class :10,
28	subclass :11,
29	superorder :12,
30	order :13,
31	suborder :14,
32	superfamily :15,
33	family :16,
34	subfamily :17,
35	tribe :18,
36	subtribe :19,
37	genus :20,
38	subgenus :21,
39	species :22,
40	subspecies :23,
41	"""
42	except:
43	stop_err("Syntax error: Use correct syntax: program infile outfile")
44
45	fin = open(sys.argv[1],'r')
46	for j, line in enumerate( fin ):
47	elems = line.strip().split('\t')
48	if len(elems) < 24:
49	stop_err("The format of the input dataset is incorrect. Taxonomy datatype should contain at least 24 columns.")
50	if j > 30:
51	break
52	cols = range(1,len(elems))
53	fin.close()
54
55	group_col = 0
56	tmpfile = tempfile.NamedTemporaryFile()
57
58	try:
59	"""
60	The -k option for the Posix sort command is as follows:
61	-k, --key=POS1[,POS2]
62	start a key at POS1, end it at POS2 (origin 1)
63	In other words, column positions start at 1 rather than 0, so
64	we need to add 1 to group_col.
65	if POS2 is not specified, the newer versions of sort will consider the entire line for sorting. To prevent this, we set POS2=POS1.
66	"""
67	command_line = "sort -f -k " + str(group_col+1) +"," + str(group_col+1) + " -o " + tmpfile.name + " " + inputfile
68	except Exception, exc:
69	stop_err( 'Initialization error -> %s' %str(exc) )
70
71	error_code, stdout = commands.getstatusoutput(command_line)
72
73	if error_code != 0:
74	stop_err( "Sorting input dataset resulted in error: %s: %s" %( error_code, stdout ))
75
76	prev_item = ""
77	prev_vals = []
78	remaining_vals = []
79	skipped_lines = 0
80	fout = open(outfile, "w")
81	block_valid = False
82
83
84	for ii, line in enumerate( file( tmpfile.name )):
85	if line and not line.startswith( '#' ) and len(line.split('\t')) >= 24: #Taxonomy datatype should have at least 24 columns
86	line = line.rstrip( '\r\n' )
87	try:
88	fields = line.split("\t")
89	item = fields[group_col]
90	if prev_item != "":
91	# At this level, we're grouping on values (item and prev_item) in group_col
92	if item == prev_item:
93	# Keep iterating and storing values until a new value is encountered.
94	if block_valid:
95	for i, col in enumerate(cols):
96	if col >= 3:
97	prev_vals[i].append(fields[col].strip())
98	if len(set(prev_vals[i])) > 1:
99	block_valid = False
100	break
101
102	else:
103	"""
104	When a new value is encountered, write the previous value and the
105	corresponding aggregate values into the output file. This works
106	due to the sort on group_col we've applied to the data above.
107	"""
108	out_list = ['']*24
109	out_list[0] = str(prev_item)
110	out_list[1] = str(prev_vals[0][0])
111	out_list[2] = str(prev_vals[1][0])
112
113	for k, col in enumerate(cols):
114	if col >= 3 and col < 24:
115	if len(set(prev_vals[k])) == 1:
116	out_list[col] = prev_vals[k][0]
117	else:
118	break
119	while k < 23:
120	out_list[k+1] = 'n'
121	k += 1
122
123	j = 0
124	while True:
125	try:
126	out_list.append(str(prev_vals[23+j][0]))
127	j += 1
128	except:
129	break
130
131	if rank_bound == 0:
132	print >>fout, '\t'.join(out_list).strip()
133	else:
134	if ''.join(out_list[rank_bound:24]) != 'n'*( 24 - rank_bound ):
135	print >>fout, '\t'.join(out_list).strip()
136
137	block_valid = True
138	prev_item = item
139	prev_vals = []
140	for col in cols:
141	val_list = []
142	val_list.append(fields[col].strip())
143	prev_vals.append(val_list)
144
145	else:
146	# This only occurs once, right at the start of the iteration.
147	block_valid = True
148	prev_item = item #groupby item
149	for col in cols: #everyting else
150	val_list = []
151	val_list.append(fields[col].strip())
152	prev_vals.append(val_list)
153
154	except:
155	skipped_lines += 1
156	else:
157	skipped_lines += 1
158
159	# Handle the last grouped value
160	out_list = ['']*24
161	out_list[0] = str(prev_item)
162	out_list[1] = str(prev_vals[0][0])
163	out_list[2] = str(prev_vals[1][0])
164
165	for k, col in enumerate(cols):
166	if col >= 3 and col < 24:
167	if len(set(prev_vals[k])) == 1:
168	out_list[col] = prev_vals[k][0]
169	else:
170	break
171	while k < 23:
172	out_list[k+1] = 'n'
173	k += 1
174
175	j = 0
176	while True:
177	try:
178	out_list.append(str(prev_vals[23+j][0]))
179	j += 1
180	except:
181	break
182
183	if rank_bound == 0:
184	print >>fout, '\t'.join(out_list).strip()
185	else:
186	if ''.join(out_list[rank_bound:24]) != 'n'*( 24 - rank_bound ):
187	print >>fout, '\t'.join(out_list).strip()
188
189	if skipped_lines > 0:
190	print "Skipped %d invalid lines." % ( skipped_lines )
191
192	if __name__ == "__main__":
193	main()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/taxonomy/lca.py @ 3

異なるフォーマットでダウンロード: