Context Navigation

find_diag_hits.py

リビジョン 2, 4.8 KB (コミッタ: hatakeyama, 15 年前)
import galaxy-central

行番号
1	#!/usr/bin/env python
2
3	"""
4	tax_read_grouping.py <file in taxonomy format> <id column> <taxonomic ranks> <output format> <output file>
5	finds reads that only hit one taxonomic group. For example, consider the folliowing example:
6
7	read1 mammalia
8	read1 insecta
9	read2 insecta
10
11	in this case only read2 will be selected becuase it stays within insecta
12
13	This program takes the following options:
14
15	file in taxonomy format - dataset that complies with Galaxy's taxonomy format
16	id column - integer specifying the number of column containing seq id (starting with 1)
17	taxonomic ranks - a comma separated list of ranks from this list:
18
19	superkingdom
20	kingdom
21	subkingdom
22	superphylum
23	phylum
24	subphylum
25	superclass
26	class
27	subclass
28	superorder
29	order
30	suborder
31	superfamily
32	family
33	subfamily
34	tribe
35	subtribe
36	genus
37	subgenus
38	species
39	subspecies
40
41	output format - reads or counts
42
43	"""
44
45	from galaxy import eggs
46	import pkg_resources
47	pkg_resources.require( 'pysqlite' )
48	from pysqlite2 import dbapi2 as sqlite
49	import string, sys, tempfile
50
51	# This dictionary maps taxonomic ranks to fields of Taxonomy file
52	taxRank = {
53	'root' :2,
54	'superkingdom':3,
55	'kingdom' :4,
56	'subkingdom' :5,
57	'superphylum' :6,
58	'phylum' :7,
59	'subphylum' :8,
60	'superclass' :9,
61	'class' :10,
62	'subclass' :11,
63	'superorder' :12,
64	'ord' :13,
65	'suborder' :14,
66	'superfamily' :15,
67	'family' :16,
68	'subfamily' :17,
69	'tribe' :18,
70	'subtribe' :19,
71	'genus' :20,
72	'subgenus' :21,
73	'species' :22,
74	'subspecies' :23,
75	'order' :13
76	}
77
78
79	def stop_err(msg):
80	sys.stderr.write(msg)
81	sys.exit()
82
83
84	db = tempfile.NamedTemporaryFile('w')
85
86	try:
87	con = sqlite.connect(db.name)
88	cur = con.cursor()
89	except:
90	stop_err('Cannot connect to %s\n') % db.name
91
92	try:
93	tax_file = open(sys.argv[1], 'r')
94	id_col = int(sys.argv[2]) - 1
95	taxa = string.split(sys.argv[3].rstrip(),',')
96
97	if sys.argv[4] == 'reads':
98	out_format = True
99	elif sys.argv[4] == 'counts':
100	out_format = False
101	else:
102	stop_err('Please specify "reads" or "counts" for output format\n')
103	out_file = open(sys.argv[5], 'w')
104
105	except:
106	stop_err('Check arguments\n')
107
108	if taxa[0] == 'None': stop_err('Please, use checkboxes to specify taxonomic ranks.\n')
109
110	sql = ""
111	for i in range(len(taxa)):
112	if taxa[i] == 'order': taxa[i] = 'ord' # SQL does not like fields to be named 'order'
113	sql += '%s text, ' % taxa[i]
114
115	sql = sql.strip(', ')
116	sql = 'create table tax (name varchar(50) not null, ' + sql + ')'
117
118
119	cur.execute(sql)
120
121	invalid_line_number = 0
122
123	try:
124	for line in tax_file:
125	fields = string.split(line.rstrip(), '\t')
126	if len(fields) < 24:
127	invalid_line_number += 1
128	continue # Skipping malformed taxonomy lines
129
130	val_string = '"' + fields[id_col] + '", '
131
132	for rank in taxa:
133	taxon = fields[taxRank[rank]]
134	val_string += '"%s", ' % taxon
135
136	val_string = val_string.strip(', ')
137	val_string = "insert into tax values(" + val_string + ")"
138	cur.execute(val_string)
139	except Exception, e:
140	stop_err('%s\n' % e)
141
142	tax_file.close()
143
144	try:
145	for rank in taxa:
146	cur.execute('create temporary table %s (name varchar(50), id text, rank text)' % rank )
147	cur.execute('insert into %s select name, name \|\| %s as id, %s from tax group by id' % ( rank, rank, rank ) )
148	cur.execute('create temporary table %s_count(name varchar(50), id text, rank text, N int)' % rank)
149	cur.execute('insert into %s_count select name, id, rank, count(*) from %s group by name' % ( rank, rank) )
150
151	if rank == 'ord':
152	rankName = 'order'
153	else:
154	rankName = rank
155
156	if out_format:
157	cur.execute('select name,rank from %s_count where N = 1 and length(rank)>1' % rank)
158	for item in cur.fetchall():
159	out_string = '%s\t%s\t' % ( item[0], item[1] )
160	out_string += rankName
161	print >>out_file, out_string
162	else:
163	cur.execute('select rank, count(*) from %s_count where N = 1 and length(rank)>1 group by rank' % rank)
164	for item in cur.fetchall():
165	out_string = '%s\t%s\t' % ( item[0], item[1] )
166	out_string += rankName
167	print >>out_file, out_string
168	except Exception, e:
169	stop_err("%s\n" % e)
170

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/taxonomy/find_diag_hits.py

異なるフォーマットでダウンロード: