root/galaxy-central/tools/taxonomy/find_diag_hits.py

リビジョン 2, 4.8 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python
2
3"""
4tax_read_grouping.py <file in taxonomy format> <id column> <taxonomic ranks> <output format> <output file>
5    finds reads that only hit one taxonomic group. For example, consider the folliowing example:
6   
7    read1   mammalia
8    read1   insecta
9    read2   insecta
10   
11    in this case only read2 will be selected becuase it stays within insecta
12   
13    This program takes the following options:
14   
15    file in taxonomy format - dataset that complies with Galaxy's taxonomy format
16    id column               - integer specifying the number of column containing seq id (starting with 1)
17    taxonomic ranks         - a comma separated list of ranks from this list:
18   
19         superkingdom
20         kingdom
21         subkingdom
22         superphylum
23         phylum
24         subphylum
25         superclass
26         class
27         subclass
28         superorder
29         order
30         suborder
31         superfamily
32         family
33         subfamily
34         tribe
35         subtribe
36         genus
37         subgenus
38         species
39         subspecies
40   
41    output format           - reads or counts
42
43"""
44
45from galaxy import eggs
46import pkg_resources
47pkg_resources.require( 'pysqlite' )
48from pysqlite2 import dbapi2 as sqlite
49import string, sys, tempfile
50
51# This dictionary maps taxonomic ranks to fields of Taxonomy file
52taxRank = {
53        'root'        :2,
54        'superkingdom':3,
55        'kingdom'     :4,
56        'subkingdom'  :5,
57        'superphylum' :6,
58        'phylum'      :7,
59        'subphylum'   :8,
60        'superclass'  :9,
61        'class'       :10,
62        'subclass'    :11,
63        'superorder'  :12,
64        'ord'         :13,
65        'suborder'    :14,
66        'superfamily' :15,
67        'family'      :16,
68        'subfamily'   :17,
69        'tribe'       :18,
70        'subtribe'    :19,
71        'genus'       :20,
72        'subgenus'    :21,
73        'species'     :22,
74        'subspecies'  :23,
75        'order'       :13
76    }
77
78
79def stop_err(msg):
80    sys.stderr.write(msg)
81    sys.exit()
82
83
84db = tempfile.NamedTemporaryFile('w')
85
86try:
87    con = sqlite.connect(db.name)
88    cur = con.cursor()
89except:
90    stop_err('Cannot connect to %s\n') % db.name
91   
92try:
93    tax_file   = open(sys.argv[1], 'r')
94    id_col     = int(sys.argv[2]) - 1
95    taxa       = string.split(sys.argv[3].rstrip(),',')
96   
97    if sys.argv[4] == 'reads':
98        out_format = True
99    elif sys.argv[4] == 'counts':
100        out_format = False
101    else:
102        stop_err('Please specify "reads" or "counts" for output format\n')
103    out_file = open(sys.argv[5], 'w')
104   
105except:
106    stop_err('Check arguments\n')
107   
108if taxa[0] == 'None': stop_err('Please, use checkboxes to specify taxonomic ranks.\n')
109
110sql = ""
111for i in range(len(taxa)):
112        if taxa[i] == 'order': taxa[i] = 'ord' # SQL does not like fields to be named 'order'
113        sql += '%s text, ' % taxa[i]
114
115sql = sql.strip(', ')
116sql = 'create table tax (name varchar(50) not null, ' + sql + ')'
117
118   
119cur.execute(sql)
120
121invalid_line_number = 0
122
123try:
124    for line in tax_file:
125        fields = string.split(line.rstrip(), '\t')
126        if len(fields) < 24:
127            invalid_line_number += 1
128            continue # Skipping malformed taxonomy lines
129       
130        val_string = '"' + fields[id_col] + '", '
131       
132        for rank in taxa:
133            taxon = fields[taxRank[rank]]
134            val_string += '"%s", ' % taxon
135               
136        val_string = val_string.strip(', ')
137        val_string = "insert into tax values(" + val_string + ")"
138        cur.execute(val_string)
139except Exception, e:
140    stop_err('%s\n' % e)
141
142tax_file.close()   
143
144try:   
145    for rank in taxa:
146        cur.execute('create temporary table %s (name varchar(50), id text, rank text)' % rank  )
147        cur.execute('insert into %s select name, name || %s as id, %s from tax group by id' % ( rank, rank, rank ) )
148        cur.execute('create temporary table %s_count(name varchar(50), id text, rank text, N int)' % rank)
149        cur.execute('insert into %s_count select name, id, rank, count(*) from %s group by name' % ( rank, rank) )
150       
151        if rank == 'ord':
152            rankName = 'order'
153        else:
154            rankName = rank
155   
156        if out_format:
157            cur.execute('select name,rank from %s_count where N = 1 and length(rank)>1' % rank)
158            for item in cur.fetchall():
159                out_string = '%s\t%s\t' % ( item[0], item[1] )
160                out_string += rankName
161                print >>out_file, out_string
162        else:
163            cur.execute('select rank, count(*) from %s_count where N = 1 and length(rank)>1 group by rank' % rank)
164            for item in cur.fetchall():
165                out_string = '%s\t%s\t' % ( item[0], item[1] )
166                out_string += rankName
167                print >>out_file, out_string
168except Exception, e:
169    stop_err("%s\n" % e)
170   
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。