root/galaxy-central/tools/taxonomy/gi2taxonomy.py @ 3

リビジョン 2, 5.1 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1import sys
2import string
3import tempfile
4import subprocess
5from os import path
6
7# -----------------------------------------------------------------------------------
8
9def stop_err(msg):
10    sys.stderr.write(msg)
11    sys.exit()
12
13# -----------------------------------------------------------------------------------
14def gi_name_to_sorted_list(file_name, gi_col, name_col):
15    """ Suppose input file looks like this:
16        a       2
17        b       4
18        c       5
19        d       5
20        where column 1 is gi_col and column 0 is name_col
21        output of this function will look like this:
22        [[2, 'a'], [4, 'b'], [5, 'c'], [5, 'd']]
23    """
24
25    result = []
26    try:
27        F = open( file_name, 'r' )
28        try:
29            for line in F:
30                file_cols = string.split(line.rstrip(), '\t')
31                file_cols[gi_col] = int(  file_cols[gi_col] )
32                result.append( [ file_cols[gi_col], file_cols[name_col] ] )
33        except:
34            print >>sys.stderr, 'Non numeric GI field...skipping'
35           
36    except Exception, e:
37        stop_err('%s\n' % e)
38    F.close()
39    result.sort()
40    return result   
41
42# -----------------------------------------------------------------------------------
43
44def collapse_repeating_gis( L ):
45    """ Accepts 2-d array of gi-key pairs such as this
46        L = [
47                [gi1, 'key1'],
48                [gi1, 'key2'],
49                [gi2','key3']
50            ]
51
52         Returns this:
53         [      [gi1, 'key1', 'key2'],
54                [gi2, 'key3' ]
55         ]
56         
57         The first value in each sublist MUST be int
58    """
59    gi = []
60    i = 0
61    result = []
62   
63    try:
64        for item in L:
65            if i == 0:
66                prev = item[0]
67           
68            if prev != item[0]:
69                prev_L = []
70                prev_L.append( prev )
71                result.append( prev_L + gi )
72                prev = item[0]
73                gi =[]
74               
75            gi.append( item[1] )
76            i += 1
77           
78    except Exception, e:
79        stop_err('%s\n' % e)
80       
81    prev_L = []
82    prev_L.append( prev )
83    result.append( prev_L + gi )
84    del(L)
85    return result
86
87# -----------------------------------------------------------------------------------
88
89def get_taxId( gi2tax_file, gi_name_list, out_file ):
90    """ Maps GI numbers from gi_name_list to TaxId identifiers from gi2tax_file and
91        prints result to out_file
92
93        gi2tax_file MUST be sorted on GI column
94
95        gi_name_list is a list that look slike this:
96        [[1,'a'], [2,'b','x'], [7,'c'], [10,'d'], [90,'f']]
97        where the first element of each sublist is a GI number
98        this list MUST also be sorted on GI
99
100        This function searches through 117,000,000 rows of gi2taxId file from NCBI
101        in approximately 4 minutes. This time is not dependent on the length of
102        gi_name_list
103    """
104
105    L = gi_name_list.pop(0)
106    my_gi = L[0]
107    F = open( out_file, 'w' )
108    gi = 0
109    for line in file( gi2tax_file ):
110        line = line.rstrip()
111        gi, taxId = string.split( line, '\t' )
112        gi = int( gi )
113       
114        if gi > my_gi:
115            try:
116                while ( my_gi < gi ):
117                    L = gi_name_list.pop(0)
118                    my_gi = L[0]
119            except:
120                break
121   
122        if  gi == my_gi:
123            for i in range( 1,len( L ) ):
124                print >>F, '%s\t%s\t%d' % (L[i], taxId, gi)
125            try:
126                L = gi_name_list.pop(0)
127                my_gi = L[0]
128            except:
129                break
130
131# -----------------------------------------------------------------------------------
132
133
134try:
135    in_f          = sys.argv[1]            # input file with GIs
136    gi_col        = int( sys.argv[2] ) - 1 # column in input containing GIs
137    name_col      = int( sys.argv[3] ) - 1 # column containing sequence names
138    out_f         = sys.argv[4]            # output file
139    tool_data     = sys.argv[5]
140except:
141    stop_err('Check arguments\n')
142
143#  GI2TAX point to a file produced by concatenation of:
144#  ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_nucl.zip
145#  and
146#  ftp://ftp.ncbi.nih.gov/pub/taxonomy/gi_taxid_prot.zip
147#  a sorting using this command:
148#  sort -n -k 1
149
150GI2TAX = path.join( tool_data, 'taxonomy', 'gi_taxid_sorted.txt' )
151
152#  NAME_FILE and NODE_FILE point to names.dmg and nodes.dmg
153#  files contained within:
154#  ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
155
156NAME_FILE = path.join( tool_data, 'taxonomy', 'names.dmp' )
157NODE_FILE = path.join( tool_data, 'taxonomy', 'nodes.dmp' )
158
159g2n =  gi_name_to_sorted_list(in_f, gi_col, name_col)
160
161if len(g2n) == 0:
162    stop_err('No valid GI-containing fields. Please, check your column assignments.\n')
163
164tb_F = tempfile.NamedTemporaryFile('w')
165
166get_taxId( GI2TAX, collapse_repeating_gis( g2n ), tb_F.name )
167
168try:
169    tb_cmd = 'taxBuilder %s %s %s %s' % ( NAME_FILE, NODE_FILE, tb_F.name, out_f )
170    retcode = subprocess.call( tb_cmd, shell=True )
171    if retcode < 0:
172        print >>sys.stderr, "Execution of taxBuilder terminated by signal", -retcode
173except OSError, e:
174    print >>sys.stderr, "Execution of taxBuilder2tree failed:", e
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。