root/galaxy-central/tools/metag_tools/megablast_wrapper.py @ 2

リビジョン 2, 4.7 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python
2"""
3run megablast for metagenomics data
4
5usage: %prog [options]
6   -d, --db_build=d: The database to use
7   -i, --input=i: Input FASTQ candidate file
8   -w, --word_size=w: Size of best perfect match
9   -c, --identity_cutoff=c: Report hits at or above this identity
10   -e, --eval_cutoff=e: Expectation value cutoff
11   -f, --filter_query=f: Filter out low complexity regions
12   -x, --index_dir=x: Data index directory
13   -o, --output=o: Output file
14   
15usage: %prog db_build input_file word_size identity_cutoff eval_cutoff filter_query index_dir output_file
16"""
17
18import os, subprocess, sys, tempfile
19from galaxy import eggs
20import pkg_resources; pkg_resources.require( "bx-python" )
21from bx.cookbook import doc_optparse
22
23assert sys.version_info[:2] >= ( 2, 4 )
24
25def stop_err( msg ):
26    sys.stderr.write( "%s\n" % msg )
27    sys.exit()
28
29def __main__():
30    #Parse Command Line
31    options, args = doc_optparse.parse( __doc__ )
32
33    db_build = options.db_build
34    query_filename = options.input.strip()
35    output_filename = options.output.strip()
36    mega_word_size = options.word_size        # -W
37    mega_iden_cutoff = options.identity_cutoff      # -p
38    mega_evalue_cutoff = options.eval_cutoff      # -e
39    mega_temp_output = tempfile.NamedTemporaryFile().name
40    mega_filter = options.filter_query           # -F
41    GALAXY_DATA_INDEX_DIR = options.index_dir
42    DB_LOC = "%s/blastdb.loc" % GALAXY_DATA_INDEX_DIR
43
44    # megablast parameters
45    try:
46        int( mega_word_size )   
47    except:
48        stop_err( 'Invalid value for word size' )
49    try:
50        float(mega_iden_cutoff)
51    except:
52        stop_err( 'Invalid value for identity cut-off' )
53    try:
54        float(mega_evalue_cutoff)
55    except:
56        stop_err( 'Invalid value for Expectation value' )
57
58    # prepare the database
59    db = {}
60    for i, line in enumerate( file( DB_LOC ) ):
61        line = line.rstrip( '\r\n' )
62        if not line or line.startswith( '#' ):
63            continue
64        fields = line.split( '\t' )
65        db[ fields[0] ] = fields[1]
66
67    if not db.has_key( db_build ):
68        stop_err( 'Cannot locate the target database. Please check your location file.' )
69
70    # arguments for megablast   
71    chunk = db[ ( db_build ) ]
72    megablast_command = "megablast -d %s -i %s -o %s -m 8 -a 8 -W %s -p %s -e %s -F %s > /dev/null " \
73        % ( chunk, query_filename, mega_temp_output, mega_word_size, mega_iden_cutoff, mega_evalue_cutoff, mega_filter )
74
75    print megablast_command
76
77    tmp = tempfile.NamedTemporaryFile().name
78    try:
79        tmp_stderr = open( tmp, 'wb' )
80        proc = subprocess.Popen( args=megablast_command, shell=True, stderr=tmp_stderr.fileno() )
81        returncode = proc.wait()
82        tmp_stderr.close()
83        # get stderr, allowing for case where it's very large
84        tmp_stderr = open( tmp, 'rb' )
85        stderr = ''
86        buffsize = 1048576
87        try:
88            while True:
89                stderr += tmp_stderr.read( buffsize )
90                if not stderr or len( stderr ) % buffsize != 0:
91                    break
92        except OverflowError:
93            pass
94        tmp_stderr.close()
95        if returncode != 0:
96            raise Exception, stderr
97        if os.path.exists( tmp ):
98            os.unlink( tmp )
99    except Exception, e:
100        if os.path.exists( mega_temp_output ):
101            os.unlink( mega_temp_output )
102        if os.path.exists( tmp ):
103            os.unlink( tmp )
104        stop_err( 'Error indexing reference sequence. ' + str( e ) )
105
106    output = open( output_filename, 'w' )
107    invalid_lines = 0
108    for i, line in enumerate( file( mega_temp_output ) ):
109        line = line.rstrip( '\r\n' )
110        fields = line.split()
111        try:
112            # get gi and length of that gi seq
113            gi, gi_len = fields[1].split( '_' )
114            # convert the last column (causing problem in filter tool) to float
115            fields[-1] = float( fields[-1] )
116            new_line = "%s\t%s\t%s\t%s\t%0.1f" % ( fields[0], gi, gi_len, '\t'.join( fields[2:-1] ), fields[-1] )
117        except:
118            new_line = line
119            invalid_lines += 1
120        output.write( "%s\n" % new_line )
121    output.close()
122
123    if os.path.exists( mega_temp_output ):
124        os.unlink( mega_temp_output ) #remove the tempfile that we just reformatted the contents of
125
126    if invalid_lines:
127        print "Unable to parse %d lines. Keep the default format." % invalid_lines
128
129    # megablast generates a file called error.log, if empty, delete it, if not, show the contents
130    if os.path.exists( './error.log' ):
131        for i, line in enumerate( file( './error.log' ) ):
132            line = line.rstrip( '\r\n' )
133            print line
134        os.remove( './error.log' )
135
136if __name__ == "__main__" : __main__()
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。