root/galaxy-central/tools/ngs_rna/tophat_wrapper.py

リビジョン 2, 11.5 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python
2
3import optparse, os, shutil, subprocess, sys, tempfile, fileinput
4
5def stop_err( msg ):
6    sys.stderr.write( "%s\n" % msg )
7    sys.exit()
8
9def __main__():
10    #Parse Command Line
11    parser = optparse.OptionParser()
12    parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' )
13    parser.add_option( '-C', '--coverage-output', dest='coverage_output_file', help='Coverage output file; formate is WIG.' )
14    parser.add_option( '-J', '--junctions-output', dest='junctions_output_file', help='Junctions output file; formate is BED.' )
15    parser.add_option( '-H', '--hits-output', dest='accepted_hits_output_file', help='Accepted hits output file; formate is SAM.' )
16    parser.add_option( '', '--own-file', dest='own_file', help='' )
17    parser.add_option( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' )
18    parser.add_option( '-r', '--mate-inner-dist', dest='mate_inner_dist', help='This is the expected (mean) inner distance between mate pairs. \
19                                                                                For, example, for paired end runs with fragments selected at 300bp, \
20                                                                                where each end is 50bp, you should set -r to be 200. There is no default, \
21                                                                                and this parameter is required for paired end runs.')
22    parser.add_option( '', '--mate-std-dev', dest='mate_std_dev', help='Standard deviation of distribution on inner distances between male pairs.' )
23    parser.add_option( '-a', '--min-anchor-length', dest='min_anchor_length',
24                        help='The "anchor length". TopHat will report junctions spanned by reads with at least this many bases on each side of the junction.' )
25    parser.add_option( '-m', '--splice-mismatches', dest='splice_mismatches', help='The maximum number of mismatches that can appear in the anchor region of a spliced alignment.' )
26    parser.add_option( '-i', '--min-intron-length', dest='min_intron_length',
27                        help='The minimum intron length. TopHat will ignore donor/acceptor pairs closer than this many bases apart.' )
28    parser.add_option( '-I', '--max-intron-length', dest='max_intron_length',
29                        help='The maximum intron length. When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read.' )
30    parser.add_option( '-F', '--junction_filter', dest='junction_filter', help='Filter out junctions supported by too few alignments (number of reads divided by average depth of coverage)' )
31    parser.add_option( '-g', '--max_multihits', dest='max_multihits', help='Maximum number of alignments to be allowed' )
32    parser.add_option( '', '--microexon-search', action="store_true", dest='microexon_search', help='With this option, the pipeline will attempt to find alignments incident to microexons. Works only for reads 50bp or longer.')
33    parser.add_option( '', '--closure-search', action="store_true", dest='closure_search', help='Enables the mate pair closure-based search for junctions. Closure-based search should only be used when the expected inner distance between mates is small (<= 50bp)')
34    parser.add_option( '', '--no-closure-search', action="store_false", dest='closure_search' )
35    parser.add_option( '', '--coverage-search', action="store_true", dest='coverage_search', help='Enables the coverage based search for junctions. Use when coverage search is disabled by default (such as for reads 75bp or longer), for maximum sensitivity.')
36    parser.add_option( '', '--no-coverage-search', action="store_false", dest='coverage_search' )
37    parser.add_option( '', '--min-segment-intron', dest='min_segment_intron', help='Minimum intron length that may be found during split-segment search' )
38    parser.add_option( '', '--max-segment-intron', dest='max_segment_intron', help='Maximum intron length that may be found during split-segment search' )
39    parser.add_option( '', '--min-closure-exon', dest='min_closure_exon', help='Minimum length for exonic hops in potential splice graph' )
40    parser.add_option( '', '--min-closure-intron', dest='min_closure_intron', help='Minimum intron length that may be found during closure search' )
41    parser.add_option( '', '--max-closure-intron', dest='max_closure_intron', help='Maximum intron length that may be found during closure search' )
42    parser.add_option( '', '--min-coverage-intron', dest='min_coverage_intron', help='Minimum intron length that may be found during coverage search' )
43    parser.add_option( '', '--max-coverage-intron', dest='max_coverage_intron', help='Maximum intron length that may be found during coverage search' )
44    parser.add_option( '', '--seg-mismatches', dest='seg_mismatches', help='Number of mismatches allowed in each segment alignment for reads mapped independently' )
45    parser.add_option( '', '--seg-length', dest='seg_length', help='Minimum length of read segments' )
46   
47    # Wrapper options.
48    parser.add_option( '-1', '--input1', dest='input1', help='The (forward or single-end) reads file in Sanger FASTQ format' )
49    parser.add_option( '-2', '--input2', dest='input2', help='The reverse reads file in Sanger FASTQ format' )
50    parser.add_option( '', '--single-paired', dest='single_paired', help='' )
51    parser.add_option( '', '--settings', dest='settings', help='' )
52   
53    (options, args) = parser.parse_args()
54   
55    #sys.stderr.write('*'*50+'\n'+str(options)+'\n'+'*'*50+'\n')
56   
57    # Creat bowtie index if necessary.
58    tmp_index_dir = tempfile.mkdtemp()
59    if options.own_file != 'None':
60        index_path = os.path.join( tmp_index_dir, os.path.split( options.own_file )[1] )
61        cmd_index = 'bowtie-build -f %s %s' % ( options.own_file, index_path )
62        try:
63            tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
64            tmp_stderr = open( tmp, 'wb' )
65            proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
66            returncode = proc.wait()
67            tmp_stderr.close()
68            # get stderr, allowing for case where it's very large
69            tmp_stderr = open( tmp, 'rb' )
70            stderr = ''
71            buffsize = 1048576
72            try:
73                while True:
74                    stderr += tmp_stderr.read( buffsize )
75                    if not stderr or len( stderr ) % buffsize != 0:
76                        break
77            except OverflowError:
78                pass
79            tmp_stderr.close()
80            if returncode != 0:
81                raise Exception, stderr
82        except Exception, e:
83            if os.path.exists( tmp_index_dir ):
84                shutil.rmtree( tmp_index_dir )
85            stop_err( 'Error indexing reference sequence\n' + str( e ) )
86    else:
87        index_path = options.index_path
88   
89    # Build tophat command.
90    tmp_output_dir = tempfile.mkdtemp()
91    cmd = 'tophat -o %s %s %s %s'
92    reads = options.input1
93    if options.input2 != 'None':
94        reads += ' ' + options.input2
95    opts = '-p %s' % options.num_threads
96    if options.single_paired == 'paired':
97        opts += ' -r %s' % options.mate_inner_dist
98    if options.settings == 'preSet':
99        cmd = cmd % ( tmp_output_dir, opts, index_path, reads )
100    else:
101        try:
102            if int( options.min_anchor_length ) >= 3:
103                opts += ' -a %s' % options.min_anchor_length
104            else:
105                raise Exception, 'Minimum anchor length must be 3 or greater'
106            opts += ' -m %s' % options.splice_mismatches
107            opts += ' -i %s' % options.min_intron_length
108            opts += ' -I %s' % options.max_intron_length
109            if float( options.junction_filter ) != 0.0:
110                opts += ' -F %s' % options.junction_filter
111            opts += ' -g %s' % options.max_multihits
112            if options.coverage_search:
113                opts += ' --coverage-search --min-coverage-intron %s --max-coverage-intron %s' % ( options.min_coverage_intron, options.max_coverage_intron )
114            else:
115                opts += ' --no-coverage-search'
116            if options.closure_search:
117                opts += ' --closure-search --min-closure-exon %s --min-closure-intron %s --max-closure-intron %s'  % ( options.min_closure_exon, options.min_closure_intron, options.max_closure_intron )
118            else:
119                opts += ' --no-closure-search'
120            if options.microexon_search:
121                opts += ' --microexon-search'
122            if options.single_paired == 'paired':
123                opts += ' --mate-std-dev %s' % options.mate_std_dev
124            cmd = cmd % ( tmp_output_dir, opts, index_path, reads )
125        except Exception, e:
126            # Clean up temp dirs
127            if os.path.exists( tmp_index_dir ):
128                shutil.rmtree( tmp_index_dir )
129            if os.path.exists( tmp_output_dir ):
130                shutil.rmtree( tmp_output_dir )
131            stop_err( 'Something is wrong with the alignment parameters and the alignment could not be run\n' + str( e ) )
132    print cmd
133           
134    # Run
135    try:
136        tmp_out = tempfile.NamedTemporaryFile( dir=tmp_output_dir ).name
137        tmp_stdout = open( tmp_out, 'wb' )
138        tmp_err = tempfile.NamedTemporaryFile( dir=tmp_output_dir ).name
139        tmp_stderr = open( tmp_err, 'wb' )
140        proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_output_dir, stdout=tmp_stdout, stderr=tmp_stderr )
141        returncode = proc.wait()
142        tmp_stderr.close()
143        # get stderr, allowing for case where it's very large
144        tmp_stderr = open( tmp_err, 'rb' )
145        stderr = ''
146        buffsize = 1048576
147        try:
148            while True:
149                stderr += tmp_stderr.read( buffsize )
150                if not stderr or len( stderr ) % buffsize != 0:
151                    break
152        except OverflowError:
153            pass
154        tmp_stdout.close()
155        tmp_stderr.close()
156        if returncode != 0:
157            raise Exception, stderr
158    except Exception, e:
159        # Clean up temp dirs
160        if os.path.exists( tmp_output_dir ):
161            shutil.rmtree( tmp_output_dir )
162        stop_err( 'Error in tophat:\n' + str( e ) )
163       
164    # TODO: look for errors in program output.
165       
166    # Postprocessing: copy output files from tmp directory to specified files. Also need to remove header lines from SAM file.
167    try:
168        try:
169            shutil.copyfile( tmp_output_dir + "/coverage.wig", options.coverage_output_file )
170            shutil.copyfile( tmp_output_dir + "/junctions.bed", options.junctions_output_file )
171           
172            # Remove headers from SAM file in place.
173            in_header = True # Headers always at start of file.
174            for line in fileinput.input( tmp_output_dir + "/accepted_hits.sam", inplace=1 ):
175                if in_header and line.startswith("@"):
176                    continue
177                else:
178                    in_header = False
179                    sys.stdout.write( line )
180                   
181            # Copy SAM File.
182            shutil.copyfile( tmp_output_dir + "/accepted_hits.sam", options.accepted_hits_output_file )
183        except Exception, e:
184            stop_err( 'Error in tophat:\n' + str( e ) )
185    finally:
186        # Clean up temp dirs
187        if os.path.exists( tmp_index_dir ):
188            shutil.rmtree( tmp_index_dir )
189        if os.path.exists( tmp_output_dir ):
190            shutil.rmtree( tmp_output_dir )
191
192if __name__=="__main__": __main__()
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。