[2] | 1 | #!/usr/bin/env python |
---|
| 2 | |
---|
| 3 | import optparse, os, shutil, subprocess, sys, tempfile, fileinput |
---|
| 4 | |
---|
| 5 | def stop_err( msg ): |
---|
| 6 | sys.stderr.write( "%s\n" % msg ) |
---|
| 7 | sys.exit() |
---|
| 8 | |
---|
| 9 | def __main__(): |
---|
| 10 | #Parse Command Line |
---|
| 11 | parser = optparse.OptionParser() |
---|
| 12 | parser.add_option( '-p', '--num-threads', dest='num_threads', help='Use this many threads to align reads. The default is 1.' ) |
---|
| 13 | parser.add_option( '-C', '--coverage-output', dest='coverage_output_file', help='Coverage output file; formate is WIG.' ) |
---|
| 14 | parser.add_option( '-J', '--junctions-output', dest='junctions_output_file', help='Junctions output file; formate is BED.' ) |
---|
| 15 | parser.add_option( '-H', '--hits-output', dest='accepted_hits_output_file', help='Accepted hits output file; formate is SAM.' ) |
---|
| 16 | parser.add_option( '', '--own-file', dest='own_file', help='' ) |
---|
| 17 | parser.add_option( '-D', '--indexes-path', dest='index_path', help='Indexes directory; location of .ebwt and .fa files.' ) |
---|
| 18 | parser.add_option( '-r', '--mate-inner-dist', dest='mate_inner_dist', help='This is the expected (mean) inner distance between mate pairs. \ |
---|
| 19 | For, example, for paired end runs with fragments selected at 300bp, \ |
---|
| 20 | where each end is 50bp, you should set -r to be 200. There is no default, \ |
---|
| 21 | and this parameter is required for paired end runs.') |
---|
| 22 | parser.add_option( '', '--mate-std-dev', dest='mate_std_dev', help='Standard deviation of distribution on inner distances between male pairs.' ) |
---|
| 23 | parser.add_option( '-a', '--min-anchor-length', dest='min_anchor_length', |
---|
| 24 | help='The "anchor length". TopHat will report junctions spanned by reads with at least this many bases on each side of the junction.' ) |
---|
| 25 | parser.add_option( '-m', '--splice-mismatches', dest='splice_mismatches', help='The maximum number of mismatches that can appear in the anchor region of a spliced alignment.' ) |
---|
| 26 | parser.add_option( '-i', '--min-intron-length', dest='min_intron_length', |
---|
| 27 | help='The minimum intron length. TopHat will ignore donor/acceptor pairs closer than this many bases apart.' ) |
---|
| 28 | parser.add_option( '-I', '--max-intron-length', dest='max_intron_length', |
---|
| 29 | help='The maximum intron length. When searching for junctions ab initio, TopHat will ignore donor/acceptor pairs farther than this many bases apart, except when such a pair is supported by a split segment alignment of a long read.' ) |
---|
| 30 | parser.add_option( '-F', '--junction_filter', dest='junction_filter', help='Filter out junctions supported by too few alignments (number of reads divided by average depth of coverage)' ) |
---|
| 31 | parser.add_option( '-g', '--max_multihits', dest='max_multihits', help='Maximum number of alignments to be allowed' ) |
---|
| 32 | parser.add_option( '', '--microexon-search', action="store_true", dest='microexon_search', help='With this option, the pipeline will attempt to find alignments incident to microexons. Works only for reads 50bp or longer.') |
---|
| 33 | parser.add_option( '', '--closure-search', action="store_true", dest='closure_search', help='Enables the mate pair closure-based search for junctions. Closure-based search should only be used when the expected inner distance between mates is small (<= 50bp)') |
---|
| 34 | parser.add_option( '', '--no-closure-search', action="store_false", dest='closure_search' ) |
---|
| 35 | parser.add_option( '', '--coverage-search', action="store_true", dest='coverage_search', help='Enables the coverage based search for junctions. Use when coverage search is disabled by default (such as for reads 75bp or longer), for maximum sensitivity.') |
---|
| 36 | parser.add_option( '', '--no-coverage-search', action="store_false", dest='coverage_search' ) |
---|
| 37 | parser.add_option( '', '--min-segment-intron', dest='min_segment_intron', help='Minimum intron length that may be found during split-segment search' ) |
---|
| 38 | parser.add_option( '', '--max-segment-intron', dest='max_segment_intron', help='Maximum intron length that may be found during split-segment search' ) |
---|
| 39 | parser.add_option( '', '--min-closure-exon', dest='min_closure_exon', help='Minimum length for exonic hops in potential splice graph' ) |
---|
| 40 | parser.add_option( '', '--min-closure-intron', dest='min_closure_intron', help='Minimum intron length that may be found during closure search' ) |
---|
| 41 | parser.add_option( '', '--max-closure-intron', dest='max_closure_intron', help='Maximum intron length that may be found during closure search' ) |
---|
| 42 | parser.add_option( '', '--min-coverage-intron', dest='min_coverage_intron', help='Minimum intron length that may be found during coverage search' ) |
---|
| 43 | parser.add_option( '', '--max-coverage-intron', dest='max_coverage_intron', help='Maximum intron length that may be found during coverage search' ) |
---|
| 44 | parser.add_option( '', '--seg-mismatches', dest='seg_mismatches', help='Number of mismatches allowed in each segment alignment for reads mapped independently' ) |
---|
| 45 | parser.add_option( '', '--seg-length', dest='seg_length', help='Minimum length of read segments' ) |
---|
| 46 | |
---|
| 47 | # Wrapper options. |
---|
| 48 | parser.add_option( '-1', '--input1', dest='input1', help='The (forward or single-end) reads file in Sanger FASTQ format' ) |
---|
| 49 | parser.add_option( '-2', '--input2', dest='input2', help='The reverse reads file in Sanger FASTQ format' ) |
---|
| 50 | parser.add_option( '', '--single-paired', dest='single_paired', help='' ) |
---|
| 51 | parser.add_option( '', '--settings', dest='settings', help='' ) |
---|
| 52 | |
---|
| 53 | (options, args) = parser.parse_args() |
---|
| 54 | |
---|
| 55 | #sys.stderr.write('*'*50+'\n'+str(options)+'\n'+'*'*50+'\n') |
---|
| 56 | |
---|
| 57 | # Creat bowtie index if necessary. |
---|
| 58 | tmp_index_dir = tempfile.mkdtemp() |
---|
| 59 | if options.own_file != 'None': |
---|
| 60 | index_path = os.path.join( tmp_index_dir, os.path.split( options.own_file )[1] ) |
---|
| 61 | cmd_index = 'bowtie-build -f %s %s' % ( options.own_file, index_path ) |
---|
| 62 | try: |
---|
| 63 | tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name |
---|
| 64 | tmp_stderr = open( tmp, 'wb' ) |
---|
| 65 | proc = subprocess.Popen( args=cmd_index, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() ) |
---|
| 66 | returncode = proc.wait() |
---|
| 67 | tmp_stderr.close() |
---|
| 68 | # get stderr, allowing for case where it's very large |
---|
| 69 | tmp_stderr = open( tmp, 'rb' ) |
---|
| 70 | stderr = '' |
---|
| 71 | buffsize = 1048576 |
---|
| 72 | try: |
---|
| 73 | while True: |
---|
| 74 | stderr += tmp_stderr.read( buffsize ) |
---|
| 75 | if not stderr or len( stderr ) % buffsize != 0: |
---|
| 76 | break |
---|
| 77 | except OverflowError: |
---|
| 78 | pass |
---|
| 79 | tmp_stderr.close() |
---|
| 80 | if returncode != 0: |
---|
| 81 | raise Exception, stderr |
---|
| 82 | except Exception, e: |
---|
| 83 | if os.path.exists( tmp_index_dir ): |
---|
| 84 | shutil.rmtree( tmp_index_dir ) |
---|
| 85 | stop_err( 'Error indexing reference sequence\n' + str( e ) ) |
---|
| 86 | else: |
---|
| 87 | index_path = options.index_path |
---|
| 88 | |
---|
| 89 | # Build tophat command. |
---|
| 90 | tmp_output_dir = tempfile.mkdtemp() |
---|
| 91 | cmd = 'tophat -o %s %s %s %s' |
---|
| 92 | reads = options.input1 |
---|
| 93 | if options.input2 != 'None': |
---|
| 94 | reads += ' ' + options.input2 |
---|
| 95 | opts = '-p %s' % options.num_threads |
---|
| 96 | if options.single_paired == 'paired': |
---|
| 97 | opts += ' -r %s' % options.mate_inner_dist |
---|
| 98 | if options.settings == 'preSet': |
---|
| 99 | cmd = cmd % ( tmp_output_dir, opts, index_path, reads ) |
---|
| 100 | else: |
---|
| 101 | try: |
---|
| 102 | if int( options.min_anchor_length ) >= 3: |
---|
| 103 | opts += ' -a %s' % options.min_anchor_length |
---|
| 104 | else: |
---|
| 105 | raise Exception, 'Minimum anchor length must be 3 or greater' |
---|
| 106 | opts += ' -m %s' % options.splice_mismatches |
---|
| 107 | opts += ' -i %s' % options.min_intron_length |
---|
| 108 | opts += ' -I %s' % options.max_intron_length |
---|
| 109 | if float( options.junction_filter ) != 0.0: |
---|
| 110 | opts += ' -F %s' % options.junction_filter |
---|
| 111 | opts += ' -g %s' % options.max_multihits |
---|
| 112 | if options.coverage_search: |
---|
| 113 | opts += ' --coverage-search --min-coverage-intron %s --max-coverage-intron %s' % ( options.min_coverage_intron, options.max_coverage_intron ) |
---|
| 114 | else: |
---|
| 115 | opts += ' --no-coverage-search' |
---|
| 116 | if options.closure_search: |
---|
| 117 | opts += ' --closure-search --min-closure-exon %s --min-closure-intron %s --max-closure-intron %s' % ( options.min_closure_exon, options.min_closure_intron, options.max_closure_intron ) |
---|
| 118 | else: |
---|
| 119 | opts += ' --no-closure-search' |
---|
| 120 | if options.microexon_search: |
---|
| 121 | opts += ' --microexon-search' |
---|
| 122 | if options.single_paired == 'paired': |
---|
| 123 | opts += ' --mate-std-dev %s' % options.mate_std_dev |
---|
| 124 | cmd = cmd % ( tmp_output_dir, opts, index_path, reads ) |
---|
| 125 | except Exception, e: |
---|
| 126 | # Clean up temp dirs |
---|
| 127 | if os.path.exists( tmp_index_dir ): |
---|
| 128 | shutil.rmtree( tmp_index_dir ) |
---|
| 129 | if os.path.exists( tmp_output_dir ): |
---|
| 130 | shutil.rmtree( tmp_output_dir ) |
---|
| 131 | stop_err( 'Something is wrong with the alignment parameters and the alignment could not be run\n' + str( e ) ) |
---|
| 132 | print cmd |
---|
| 133 | |
---|
| 134 | # Run |
---|
| 135 | try: |
---|
| 136 | tmp_out = tempfile.NamedTemporaryFile( dir=tmp_output_dir ).name |
---|
| 137 | tmp_stdout = open( tmp_out, 'wb' ) |
---|
| 138 | tmp_err = tempfile.NamedTemporaryFile( dir=tmp_output_dir ).name |
---|
| 139 | tmp_stderr = open( tmp_err, 'wb' ) |
---|
| 140 | proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_output_dir, stdout=tmp_stdout, stderr=tmp_stderr ) |
---|
| 141 | returncode = proc.wait() |
---|
| 142 | tmp_stderr.close() |
---|
| 143 | # get stderr, allowing for case where it's very large |
---|
| 144 | tmp_stderr = open( tmp_err, 'rb' ) |
---|
| 145 | stderr = '' |
---|
| 146 | buffsize = 1048576 |
---|
| 147 | try: |
---|
| 148 | while True: |
---|
| 149 | stderr += tmp_stderr.read( buffsize ) |
---|
| 150 | if not stderr or len( stderr ) % buffsize != 0: |
---|
| 151 | break |
---|
| 152 | except OverflowError: |
---|
| 153 | pass |
---|
| 154 | tmp_stdout.close() |
---|
| 155 | tmp_stderr.close() |
---|
| 156 | if returncode != 0: |
---|
| 157 | raise Exception, stderr |
---|
| 158 | except Exception, e: |
---|
| 159 | # Clean up temp dirs |
---|
| 160 | if os.path.exists( tmp_output_dir ): |
---|
| 161 | shutil.rmtree( tmp_output_dir ) |
---|
| 162 | stop_err( 'Error in tophat:\n' + str( e ) ) |
---|
| 163 | |
---|
| 164 | # TODO: look for errors in program output. |
---|
| 165 | |
---|
| 166 | # Postprocessing: copy output files from tmp directory to specified files. Also need to remove header lines from SAM file. |
---|
| 167 | try: |
---|
| 168 | try: |
---|
| 169 | shutil.copyfile( tmp_output_dir + "/coverage.wig", options.coverage_output_file ) |
---|
| 170 | shutil.copyfile( tmp_output_dir + "/junctions.bed", options.junctions_output_file ) |
---|
| 171 | |
---|
| 172 | # Remove headers from SAM file in place. |
---|
| 173 | in_header = True # Headers always at start of file. |
---|
| 174 | for line in fileinput.input( tmp_output_dir + "/accepted_hits.sam", inplace=1 ): |
---|
| 175 | if in_header and line.startswith("@"): |
---|
| 176 | continue |
---|
| 177 | else: |
---|
| 178 | in_header = False |
---|
| 179 | sys.stdout.write( line ) |
---|
| 180 | |
---|
| 181 | # Copy SAM File. |
---|
| 182 | shutil.copyfile( tmp_output_dir + "/accepted_hits.sam", options.accepted_hits_output_file ) |
---|
| 183 | except Exception, e: |
---|
| 184 | stop_err( 'Error in tophat:\n' + str( e ) ) |
---|
| 185 | finally: |
---|
| 186 | # Clean up temp dirs |
---|
| 187 | if os.path.exists( tmp_index_dir ): |
---|
| 188 | shutil.rmtree( tmp_index_dir ) |
---|
| 189 | if os.path.exists( tmp_output_dir ): |
---|
| 190 | shutil.rmtree( tmp_output_dir ) |
---|
| 191 | |
---|
| 192 | if __name__=="__main__": __main__() |
---|