[2] | 1 | #!/usr/bin/env python |
---|
| 2 | |
---|
| 3 | """ |
---|
| 4 | Runs BFAST on single-end or paired-end data. |
---|
| 5 | TODO: more documentation |
---|
| 6 | |
---|
| 7 | TODO: |
---|
| 8 | - auto-detect gzip or bz2 |
---|
| 9 | - split options (?) |
---|
| 10 | - queue lengths (?) |
---|
| 11 | - assumes reference always has been indexed |
---|
| 12 | - main and secondary indexes |
---|
| 13 | - scoring matrix file ? |
---|
| 14 | - read group file ? |
---|
| 15 | |
---|
| 16 | usage: bfast_wrapper.py [options] |
---|
| 17 | -r, --ref=r: The reference genome to use or index |
---|
| 18 | -f, --fastq=f: The fastq file to use for the mapping |
---|
| 19 | -F, --output=u: The file to save the output (SAM format) |
---|
| 20 | -s, --fileSource=s: Whether to use a previously indexed reference sequence or one from history (indexed or history) |
---|
| 21 | -p, --params=p: Parameter setting to use (pre_set or full) |
---|
| 22 | -n, --numThreads=n: The number of threads to use |
---|
| 23 | -A, --space=A: The encoding space (0: base 1: color) |
---|
| 24 | -o, --offsets=o: The offsets for 'match' |
---|
| 25 | -l, --loadAllIndexes=l: Load all indexes into memory |
---|
| 26 | -k, --keySize=k: truncate key size in 'match' |
---|
| 27 | -K, --maxKeyMatches=K: the maximum number of matches to allow before a key is ignored |
---|
| 28 | -M, --maxNumMatches=M: the maximum number of matches to allow before the read is discarded |
---|
| 29 | -w, --whichStrand=w: the strands to consider (0: both 1: forward 2: reverse) |
---|
| 30 | -t, --timing=t: output timing information to stderr |
---|
| 31 | -u, --ungapped=u: performed ungapped local alignment |
---|
| 32 | -U, --unconstrained=U: performed local alignment without mask constraints |
---|
| 33 | -O, --offset=O: the number of bases before and after each hit to consider in local alignment |
---|
| 34 | -q, --avgMismatchQuality=q: average mismatch quality |
---|
| 35 | -a, --algorithm=a: post processing algorithm (0: no filtering, 1: all passing filters, 2: unique, 3: best scoring unique, 4: best score all) |
---|
| 36 | -P, --disallowPairing=P: do not choose alignments based on pairing |
---|
| 37 | -R, --reverse=R: paired end reads are given on reverse strands |
---|
| 38 | -z, --random=z: output a random best scoring alignment |
---|
| 39 | -D, --dbkey=D: Dbkey for reference genome |
---|
| 40 | """ |
---|
| 41 | |
---|
| 42 | import optparse, os, shutil, subprocess, sys, tempfile |
---|
| 43 | |
---|
| 44 | def stop_err( msg ): |
---|
| 45 | sys.stderr.write( '%s\n' % msg ) |
---|
| 46 | sys.exit() |
---|
| 47 | |
---|
| 48 | def __main__(): |
---|
| 49 | #Parse Command Line |
---|
| 50 | parser = optparse.OptionParser() |
---|
| 51 | parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to index and use' ) |
---|
| 52 | parser.add_option( '-f', '--fastq', dest='fastq', help='The fastq file to use for the mapping' ) |
---|
| 53 | parser.add_option( '-F', '--output', dest='output', help='The file to save the output (SAM format)' ) |
---|
| 54 | parser.add_option( '-A', '--space', dest='space', type="choice", default='0', choices=('0','1' ), help='The encoding space (0: base 1: color)' ) |
---|
| 55 | parser.add_option( '-H', '--suppressHeader', action="store_true", dest='suppressHeader', default=False, help='Suppress header' ) |
---|
| 56 | parser.add_option( '-n', '--numThreads', dest='numThreads', type="int", default="1", help='The number of threads to use' ) |
---|
| 57 | parser.add_option( '-t', '--timing', action="store_true", default=False, dest='timing', help='output timming information to stderr' ) |
---|
| 58 | parser.add_option( '-l', '--loadAllIndexes', action="store_true", default=False, dest='loadAllIndexes', help='Load all indexes into memory' ) |
---|
| 59 | parser.add_option( '-m', '--indexMask', dest='indexMask', help='String containing info on how to build custom indexes' ) |
---|
| 60 | parser.add_option( "-b", "--buildIndex", action="store_true", dest="buildIndex", default=False, help='String containing info on how to build custom indexes' ) |
---|
| 61 | parser.add_option( "--indexRepeatMasker", action="store_true", dest="indexRepeatMasker", default=False, help='Do not index lower case sequences. Such as those created by RepeatMasker' ) |
---|
| 62 | parser.add_option( '--indexContigOptions', dest='indexContigOptions', default="", help='The contig range options to use for the indexing' ) |
---|
| 63 | parser.add_option( '--indexExonsFileName', dest='indexExonsFileName', default="", help='The exons file to use for the indexing' ) |
---|
| 64 | |
---|
| 65 | parser.add_option( '-o', '--offsets', dest='offsets', default="", help='The offsets for \'match\'' ) |
---|
| 66 | parser.add_option( '-k', '--keySize', dest='keySize', type="int", default="-1", help='truncate key size in \'match\'' ) |
---|
| 67 | parser.add_option( '-K', '--maxKeyMatches', dest='maxKeyMatches', type="int", default="-1", help='the maximum number of matches to allow before a key is ignored' ) |
---|
| 68 | parser.add_option( '-M', '--maxNumMatches', dest='maxNumMatches', type="int", default="-1", help='the maximum number of matches to allow bfore the read is discarded' ) |
---|
| 69 | parser.add_option( '-w', '--whichStrand', dest='whichStrand', type="choice", default='0', choices=('0','1','2'), help='the strands to consider (0: both 1: forward 2: reverse)' ) |
---|
| 70 | |
---|
| 71 | parser.add_option( '--scoringMatrixFileName', dest='scoringMatrixFileName', help='Scoring Matrix file used to score the alignments' ) |
---|
| 72 | parser.add_option( '-u', '--ungapped', dest='ungapped', action="store_true", default=False, help='performed ungapped local alignment' ) |
---|
| 73 | parser.add_option( '-U', '--unconstrained', dest='unconstrained', action="store_true", default=False, help='performed local alignment without mask constraints' ) |
---|
| 74 | parser.add_option( '-O', '--offset', dest='offset', type="int", default="0", help='the number of bases before and after each hit to consider in local alignment' ) |
---|
| 75 | parser.add_option( '-q', '--avgMismatchQuality', type="int", default="-1", dest='avgMismatchQuality', help='average mismatch quality' ) |
---|
| 76 | |
---|
| 77 | parser.add_option( '-a', '--algorithm', dest='algorithm', default='0', type="choice", choices=('0','1','2','3','4' ), help='post processing algorithm (0: no filtering, 1: all passing filters, 2: unique, 3: best scoring unique, 4: best score all' ) |
---|
| 78 | parser.add_option( '--unpaired', dest='unpaired', action="store_true", default=False, help='do not choose alignments based on pairing' ) |
---|
| 79 | parser.add_option( '--reverseStrand', dest='reverseStrand', action="store_true", default=False, help='paired end reads are given on reverse strands' ) |
---|
| 80 | parser.add_option( '--pairedEndInfer', dest='pairedEndInfer', action="store_true", default=False, help='break ties when one end of a paired end read by estimating the insert size distribution' ) |
---|
| 81 | parser.add_option( '--randomBest', dest='randomBest', action="store_true", default=False, help='output a random best scoring alignment' ) |
---|
| 82 | |
---|
| 83 | (options, args) = parser.parse_args() |
---|
| 84 | |
---|
| 85 | buffsize = 1048576 |
---|
| 86 | |
---|
| 87 | # make temp directory for bfast, requires trailing slash |
---|
| 88 | tmp_dir = '%s/' % tempfile.mkdtemp() |
---|
| 89 | |
---|
| 90 | #'generic' options used in all bfast commands here |
---|
| 91 | if options.timing: |
---|
| 92 | all_cmd_options = "-t" |
---|
| 93 | else: |
---|
| 94 | all_cmd_options = "" |
---|
| 95 | |
---|
| 96 | try: |
---|
| 97 | if options.buildIndex: |
---|
| 98 | reference_filepath = tempfile.NamedTemporaryFile( dir=tmp_dir, suffix='.fa' ).name |
---|
| 99 | #build bfast indexes |
---|
| 100 | os.symlink( options.ref, reference_filepath ) |
---|
| 101 | |
---|
| 102 | #bfast fast2brg |
---|
| 103 | try: |
---|
| 104 | nuc_space = [ "0" ] |
---|
| 105 | if options.space == "1": |
---|
| 106 | #color space localalign appears to require nuc space brg |
---|
| 107 | nuc_space.append( "1" ) |
---|
| 108 | for space in nuc_space: |
---|
| 109 | cmd = 'bfast fasta2brg -f "%s" -A "%s" %s' % ( reference_filepath, space, all_cmd_options ) |
---|
| 110 | tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name |
---|
| 111 | tmp_stderr = open( tmp, 'wb' ) |
---|
| 112 | proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) |
---|
| 113 | returncode = proc.wait() |
---|
| 114 | tmp_stderr.close() |
---|
| 115 | # get stderr, allowing for case where it's very large |
---|
| 116 | tmp_stderr = open( tmp, 'rb' ) |
---|
| 117 | stderr = '' |
---|
| 118 | try: |
---|
| 119 | while True: |
---|
| 120 | stderr += tmp_stderr.read( buffsize ) |
---|
| 121 | if not stderr or len( stderr ) % buffsize != 0: |
---|
| 122 | break |
---|
| 123 | except OverflowError: |
---|
| 124 | pass |
---|
| 125 | tmp_stderr.close() |
---|
| 126 | if returncode != 0: |
---|
| 127 | raise Exception, stderr |
---|
| 128 | except Exception, e: |
---|
| 129 | raise Exception, 'Error in \'bfast fasta2brg\'.\n' + str( e ) |
---|
| 130 | |
---|
| 131 | #bfast index |
---|
| 132 | try: |
---|
| 133 | all_index_cmds = 'bfast index %s -f "%s" -A "%s" -n "%s"' % ( all_cmd_options, reference_filepath, options.space, options.numThreads ) |
---|
| 134 | |
---|
| 135 | if options.indexRepeatMasker: |
---|
| 136 | all_index_cmds += " -R" |
---|
| 137 | |
---|
| 138 | if options.indexContigOptions: |
---|
| 139 | index_contig_options = map( int, options.indexContigOptions.split( ',' ) ) |
---|
| 140 | if index_contig_options[0] >= 0: |
---|
| 141 | all_index_cmds += ' -s "%s"' % index_contig_options[0] |
---|
| 142 | if index_contig_options[1] >= 0: |
---|
| 143 | all_index_cmds += ' -S "%s"' % index_contig_options[1] |
---|
| 144 | if index_contig_options[2] >= 0: |
---|
| 145 | all_index_cmds += ' -e "%s"' % index_contig_options[2] |
---|
| 146 | if index_contig_options[3] >= 0: |
---|
| 147 | all_index_cmds += ' -E "%s"' % index_contig_options[3] |
---|
| 148 | elif options.indexExonsFileName: |
---|
| 149 | all_index_cmds += ' -x "%s"' % options.indexExonsFileName |
---|
| 150 | |
---|
| 151 | index_count = 1 |
---|
| 152 | for mask, hash_width in [ mask.split( ':' ) for mask in options.indexMask.split( ',' ) ]: |
---|
| 153 | cmd = '%s -m "%s" -w "%s" -i "%i"' % ( all_index_cmds, mask, hash_width, index_count ) |
---|
| 154 | tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name |
---|
| 155 | tmp_stderr = open( tmp, 'wb' ) |
---|
| 156 | proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) |
---|
| 157 | returncode = proc.wait() |
---|
| 158 | tmp_stderr.close() |
---|
| 159 | # get stderr, allowing for case where it's very large |
---|
| 160 | tmp_stderr = open( tmp, 'rb' ) |
---|
| 161 | stderr = '' |
---|
| 162 | try: |
---|
| 163 | while True: |
---|
| 164 | stderr += tmp_stderr.read( buffsize ) |
---|
| 165 | if not stderr or len( stderr ) % buffsize != 0: |
---|
| 166 | break |
---|
| 167 | except OverflowError: |
---|
| 168 | pass |
---|
| 169 | tmp_stderr.close() |
---|
| 170 | if returncode != 0: |
---|
| 171 | raise Exception, stderr |
---|
| 172 | index_count += 1 |
---|
| 173 | except Exception, e: |
---|
| 174 | raise Exception, 'Error in \'bfast index\'.\n' + str( e ) |
---|
| 175 | |
---|
| 176 | else: |
---|
| 177 | reference_filepath = options.ref |
---|
| 178 | assert reference_filepath and os.path.exists( reference_filepath ), 'A valid genome reference was not provided.' |
---|
| 179 | |
---|
| 180 | # set up aligning and generate aligning command options |
---|
| 181 | # set up temp output files |
---|
| 182 | tmp_bmf = tempfile.NamedTemporaryFile( dir=tmp_dir ) |
---|
| 183 | tmp_bmf_name = tmp_bmf.name |
---|
| 184 | tmp_bmf.close() |
---|
| 185 | tmp_baf = tempfile.NamedTemporaryFile( dir=tmp_dir ) |
---|
| 186 | tmp_baf_name = tmp_baf.name |
---|
| 187 | tmp_baf.close() |
---|
| 188 | |
---|
| 189 | bfast_match_cmd = 'bfast match -f "%s" -r "%s" -n "%s" -A "%s" -T "%s" -w "%s" %s' % ( reference_filepath, options.fastq, options.numThreads, options.space, tmp_dir, options.whichStrand, all_cmd_options ) |
---|
| 190 | bfast_localalign_cmd = 'bfast localalign -f "%s" -m "%s" -n "%s" -A "%s" -o "%s" %s' % ( reference_filepath, tmp_bmf_name, options.numThreads, options.space, options.offset, all_cmd_options ) |
---|
| 191 | bfast_postprocess_cmd = 'bfast postprocess -O 1 -f "%s" -i "%s" -n "%s" -A "%s" -a "%s" %s' % ( reference_filepath, tmp_baf_name, options.numThreads, options.space, options.algorithm, all_cmd_options ) |
---|
| 192 | |
---|
| 193 | if options.offsets: |
---|
| 194 | bfast_match_cmd += ' -o "%s"' % options.offsets |
---|
| 195 | if options.keySize >= 0: |
---|
| 196 | bfast_match_cmd += ' -k "%s"' % options.keySize |
---|
| 197 | if options.maxKeyMatches >= 0: |
---|
| 198 | bfast_match_cmd += ' -K "%s"' % options.maxKeyMatches |
---|
| 199 | if options.maxNumMatches >= 0: |
---|
| 200 | bfast_match_cmd += ' -M "%s"' % options.maxNumMatches |
---|
| 201 | bfast_localalign_cmd += ' -M "%s"' % options.maxNumMatches |
---|
| 202 | if options.scoringMatrixFileName: |
---|
| 203 | bfast_localalign_cmd += ' -x "%s"' % options.scoringMatrixFileName |
---|
| 204 | bfast_postprocess_cmd += ' -x "%s"' % options.scoringMatrixFileName |
---|
| 205 | if options.ungapped: |
---|
| 206 | bfast_localalign_cmd += ' -u' |
---|
| 207 | if options.unconstrained: |
---|
| 208 | bfast_localalign_cmd += ' -U' |
---|
| 209 | if options.avgMismatchQuality >= 0: |
---|
| 210 | bfast_localalign_cmd += ' -q "%s"' % options.avgMismatchQuality |
---|
| 211 | bfast_postprocess_cmd += ' -q "%s"' % options.avgMismatchQuality |
---|
| 212 | if options.algorithm == 3: |
---|
| 213 | if options.pairedEndInfer: |
---|
| 214 | bfast_postprocess_cmd += ' -P' |
---|
| 215 | if options.randomBest: |
---|
| 216 | bfast_postprocess_cmd += ' -z' |
---|
| 217 | if options.unpaired: |
---|
| 218 | bfast_postprocess_cmd += ' -U' |
---|
| 219 | if options.reverseStrand: |
---|
| 220 | bfast_postprocess_cmd += ' -R' |
---|
| 221 | |
---|
| 222 | #instead of using temp files, should we stream through pipes? |
---|
| 223 | bfast_match_cmd += " > %s" % tmp_bmf_name |
---|
| 224 | bfast_localalign_cmd += " > %s" % tmp_baf_name |
---|
| 225 | bfast_postprocess_cmd += " > %s" % options.output |
---|
| 226 | |
---|
| 227 | # need to nest try-except in try-finally to handle 2.4 |
---|
| 228 | try: |
---|
| 229 | # bfast 'match' |
---|
| 230 | try: |
---|
| 231 | tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name |
---|
| 232 | tmp_stderr = open( tmp, 'wb' ) |
---|
| 233 | proc = subprocess.Popen( args=bfast_match_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) |
---|
| 234 | returncode = proc.wait() |
---|
| 235 | tmp_stderr.close() |
---|
| 236 | # get stderr, allowing for case where it's very large |
---|
| 237 | tmp_stderr = open( tmp, 'rb' ) |
---|
| 238 | stderr = '' |
---|
| 239 | try: |
---|
| 240 | while True: |
---|
| 241 | stderr += tmp_stderr.read( buffsize ) |
---|
| 242 | if not stderr or len( stderr ) % buffsize != 0: |
---|
| 243 | break |
---|
| 244 | except OverflowError: |
---|
| 245 | pass |
---|
| 246 | tmp_stderr.close() |
---|
| 247 | if returncode != 0: |
---|
| 248 | raise Exception, stderr |
---|
| 249 | except Exception, e: |
---|
| 250 | raise Exception, 'Error in \'bfast match\'. \n' + str( e ) |
---|
| 251 | # bfast 'localalign' |
---|
| 252 | try: |
---|
| 253 | tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name |
---|
| 254 | tmp_stderr = open( tmp, 'wb' ) |
---|
| 255 | proc = subprocess.Popen( args=bfast_localalign_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) |
---|
| 256 | returncode = proc.wait() |
---|
| 257 | tmp_stderr.close() |
---|
| 258 | # get stderr, allowing for case where it's very large |
---|
| 259 | tmp_stderr = open( tmp, 'rb' ) |
---|
| 260 | stderr = '' |
---|
| 261 | try: |
---|
| 262 | while True: |
---|
| 263 | stderr += tmp_stderr.read( buffsize ) |
---|
| 264 | if not stderr or len( stderr ) % buffsize != 0: |
---|
| 265 | break |
---|
| 266 | except OverflowError: |
---|
| 267 | pass |
---|
| 268 | tmp_stderr.close() |
---|
| 269 | if returncode != 0: |
---|
| 270 | raise Exception, stderr |
---|
| 271 | except Exception, e: |
---|
| 272 | raise Exception, 'Error in \'bfast localalign\'. \n' + str( e ) |
---|
| 273 | # bfast 'postprocess' |
---|
| 274 | try: |
---|
| 275 | tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name |
---|
| 276 | tmp_stderr = open( tmp, 'wb' ) |
---|
| 277 | proc = subprocess.Popen( args=bfast_postprocess_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) |
---|
| 278 | returncode = proc.wait() |
---|
| 279 | tmp_stderr.close() |
---|
| 280 | # get stderr, allowing for case where it's very large |
---|
| 281 | tmp_stderr = open( tmp, 'rb' ) |
---|
| 282 | stderr = '' |
---|
| 283 | try: |
---|
| 284 | while True: |
---|
| 285 | stderr += tmp_stderr.read( buffsize ) |
---|
| 286 | if not stderr or len( stderr ) % buffsize != 0: |
---|
| 287 | break |
---|
| 288 | except OverflowError: |
---|
| 289 | pass |
---|
| 290 | tmp_stderr.close() |
---|
| 291 | if returncode != 0: |
---|
| 292 | raise Exception, stderr |
---|
| 293 | except Exception, e: |
---|
| 294 | raise Exception, 'Error in \'bfast postprocess\'. \n' + str( e ) |
---|
| 295 | # remove header if necessary |
---|
| 296 | if options.suppressHeader: |
---|
| 297 | tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir) |
---|
| 298 | tmp_out_name = tmp_out.name |
---|
| 299 | tmp_out.close() |
---|
| 300 | try: |
---|
| 301 | shutil.move( options.output, tmp_out_name ) |
---|
| 302 | except Exception, e: |
---|
| 303 | raise Exception, 'Error moving output file before removing headers. \n' + str( e ) |
---|
| 304 | fout = file( options.output, 'w' ) |
---|
| 305 | for line in file( tmp_out.name, 'r' ): |
---|
| 306 | if len( line ) < 3 or line[0:3] not in [ '@HD', '@SQ', '@RG', '@PG', '@CO' ]: |
---|
| 307 | fout.write( line ) |
---|
| 308 | fout.close() |
---|
| 309 | # check that there are results in the output file |
---|
| 310 | if os.path.getsize( options.output ) > 0: |
---|
| 311 | if "0" == options.space: |
---|
| 312 | sys.stdout.write( 'BFAST run on Base Space data' ) |
---|
| 313 | else: |
---|
| 314 | sys.stdout.write( 'BFAST run on Color Space data' ) |
---|
| 315 | else: |
---|
| 316 | raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.' |
---|
| 317 | except Exception, e: |
---|
| 318 | stop_err( 'The alignment failed.\n' + str( e ) ) |
---|
| 319 | finally: |
---|
| 320 | # clean up temp dir |
---|
| 321 | if os.path.exists( tmp_dir ): |
---|
| 322 | shutil.rmtree( tmp_dir ) |
---|
| 323 | |
---|
| 324 | if __name__=="__main__": __main__() |
---|