1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | """ |
---|
4 | Runs BFAST on single-end or paired-end data. |
---|
5 | TODO: more documentation |
---|
6 | |
---|
7 | TODO: |
---|
8 | - auto-detect gzip or bz2 |
---|
9 | - split options (?) |
---|
10 | - queue lengths (?) |
---|
11 | - assumes reference always has been indexed |
---|
12 | - main and secondary indexes |
---|
13 | - scoring matrix file ? |
---|
14 | - read group file ? |
---|
15 | |
---|
16 | usage: bfast_wrapper.py [options] |
---|
17 | -r, --ref=r: The reference genome to use or index |
---|
18 | -f, --fastq=f: The fastq file to use for the mapping |
---|
19 | -F, --output=u: The file to save the output (SAM format) |
---|
20 | -s, --fileSource=s: Whether to use a previously indexed reference sequence or one from history (indexed or history) |
---|
21 | -p, --params=p: Parameter setting to use (pre_set or full) |
---|
22 | -n, --numThreads=n: The number of threads to use |
---|
23 | -A, --space=A: The encoding space (0: base 1: color) |
---|
24 | -o, --offsets=o: The offsets for 'match' |
---|
25 | -l, --loadAllIndexes=l: Load all indexes into memory |
---|
26 | -k, --keySize=k: truncate key size in 'match' |
---|
27 | -K, --maxKeyMatches=K: the maximum number of matches to allow before a key is ignored |
---|
28 | -M, --maxNumMatches=M: the maximum number of matches to allow before the read is discarded |
---|
29 | -w, --whichStrand=w: the strands to consider (0: both 1: forward 2: reverse) |
---|
30 | -t, --timing=t: output timing information to stderr |
---|
31 | -u, --ungapped=u: performed ungapped local alignment |
---|
32 | -U, --unconstrained=U: performed local alignment without mask constraints |
---|
33 | -O, --offset=O: the number of bases before and after each hit to consider in local alignment |
---|
34 | -q, --avgMismatchQuality=q: average mismatch quality |
---|
35 | -a, --algorithm=a: post processing algorithm (0: no filtering, 1: all passing filters, 2: unique, 3: best scoring unique, 4: best score all) |
---|
36 | -P, --disallowPairing=P: do not choose alignments based on pairing |
---|
37 | -R, --reverse=R: paired end reads are given on reverse strands |
---|
38 | -z, --random=z: output a random best scoring alignment |
---|
39 | -D, --dbkey=D: Dbkey for reference genome |
---|
40 | """ |
---|
41 | |
---|
42 | import optparse, os, shutil, subprocess, sys, tempfile |
---|
43 | |
---|
44 | def stop_err( msg ): |
---|
45 | sys.stderr.write( '%s\n' % msg ) |
---|
46 | sys.exit() |
---|
47 | |
---|
48 | def __main__(): |
---|
49 | #Parse Command Line |
---|
50 | parser = optparse.OptionParser() |
---|
51 | parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to index and use' ) |
---|
52 | parser.add_option( '-f', '--fastq', dest='fastq', help='The fastq file to use for the mapping' ) |
---|
53 | parser.add_option( '-F', '--output', dest='output', help='The file to save the output (SAM format)' ) |
---|
54 | parser.add_option( '-A', '--space', dest='space', type="choice", default='0', choices=('0','1' ), help='The encoding space (0: base 1: color)' ) |
---|
55 | parser.add_option( '-H', '--suppressHeader', action="store_true", dest='suppressHeader', default=False, help='Suppress header' ) |
---|
56 | parser.add_option( '-n', '--numThreads', dest='numThreads', type="int", default="1", help='The number of threads to use' ) |
---|
57 | parser.add_option( '-t', '--timing', action="store_true", default=False, dest='timing', help='output timming information to stderr' ) |
---|
58 | parser.add_option( '-l', '--loadAllIndexes', action="store_true", default=False, dest='loadAllIndexes', help='Load all indexes into memory' ) |
---|
59 | parser.add_option( '-m', '--indexMask', dest='indexMask', help='String containing info on how to build custom indexes' ) |
---|
60 | parser.add_option( "-b", "--buildIndex", action="store_true", dest="buildIndex", default=False, help='String containing info on how to build custom indexes' ) |
---|
61 | parser.add_option( "--indexRepeatMasker", action="store_true", dest="indexRepeatMasker", default=False, help='Do not index lower case sequences. Such as those created by RepeatMasker' ) |
---|
62 | parser.add_option( '--indexContigOptions', dest='indexContigOptions', default="", help='The contig range options to use for the indexing' ) |
---|
63 | parser.add_option( '--indexExonsFileName', dest='indexExonsFileName', default="", help='The exons file to use for the indexing' ) |
---|
64 | |
---|
65 | parser.add_option( '-o', '--offsets', dest='offsets', default="", help='The offsets for \'match\'' ) |
---|
66 | parser.add_option( '-k', '--keySize', dest='keySize', type="int", default="-1", help='truncate key size in \'match\'' ) |
---|
67 | parser.add_option( '-K', '--maxKeyMatches', dest='maxKeyMatches', type="int", default="-1", help='the maximum number of matches to allow before a key is ignored' ) |
---|
68 | parser.add_option( '-M', '--maxNumMatches', dest='maxNumMatches', type="int", default="-1", help='the maximum number of matches to allow bfore the read is discarded' ) |
---|
69 | parser.add_option( '-w', '--whichStrand', dest='whichStrand', type="choice", default='0', choices=('0','1','2'), help='the strands to consider (0: both 1: forward 2: reverse)' ) |
---|
70 | |
---|
71 | parser.add_option( '--scoringMatrixFileName', dest='scoringMatrixFileName', help='Scoring Matrix file used to score the alignments' ) |
---|
72 | parser.add_option( '-u', '--ungapped', dest='ungapped', action="store_true", default=False, help='performed ungapped local alignment' ) |
---|
73 | parser.add_option( '-U', '--unconstrained', dest='unconstrained', action="store_true", default=False, help='performed local alignment without mask constraints' ) |
---|
74 | parser.add_option( '-O', '--offset', dest='offset', type="int", default="0", help='the number of bases before and after each hit to consider in local alignment' ) |
---|
75 | parser.add_option( '-q', '--avgMismatchQuality', type="int", default="-1", dest='avgMismatchQuality', help='average mismatch quality' ) |
---|
76 | |
---|
77 | parser.add_option( '-a', '--algorithm', dest='algorithm', default='0', type="choice", choices=('0','1','2','3','4' ), help='post processing algorithm (0: no filtering, 1: all passing filters, 2: unique, 3: best scoring unique, 4: best score all' ) |
---|
78 | parser.add_option( '--unpaired', dest='unpaired', action="store_true", default=False, help='do not choose alignments based on pairing' ) |
---|
79 | parser.add_option( '--reverseStrand', dest='reverseStrand', action="store_true", default=False, help='paired end reads are given on reverse strands' ) |
---|
80 | parser.add_option( '--pairedEndInfer', dest='pairedEndInfer', action="store_true", default=False, help='break ties when one end of a paired end read by estimating the insert size distribution' ) |
---|
81 | parser.add_option( '--randomBest', dest='randomBest', action="store_true", default=False, help='output a random best scoring alignment' ) |
---|
82 | |
---|
83 | (options, args) = parser.parse_args() |
---|
84 | |
---|
85 | buffsize = 1048576 |
---|
86 | |
---|
87 | # make temp directory for bfast, requires trailing slash |
---|
88 | tmp_dir = '%s/' % tempfile.mkdtemp() |
---|
89 | |
---|
90 | #'generic' options used in all bfast commands here |
---|
91 | if options.timing: |
---|
92 | all_cmd_options = "-t" |
---|
93 | else: |
---|
94 | all_cmd_options = "" |
---|
95 | |
---|
96 | try: |
---|
97 | if options.buildIndex: |
---|
98 | reference_filepath = tempfile.NamedTemporaryFile( dir=tmp_dir, suffix='.fa' ).name |
---|
99 | #build bfast indexes |
---|
100 | os.symlink( options.ref, reference_filepath ) |
---|
101 | |
---|
102 | #bfast fast2brg |
---|
103 | try: |
---|
104 | nuc_space = [ "0" ] |
---|
105 | if options.space == "1": |
---|
106 | #color space localalign appears to require nuc space brg |
---|
107 | nuc_space.append( "1" ) |
---|
108 | for space in nuc_space: |
---|
109 | cmd = 'bfast fasta2brg -f "%s" -A "%s" %s' % ( reference_filepath, space, all_cmd_options ) |
---|
110 | tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name |
---|
111 | tmp_stderr = open( tmp, 'wb' ) |
---|
112 | proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) |
---|
113 | returncode = proc.wait() |
---|
114 | tmp_stderr.close() |
---|
115 | # get stderr, allowing for case where it's very large |
---|
116 | tmp_stderr = open( tmp, 'rb' ) |
---|
117 | stderr = '' |
---|
118 | try: |
---|
119 | while True: |
---|
120 | stderr += tmp_stderr.read( buffsize ) |
---|
121 | if not stderr or len( stderr ) % buffsize != 0: |
---|
122 | break |
---|
123 | except OverflowError: |
---|
124 | pass |
---|
125 | tmp_stderr.close() |
---|
126 | if returncode != 0: |
---|
127 | raise Exception, stderr |
---|
128 | except Exception, e: |
---|
129 | raise Exception, 'Error in \'bfast fasta2brg\'.\n' + str( e ) |
---|
130 | |
---|
131 | #bfast index |
---|
132 | try: |
---|
133 | all_index_cmds = 'bfast index %s -f "%s" -A "%s" -n "%s"' % ( all_cmd_options, reference_filepath, options.space, options.numThreads ) |
---|
134 | |
---|
135 | if options.indexRepeatMasker: |
---|
136 | all_index_cmds += " -R" |
---|
137 | |
---|
138 | if options.indexContigOptions: |
---|
139 | index_contig_options = map( int, options.indexContigOptions.split( ',' ) ) |
---|
140 | if index_contig_options[0] >= 0: |
---|
141 | all_index_cmds += ' -s "%s"' % index_contig_options[0] |
---|
142 | if index_contig_options[1] >= 0: |
---|
143 | all_index_cmds += ' -S "%s"' % index_contig_options[1] |
---|
144 | if index_contig_options[2] >= 0: |
---|
145 | all_index_cmds += ' -e "%s"' % index_contig_options[2] |
---|
146 | if index_contig_options[3] >= 0: |
---|
147 | all_index_cmds += ' -E "%s"' % index_contig_options[3] |
---|
148 | elif options.indexExonsFileName: |
---|
149 | all_index_cmds += ' -x "%s"' % options.indexExonsFileName |
---|
150 | |
---|
151 | index_count = 1 |
---|
152 | for mask, hash_width in [ mask.split( ':' ) for mask in options.indexMask.split( ',' ) ]: |
---|
153 | cmd = '%s -m "%s" -w "%s" -i "%i"' % ( all_index_cmds, mask, hash_width, index_count ) |
---|
154 | tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name |
---|
155 | tmp_stderr = open( tmp, 'wb' ) |
---|
156 | proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) |
---|
157 | returncode = proc.wait() |
---|
158 | tmp_stderr.close() |
---|
159 | # get stderr, allowing for case where it's very large |
---|
160 | tmp_stderr = open( tmp, 'rb' ) |
---|
161 | stderr = '' |
---|
162 | try: |
---|
163 | while True: |
---|
164 | stderr += tmp_stderr.read( buffsize ) |
---|
165 | if not stderr or len( stderr ) % buffsize != 0: |
---|
166 | break |
---|
167 | except OverflowError: |
---|
168 | pass |
---|
169 | tmp_stderr.close() |
---|
170 | if returncode != 0: |
---|
171 | raise Exception, stderr |
---|
172 | index_count += 1 |
---|
173 | except Exception, e: |
---|
174 | raise Exception, 'Error in \'bfast index\'.\n' + str( e ) |
---|
175 | |
---|
176 | else: |
---|
177 | reference_filepath = options.ref |
---|
178 | assert reference_filepath and os.path.exists( reference_filepath ), 'A valid genome reference was not provided.' |
---|
179 | |
---|
180 | # set up aligning and generate aligning command options |
---|
181 | # set up temp output files |
---|
182 | tmp_bmf = tempfile.NamedTemporaryFile( dir=tmp_dir ) |
---|
183 | tmp_bmf_name = tmp_bmf.name |
---|
184 | tmp_bmf.close() |
---|
185 | tmp_baf = tempfile.NamedTemporaryFile( dir=tmp_dir ) |
---|
186 | tmp_baf_name = tmp_baf.name |
---|
187 | tmp_baf.close() |
---|
188 | |
---|
189 | bfast_match_cmd = 'bfast match -f "%s" -r "%s" -n "%s" -A "%s" -T "%s" -w "%s" %s' % ( reference_filepath, options.fastq, options.numThreads, options.space, tmp_dir, options.whichStrand, all_cmd_options ) |
---|
190 | bfast_localalign_cmd = 'bfast localalign -f "%s" -m "%s" -n "%s" -A "%s" -o "%s" %s' % ( reference_filepath, tmp_bmf_name, options.numThreads, options.space, options.offset, all_cmd_options ) |
---|
191 | bfast_postprocess_cmd = 'bfast postprocess -O 1 -f "%s" -i "%s" -n "%s" -A "%s" -a "%s" %s' % ( reference_filepath, tmp_baf_name, options.numThreads, options.space, options.algorithm, all_cmd_options ) |
---|
192 | |
---|
193 | if options.offsets: |
---|
194 | bfast_match_cmd += ' -o "%s"' % options.offsets |
---|
195 | if options.keySize >= 0: |
---|
196 | bfast_match_cmd += ' -k "%s"' % options.keySize |
---|
197 | if options.maxKeyMatches >= 0: |
---|
198 | bfast_match_cmd += ' -K "%s"' % options.maxKeyMatches |
---|
199 | if options.maxNumMatches >= 0: |
---|
200 | bfast_match_cmd += ' -M "%s"' % options.maxNumMatches |
---|
201 | bfast_localalign_cmd += ' -M "%s"' % options.maxNumMatches |
---|
202 | if options.scoringMatrixFileName: |
---|
203 | bfast_localalign_cmd += ' -x "%s"' % options.scoringMatrixFileName |
---|
204 | bfast_postprocess_cmd += ' -x "%s"' % options.scoringMatrixFileName |
---|
205 | if options.ungapped: |
---|
206 | bfast_localalign_cmd += ' -u' |
---|
207 | if options.unconstrained: |
---|
208 | bfast_localalign_cmd += ' -U' |
---|
209 | if options.avgMismatchQuality >= 0: |
---|
210 | bfast_localalign_cmd += ' -q "%s"' % options.avgMismatchQuality |
---|
211 | bfast_postprocess_cmd += ' -q "%s"' % options.avgMismatchQuality |
---|
212 | if options.algorithm == 3: |
---|
213 | if options.pairedEndInfer: |
---|
214 | bfast_postprocess_cmd += ' -P' |
---|
215 | if options.randomBest: |
---|
216 | bfast_postprocess_cmd += ' -z' |
---|
217 | if options.unpaired: |
---|
218 | bfast_postprocess_cmd += ' -U' |
---|
219 | if options.reverseStrand: |
---|
220 | bfast_postprocess_cmd += ' -R' |
---|
221 | |
---|
222 | #instead of using temp files, should we stream through pipes? |
---|
223 | bfast_match_cmd += " > %s" % tmp_bmf_name |
---|
224 | bfast_localalign_cmd += " > %s" % tmp_baf_name |
---|
225 | bfast_postprocess_cmd += " > %s" % options.output |
---|
226 | |
---|
227 | # need to nest try-except in try-finally to handle 2.4 |
---|
228 | try: |
---|
229 | # bfast 'match' |
---|
230 | try: |
---|
231 | tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name |
---|
232 | tmp_stderr = open( tmp, 'wb' ) |
---|
233 | proc = subprocess.Popen( args=bfast_match_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) |
---|
234 | returncode = proc.wait() |
---|
235 | tmp_stderr.close() |
---|
236 | # get stderr, allowing for case where it's very large |
---|
237 | tmp_stderr = open( tmp, 'rb' ) |
---|
238 | stderr = '' |
---|
239 | try: |
---|
240 | while True: |
---|
241 | stderr += tmp_stderr.read( buffsize ) |
---|
242 | if not stderr or len( stderr ) % buffsize != 0: |
---|
243 | break |
---|
244 | except OverflowError: |
---|
245 | pass |
---|
246 | tmp_stderr.close() |
---|
247 | if returncode != 0: |
---|
248 | raise Exception, stderr |
---|
249 | except Exception, e: |
---|
250 | raise Exception, 'Error in \'bfast match\'. \n' + str( e ) |
---|
251 | # bfast 'localalign' |
---|
252 | try: |
---|
253 | tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name |
---|
254 | tmp_stderr = open( tmp, 'wb' ) |
---|
255 | proc = subprocess.Popen( args=bfast_localalign_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) |
---|
256 | returncode = proc.wait() |
---|
257 | tmp_stderr.close() |
---|
258 | # get stderr, allowing for case where it's very large |
---|
259 | tmp_stderr = open( tmp, 'rb' ) |
---|
260 | stderr = '' |
---|
261 | try: |
---|
262 | while True: |
---|
263 | stderr += tmp_stderr.read( buffsize ) |
---|
264 | if not stderr or len( stderr ) % buffsize != 0: |
---|
265 | break |
---|
266 | except OverflowError: |
---|
267 | pass |
---|
268 | tmp_stderr.close() |
---|
269 | if returncode != 0: |
---|
270 | raise Exception, stderr |
---|
271 | except Exception, e: |
---|
272 | raise Exception, 'Error in \'bfast localalign\'. \n' + str( e ) |
---|
273 | # bfast 'postprocess' |
---|
274 | try: |
---|
275 | tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name |
---|
276 | tmp_stderr = open( tmp, 'wb' ) |
---|
277 | proc = subprocess.Popen( args=bfast_postprocess_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) |
---|
278 | returncode = proc.wait() |
---|
279 | tmp_stderr.close() |
---|
280 | # get stderr, allowing for case where it's very large |
---|
281 | tmp_stderr = open( tmp, 'rb' ) |
---|
282 | stderr = '' |
---|
283 | try: |
---|
284 | while True: |
---|
285 | stderr += tmp_stderr.read( buffsize ) |
---|
286 | if not stderr or len( stderr ) % buffsize != 0: |
---|
287 | break |
---|
288 | except OverflowError: |
---|
289 | pass |
---|
290 | tmp_stderr.close() |
---|
291 | if returncode != 0: |
---|
292 | raise Exception, stderr |
---|
293 | except Exception, e: |
---|
294 | raise Exception, 'Error in \'bfast postprocess\'. \n' + str( e ) |
---|
295 | # remove header if necessary |
---|
296 | if options.suppressHeader: |
---|
297 | tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir) |
---|
298 | tmp_out_name = tmp_out.name |
---|
299 | tmp_out.close() |
---|
300 | try: |
---|
301 | shutil.move( options.output, tmp_out_name ) |
---|
302 | except Exception, e: |
---|
303 | raise Exception, 'Error moving output file before removing headers. \n' + str( e ) |
---|
304 | fout = file( options.output, 'w' ) |
---|
305 | for line in file( tmp_out.name, 'r' ): |
---|
306 | if len( line ) < 3 or line[0:3] not in [ '@HD', '@SQ', '@RG', '@PG', '@CO' ]: |
---|
307 | fout.write( line ) |
---|
308 | fout.close() |
---|
309 | # check that there are results in the output file |
---|
310 | if os.path.getsize( options.output ) > 0: |
---|
311 | if "0" == options.space: |
---|
312 | sys.stdout.write( 'BFAST run on Base Space data' ) |
---|
313 | else: |
---|
314 | sys.stdout.write( 'BFAST run on Color Space data' ) |
---|
315 | else: |
---|
316 | raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.' |
---|
317 | except Exception, e: |
---|
318 | stop_err( 'The alignment failed.\n' + str( e ) ) |
---|
319 | finally: |
---|
320 | # clean up temp dir |
---|
321 | if os.path.exists( tmp_dir ): |
---|
322 | shutil.rmtree( tmp_dir ) |
---|
323 | |
---|
324 | if __name__=="__main__": __main__() |
---|