1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | """ |
---|
4 | Runs BWA on single-end or paired-end data. |
---|
5 | Produces a SAM file containing the mappings. |
---|
6 | Works with BWA version 0.5.3-0.5.5. |
---|
7 | |
---|
8 | usage: bwa_wrapper.py [options] |
---|
9 | -t, --threads=t: The number of threads to use |
---|
10 | -r, --ref=r: The reference genome to use or index |
---|
11 | -f, --fastq=f: The (forward) fastq file to use for the mapping |
---|
12 | -F, --rfastq=F: The reverse fastq file to use for mapping if paired-end data |
---|
13 | -u, --output=u: The file to save the output (SAM format) |
---|
14 | -g, --genAlignType=g: The type of pairing (single or paired) |
---|
15 | -p, --params=p: Parameter setting to use (pre_set or full) |
---|
16 | -s, --fileSource=s: Whether to use a previously indexed reference sequence or one from history (indexed or history) |
---|
17 | -n, --maxEditDist=n: Maximum edit distance if integer |
---|
18 | -m, --fracMissingAligns=m: Fraction of missing alignments given 2% uniform base error rate if fraction |
---|
19 | -o, --maxGapOpens=o: Maximum number of gap opens |
---|
20 | -e, --maxGapExtens=e: Maximum number of gap extensions |
---|
21 | -d, --disallowLongDel=d: Disallow a long deletion within specified bps |
---|
22 | -i, --disallowIndel=i: Disallow indel within specified bps |
---|
23 | -l, --seed=l: Take the first specified subsequences |
---|
24 | -k, --maxEditDistSeed=k: Maximum edit distance to the seed |
---|
25 | -M, --mismatchPenalty=M: Mismatch penalty |
---|
26 | -O, --gapOpenPenalty=O: Gap open penalty |
---|
27 | -E, --gapExtensPenalty=E: Gap extension penalty |
---|
28 | -R, --suboptAlign=R: Proceed with suboptimal alignments even if the top hit is a repeat |
---|
29 | -N, --noIterSearch=N: Disable iterative search |
---|
30 | -T, --outputTopN=T: Output top specified hits |
---|
31 | -S, --maxInsertSize=S: Maximum insert size for a read pair to be considered mapped good |
---|
32 | -P, --maxOccurPairing=P: Maximum occurrences of a read for pairings |
---|
33 | -D, --dbkey=D: Dbkey for reference genome |
---|
34 | -H, --suppressHeader=h: Suppress header |
---|
35 | """ |
---|
36 | |
---|
37 | import optparse, os, shutil, subprocess, sys, tempfile |
---|
38 | |
---|
39 | def stop_err( msg ): |
---|
40 | sys.stderr.write( '%s\n' % msg ) |
---|
41 | sys.exit() |
---|
42 | |
---|
43 | def __main__(): |
---|
44 | #Parse Command Line |
---|
45 | parser = optparse.OptionParser() |
---|
46 | parser.add_option( '-t', '--threads', dest='threads', help='The number of threads to use' ) |
---|
47 | parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to use or index' ) |
---|
48 | parser.add_option( '-f', '--fastq', dest='fastq', help='The (forward) fastq file to use for the mapping' ) |
---|
49 | parser.add_option( '-F', '--rfastq', dest='rfastq', help='The reverse fastq file to use for mapping if paired-end data' ) |
---|
50 | parser.add_option( '-u', '--output', dest='output', help='The file to save the output (SAM format)' ) |
---|
51 | parser.add_option( '-g', '--genAlignType', dest='genAlignType', help='The type of pairing (single or paired)' ) |
---|
52 | parser.add_option( '-p', '--params', dest='params', help='Parameter setting to use (pre_set or full)' ) |
---|
53 | parser.add_option( '-s', '--fileSource', dest='fileSource', help='Whether to use a previously indexed reference sequence or one form history (indexed or history)' ) |
---|
54 | parser.add_option( '-n', '--maxEditDist', dest='maxEditDist', help='Maximum edit distance if integer' ) |
---|
55 | parser.add_option( '-m', '--fracMissingAligns', dest='fracMissingAligns', help='Fraction of missing alignments given 2% uniform base error rate if fraction' ) |
---|
56 | parser.add_option( '-o', '--maxGapOpens', dest='maxGapOpens', help='Maximum number of gap opens' ) |
---|
57 | parser.add_option( '-e', '--maxGapExtens', dest='maxGapExtens', help='Maximum number of gap extensions' ) |
---|
58 | parser.add_option( '-d', '--disallowLongDel', dest='disallowLongDel', help='Disallow a long deletion within specified bps' ) |
---|
59 | parser.add_option( '-i', '--disallowIndel', dest='disallowIndel', help='Disallow indel within specified bps' ) |
---|
60 | parser.add_option( '-l', '--seed', dest='seed', help='Take the first specified subsequences' ) |
---|
61 | parser.add_option( '-k', '--maxEditDistSeed', dest='maxEditDistSeed', help='Maximum edit distance to the seed' ) |
---|
62 | parser.add_option( '-M', '--mismatchPenalty', dest='mismatchPenalty', help='Mismatch penalty' ) |
---|
63 | parser.add_option( '-O', '--gapOpenPenalty', dest='gapOpenPenalty', help='Gap open penalty' ) |
---|
64 | parser.add_option( '-E', '--gapExtensPenalty', dest='gapExtensPenalty', help='Gap extension penalty' ) |
---|
65 | parser.add_option( '-R', '--suboptAlign', dest='suboptAlign', help='Proceed with suboptimal alignments even if the top hit is a repeat' ) |
---|
66 | parser.add_option( '-N', '--noIterSearch', dest='noIterSearch', help='Disable iterative search' ) |
---|
67 | parser.add_option( '-T', '--outputTopN', dest='outputTopN', help='Output top specified hits' ) |
---|
68 | parser.add_option( '-S', '--maxInsertSize', dest='maxInsertSize', help='Maximum insert size for a read pair to be considered mapped good' ) |
---|
69 | parser.add_option( '-P', '--maxOccurPairing', dest='maxOccurPairing', help='Maximum occurrences of a read for pairings' ) |
---|
70 | parser.add_option( '-D', '--dbkey', dest='dbkey', help='Dbkey for reference genome' ) |
---|
71 | parser.add_option( '-H', '--suppressHeader', dest='suppressHeader', help='Suppress header' ) |
---|
72 | (options, args) = parser.parse_args() |
---|
73 | # make temp directory for placement of indices |
---|
74 | tmp_index_dir = tempfile.mkdtemp() |
---|
75 | tmp_dir = tempfile.mkdtemp() |
---|
76 | # index if necessary |
---|
77 | if options.fileSource == 'history': |
---|
78 | ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir ) |
---|
79 | ref_file_name = ref_file.name |
---|
80 | ref_file.close() |
---|
81 | os.symlink( options.ref, ref_file_name ) |
---|
82 | # determine which indexing algorithm to use, based on size |
---|
83 | try: |
---|
84 | size = os.stat( options.ref ).st_size |
---|
85 | if size <= 2**30: |
---|
86 | indexingAlg = 'is' |
---|
87 | else: |
---|
88 | indexingAlg = 'bwtsw' |
---|
89 | except: |
---|
90 | indexingAlg = 'is' |
---|
91 | indexing_cmds = '-a %s' % indexingAlg |
---|
92 | cmd1 = 'bwa index %s %s' % ( indexing_cmds, ref_file_name ) |
---|
93 | try: |
---|
94 | tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name |
---|
95 | tmp_stderr = open( tmp, 'wb' ) |
---|
96 | proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() ) |
---|
97 | returncode = proc.wait() |
---|
98 | tmp_stderr.close() |
---|
99 | # get stderr, allowing for case where it's very large |
---|
100 | tmp_stderr = open( tmp, 'rb' ) |
---|
101 | stderr = '' |
---|
102 | buffsize = 1048576 |
---|
103 | try: |
---|
104 | while True: |
---|
105 | stderr += tmp_stderr.read( buffsize ) |
---|
106 | if not stderr or len( stderr ) % buffsize != 0: |
---|
107 | break |
---|
108 | except OverflowError: |
---|
109 | pass |
---|
110 | tmp_stderr.close() |
---|
111 | if returncode != 0: |
---|
112 | raise Exception, stderr |
---|
113 | except Exception, e: |
---|
114 | # clean up temp dirs |
---|
115 | if os.path.exists( tmp_index_dir ): |
---|
116 | shutil.rmtree( tmp_index_dir ) |
---|
117 | if os.path.exists( tmp_dir ): |
---|
118 | shutil.rmtree( tmp_dir ) |
---|
119 | stop_err( 'Error indexing reference sequence. ' + str( e ) ) |
---|
120 | else: |
---|
121 | ref_file_name = options.ref |
---|
122 | # set up aligning and generate aligning command options |
---|
123 | if options.params == 'pre_set': |
---|
124 | aligning_cmds = '-t %s' % options.threads |
---|
125 | gen_alignment_cmds = '' |
---|
126 | else: |
---|
127 | if options.maxEditDist != '0': |
---|
128 | editDist = options.maxEditDist |
---|
129 | else: |
---|
130 | editDist = options.fracMissingAligns |
---|
131 | if options.seed != '-1': |
---|
132 | seed = '-l %s' % options.seed |
---|
133 | else: |
---|
134 | seed = '' |
---|
135 | if options.suboptAlign == 'true': |
---|
136 | suboptAlign = '-R' |
---|
137 | else: |
---|
138 | suboptAlign = '' |
---|
139 | if options.noIterSearch == 'true': |
---|
140 | noIterSearch = '-N' |
---|
141 | else: |
---|
142 | noIterSearch = '' |
---|
143 | aligning_cmds = '-n %s -o %s -e %s -d %s -i %s %s -k %s -t %s -M %s -O %s -E %s %s %s' % \ |
---|
144 | ( editDist, options.maxGapOpens, options.maxGapExtens, options.disallowLongDel, |
---|
145 | options.disallowIndel, seed, options.maxEditDistSeed, options.threads, |
---|
146 | options.mismatchPenalty, options.gapOpenPenalty, options.gapExtensPenalty, |
---|
147 | suboptAlign, noIterSearch ) |
---|
148 | if options.genAlignType == 'single': |
---|
149 | gen_alignment_cmds = '-n %s' % options.outputTopN |
---|
150 | elif options.genAlignType == 'paired': |
---|
151 | gen_alignment_cmds = '-a %s -o %s' % ( options.maxInsertSize, options.maxOccurPairing ) |
---|
152 | # set up output files |
---|
153 | tmp_align_out = tempfile.NamedTemporaryFile( dir=tmp_dir ) |
---|
154 | tmp_align_out_name = tmp_align_out.name |
---|
155 | tmp_align_out.close() |
---|
156 | tmp_align_out2 = tempfile.NamedTemporaryFile( dir=tmp_dir ) |
---|
157 | tmp_align_out2_name = tmp_align_out2.name |
---|
158 | tmp_align_out2.close() |
---|
159 | # prepare actual aligning and generate aligning commands |
---|
160 | cmd2 = 'bwa aln %s %s %s > %s' % ( aligning_cmds, ref_file_name, options.fastq, tmp_align_out_name ) |
---|
161 | cmd2b = '' |
---|
162 | if options.genAlignType == 'paired': |
---|
163 | cmd2b = 'bwa aln %s %s %s > %s' % ( aligning_cmds, ref_file_name, options.rfastq, tmp_align_out2_name ) |
---|
164 | cmd3 = 'bwa sampe %s %s %s %s %s %s >> %s' % ( gen_alignment_cmds, ref_file_name, tmp_align_out_name, tmp_align_out2_name, options.fastq, options.rfastq, options.output ) |
---|
165 | else: |
---|
166 | cmd3 = 'bwa samse %s %s %s %s >> %s' % ( gen_alignment_cmds, ref_file_name, tmp_align_out_name, options.fastq, options.output ) |
---|
167 | # perform alignments |
---|
168 | buffsize = 1048576 |
---|
169 | try: |
---|
170 | # need to nest try-except in try-finally to handle 2.4 |
---|
171 | try: |
---|
172 | # align |
---|
173 | try: |
---|
174 | tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name |
---|
175 | tmp_stderr = open( tmp, 'wb' ) |
---|
176 | proc = subprocess.Popen( args=cmd2, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) |
---|
177 | returncode = proc.wait() |
---|
178 | tmp_stderr.close() |
---|
179 | # get stderr, allowing for case where it's very large |
---|
180 | tmp_stderr = open( tmp, 'rb' ) |
---|
181 | stderr = '' |
---|
182 | try: |
---|
183 | while True: |
---|
184 | stderr += tmp_stderr.read( buffsize ) |
---|
185 | if not stderr or len( stderr ) % buffsize != 0: |
---|
186 | break |
---|
187 | except OverflowError: |
---|
188 | pass |
---|
189 | tmp_stderr.close() |
---|
190 | if returncode != 0: |
---|
191 | raise Exception, stderr |
---|
192 | except Exception, e: |
---|
193 | raise Exception, 'Error aligning sequence. ' + str( e ) |
---|
194 | # and again if paired data |
---|
195 | try: |
---|
196 | if cmd2b: |
---|
197 | tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name |
---|
198 | tmp_stderr = open( tmp, 'wb' ) |
---|
199 | proc = subprocess.Popen( args=cmd2b, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) |
---|
200 | returncode = proc.wait() |
---|
201 | tmp_stderr.close() |
---|
202 | # get stderr, allowing for case where it's very large |
---|
203 | tmp_stderr = open( tmp, 'rb' ) |
---|
204 | stderr = '' |
---|
205 | try: |
---|
206 | while True: |
---|
207 | stderr += tmp_stderr.read( buffsize ) |
---|
208 | if not stderr or len( stderr ) % buffsize != 0: |
---|
209 | break |
---|
210 | except OverflowError: |
---|
211 | pass |
---|
212 | tmp_stderr.close() |
---|
213 | if returncode != 0: |
---|
214 | raise Exception, stderr |
---|
215 | except Exception, e: |
---|
216 | raise Exception, 'Error aligning second sequence. ' + str( e ) |
---|
217 | # generate align |
---|
218 | try: |
---|
219 | tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name |
---|
220 | tmp_stderr = open( tmp, 'wb' ) |
---|
221 | proc = subprocess.Popen( args=cmd3, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() ) |
---|
222 | returncode = proc.wait() |
---|
223 | tmp_stderr.close() |
---|
224 | # get stderr, allowing for case where it's very large |
---|
225 | tmp_stderr = open( tmp, 'rb' ) |
---|
226 | stderr = '' |
---|
227 | try: |
---|
228 | while True: |
---|
229 | stderr += tmp_stderr.read( buffsize ) |
---|
230 | if not stderr or len( stderr ) % buffsize != 0: |
---|
231 | break |
---|
232 | except OverflowError: |
---|
233 | pass |
---|
234 | tmp_stderr.close() |
---|
235 | if returncode != 0: |
---|
236 | raise Exception, stderr |
---|
237 | except Exception, e: |
---|
238 | raise Exception, 'Error generating alignments. ' + str( e ) |
---|
239 | # remove header if necessary |
---|
240 | if options.suppressHeader == 'true': |
---|
241 | tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir) |
---|
242 | tmp_out_name = tmp_out.name |
---|
243 | tmp_out.close() |
---|
244 | try: |
---|
245 | shutil.move( options.output, tmp_out_name ) |
---|
246 | except Exception, e: |
---|
247 | raise Exception, 'Error moving output file before removing headers. ' + str( e ) |
---|
248 | fout = file( options.output, 'w' ) |
---|
249 | for line in file( tmp_out.name, 'r' ): |
---|
250 | if not ( line.startswith( '@HD' ) or line.startswith( '@SQ' ) or line.startswith( '@RG' ) or line.startswith( '@PG' ) or line.startswith( '@CO' ) ): |
---|
251 | fout.write( line ) |
---|
252 | fout.close() |
---|
253 | # check that there are results in the output file |
---|
254 | if os.path.getsize( options.output ) > 0: |
---|
255 | sys.stdout.write( 'BWA run on %s-end data' % options.genAlignType ) |
---|
256 | else: |
---|
257 | raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.' |
---|
258 | except Exception, e: |
---|
259 | stop_err( 'The alignment failed.\n' + str( e ) ) |
---|
260 | finally: |
---|
261 | # clean up temp dir |
---|
262 | if os.path.exists( tmp_index_dir ): |
---|
263 | shutil.rmtree( tmp_index_dir ) |
---|
264 | if os.path.exists( tmp_dir ): |
---|
265 | shutil.rmtree( tmp_dir ) |
---|
266 | |
---|
267 | if __name__=="__main__": __main__() |
---|