1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | """ |
---|
4 | TODO |
---|
5 | 1. decrease memory usage |
---|
6 | 2. multi-fasta fastq file, ex. 454 |
---|
7 | 3. split reads into small chuncks? |
---|
8 | |
---|
9 | SHRiMP wrapper |
---|
10 | |
---|
11 | Inputs: |
---|
12 | 1. reference seq |
---|
13 | 2. reads |
---|
14 | |
---|
15 | Outputs: |
---|
16 | 1. table of 8 columns: |
---|
17 | chrom ref_loc read_id read_loc ref_nuc read_nuc quality coverage |
---|
18 | 2. SHRiMP output |
---|
19 | |
---|
20 | Parameters: |
---|
21 | -s Spaced Seed (default: 111111011111) |
---|
22 | -n Seed Matches per Window (default: 2) |
---|
23 | -t Seed Hit Taboo Length (default: 4) |
---|
24 | -9 Seed Generation Taboo Length (default: 0) |
---|
25 | -w Seed Window Length (default: 115.00%) |
---|
26 | -o Maximum Hits per Read (default: 100) |
---|
27 | -r Maximum Read Length (default: 1000) |
---|
28 | -d Kmer Std. Deviation Limit (default: -1 [None]) |
---|
29 | |
---|
30 | -m S-W Match Value (default: 100) |
---|
31 | -i S-W Mismatch Value (default: -150) |
---|
32 | -g S-W Gap Open Penalty (Reference) (default: -400) |
---|
33 | -q S-W Gap Open Penalty (Query) (default: -400) |
---|
34 | -e S-W Gap Extend Penalty (Reference) (default: -70) |
---|
35 | -f S-W Gap Extend Penalty (Query) (default: -70) |
---|
36 | -h S-W Hit Threshold (default: 68.00%) |
---|
37 | |
---|
38 | Command: |
---|
39 | %rmapper -s spaced_seed -n seed_matches_per_window -t seed_hit_taboo_length -9 seed_generation_taboo_length -w seed_window_length -o max_hits_per_read -r max_read_length -d kmer -m sw_match_value -i sw_mismatch_value -g sw_gap_open_ref -q sw_gap_open_query -e sw_gap_ext_ref -f sw_gap_ext_query -h sw_hit_threshold <query> <target> > <output> 2> <log> |
---|
40 | |
---|
41 | SHRiMP output: |
---|
42 | >7:2:1147:982/1 chr3 + 36586562 36586595 2 35 36 2900 3G16G13 |
---|
43 | >7:2:1147:982/1 chr3 + 95338194 95338225 4 35 36 2700 9T7C14 |
---|
44 | >7:2:587:93/1 chr3 + 14913541 14913577 1 35 36 2960 19--16 |
---|
45 | |
---|
46 | """ |
---|
47 | |
---|
48 | import os, sys, tempfile, os.path, re |
---|
49 | |
---|
# Guard against running under a pre-2.4 interpreter.
# BUG FIX: the original compared against the float (2.4) instead of the tuple
# (2, 4); a tuple-vs-float comparison never performs a real version check.
assert sys.version_info[:2] >= (2, 4), 'Python 2.4 or newer is required'
---|
51 | |
---|
def stop_err( msg ):
    """Report a fatal error message on stderr and terminate the script."""
    sys.stderr.write(msg + "\n")
    sys.exit()
---|
56 | |
---|
def reverse_complement(s):
    """Return the reverse complement of DNA sequence *s*.

    Supports upper/lower-case A/C/G/T, N/n, '.' and '-' (gap); any other
    character raises KeyError, exactly like the original implementation.
    """
    pairs = {"A":"T", "T":"A", "C":"G", "G":"C",
             "a":"t", "t":"a", "c":"g", "g":"c",
             "N":"N", "n":"n", ".":".", "-":"-"}
    # Complementing each base of the reversed string is equivalent to
    # complementing first and reversing afterwards.
    return "".join([pairs[base] for base in s[::-1]])
---|
65 | |
---|
def generate_sub_table(result_file, ref_file, score_files, table_outfile, hit_per_read, insertion_size):
    """Merge SHRiMP mappings, the reference and per-read quality scores into a table.

    Parameters:
        result_file    -- SHRiMP output (10 tab-separated fields per mapping)
        ref_file       -- reference sequences, (multi-)FASTA
        score_files    -- comma-separated quality-score files, one per read end
        table_outfile  -- path of the 8-column table to write
        hit_per_read   -- 1 for single-end data, 2 for paired-end data
        insertion_size -- maximum allowed insert size for a valid read pair

    Writes one row per aligned base:
        chrom  ref_loc  read_id  read_loc  ref_nuc  read_nuc  quality  coverage
    Returns True on completion; calls stop_err() on a fatal input problem.
    """
    # count of editstring characters we could not interpret (reported at the end)
    invalid_editstring_char = 0

    all_score_file = score_files.split(',')

    # one quality-score file is expected per read end
    if len(all_score_file) != hit_per_read: stop_err('One or more query files is missing. Please check your dataset.')

    # first pass writes 7-column rows here; coverage is appended in a second pass
    temp_table_name = tempfile.NamedTemporaryFile().name
    temp_table = open(temp_table_name, 'w')

    outfile = open(table_outfile,'w')

    # reference seq: not a single fasta seq
    refseq = {}        # title -> full sequence string
    chrom_cov = {}     # title -> { 0-based position -> coverage count }
    seq = ''

    for i, line in enumerate(file(ref_file)):
        line = line.rstrip()
        if not line or line.startswith('#'): continue

        if line.startswith('>'):
            # header line: store the record collected so far (first title wins)
            if seq:
                if refseq.has_key(title):
                    pass
                else:
                    refseq[title] = seq
                    chrom_cov[title] = {}
                seq = ''
            title = line[1:]
        else:
            seq += line
    # store the final record
    if seq:
        if not refseq.has_key(title):
            refseq[title] = seq
            chrom_cov[title] = {}

    # find hits : one end and/or the other
    # hits: readname -> { endindex ('1'/'2') ->
    #                     [ [strand, editstring, chrom_start, chrom_end, read_start, chrom], ... ] }
    hits = {}
    for i, line in enumerate(file(result_file)):
        line = line.rstrip()
        if not line or line.startswith('#'): continue

        #FORMAT: readname contigname strand contigstart contigend readstart readend readlength score editstring
        fields = line.split('\t')
        readname = fields[0][1:]          # drop the leading '>'
        chrom = fields[1]
        strand = fields[2]
        chrom_start = int(fields[3]) - 1  # SHRiMP is 1-based; convert to 0-based
        chrom_end = int(fields[4])
        read_start = fields[5]
        read_end = fields[6]
        read_len = fields[7]
        score = fields[8]
        editstring = fields[9]

        if hit_per_read == 1:
            endindex = '1'
        else:
            # paired-end read names look like "name/1" and "name/2"
            readname, endindex = readname.split('/')

        if hits.has_key(readname):
            if hits[readname].has_key(endindex):
                hits[readname][endindex].append([strand, editstring, chrom_start, chrom_end, read_start, chrom])
            else:
                hits[readname][endindex] = [[strand, editstring, chrom_start, chrom_end, read_start, chrom]]
        else:
            hits[readname] = {}
            hits[readname][endindex] = [[strand, editstring, chrom_start, chrom_end, read_start, chrom]]

    # find score : one end and the other end
    # hits_score: readname -> { endindex -> whitespace-separated quality string }
    hits_score = {}
    readname = ''
    score = ''
    for num_score_file in range(len(all_score_file)):
        score_file = all_score_file[num_score_file]
        for i, line in enumerate(file(score_file)):
            line = line.rstrip()
            if not line or line.startswith('#'): continue

            if line.startswith('>'):
                # new record: flush the score collected for the previous one,
                # but only for reads that were mapped at every end
                if score:
                    if hits.has_key(readname):
                        if len(hits[readname]) == hit_per_read:
                            if hits_score.has_key(readname):
                                if hits_score[readname].has_key(endindex):
                                    pass
                                else:
                                    hits_score[readname][endindex] = score
                            else:
                                hits_score[readname] = {}
                                hits_score[readname][endindex] = score
                    score = ''
                if hit_per_read == 1:
                    readname = line[1:]
                    endindex = '1'
                else:
                    readname, endindex = line[1:].split('/')
            else:
                score = line

    if score: # the last one
        if hits.has_key(readname):
            if len(hits[readname]) == hit_per_read:
                if hits_score.has_key(readname):
                    if hits_score[readname].has_key(endindex):
                        pass
                    else:
                        hits_score[readname][endindex] = score
                else:
                    hits_score[readname] = {}
                    hits_score[readname][endindex] = score

    # call to all mappings
    for readkey in hits.keys():
        # skip reads that lack a mapping for one of the ends
        if len(hits[readkey]) != hit_per_read: continue

        matches = []
        match_count = 0

        if hit_per_read == 1:
            # single-end: only uniquely mapped reads are reported
            if len(hits[readkey]['1']) == 1:
                matches = [ hits[readkey]['1'] ]
                match_count = 1
        else:
            # paired-end: look for one pair on the same chrom, on opposite
            # strands, with insert size within the allowed limit
            end1_data = hits[readkey]['1']
            end2_data = hits[readkey]['2']

            for i, end1_hit in enumerate(end1_data):
                crin_strand = {'+': False, '-': False}
                crin_insertSize = {'+': False, '-': False}

                crin_strand[end1_hit[0]] = True
                crin_insertSize[end1_hit[0]] = int(end1_hit[2])

                for j, end2_hit in enumerate(end2_data):
                    crin_strand[end2_hit[0]] = True
                    crin_insertSize[end2_hit[0]] = int(end2_hit[2])

                    # both ends must map to the same chromosome (last field)
                    if end1_hit[-1] != end2_hit[-1] : continue

                    if crin_strand['+'] and crin_strand['-']:
                        if (crin_insertSize['-'] - crin_insertSize['+']) <= insertion_size:
                            matches.append([end1_hit, end2_hit])
                            match_count += 1

        if match_count == 1:
            # exactly one acceptable mapping: emit per-base rows for each end

            for x, end_data in enumerate(matches[0]):

                end_strand, end_editstring, end_chr_start, end_chr_end, end_read_start, end_chrom = end_data
                end_read_start = int(end_read_start) - 1    # to 0-based

                # take the reference segment in read orientation
                if end_strand == '-':
                    refsegment = reverse_complement(refseq[end_chrom][end_chr_start:end_chr_end])
                else:
                    refsegment = refseq[end_chrom][end_chr_start:end_chr_end]

                match_len = 0    # reference bases consumed so far
                editindex = 0    # cursor into the editstring
                gap_read = 0     # number of read-gap ('-') codes seen

                # parse the SHRiMP editstring, e.g. "3G16G13", "19--16", "4x15x6"
                while editindex < len(end_editstring):

                    editchr = end_editstring[editindex]
                    chrA = ''       # reference side of this chunk
                    chrB = ''       # read side of this chunk
                    locIndex = []

                    if editchr.isdigit():
                        # a run of N matching bases
                        editcode = ''

                        while editchr.isdigit() and editindex < len(end_editstring):
                            editcode += editchr
                            editindex += 1
                            if editindex < len(end_editstring): editchr = end_editstring[editindex]

                        for baseIndex in range(int(editcode)):
                            chrA += refsegment[match_len+baseIndex]
                        chrB = chrA

                        match_len += int(editcode)

                    elif editchr == 'x':
                        # crossover: inserted between the appropriate two bases
                        # Two sequencing errors: 4x15x6 (25 matches with 2 crossovers)
                        # Treated as errors in the reads; Do nothing.
                        editindex += 1

                    elif editchr.isalpha():
                        # substitution: read base differs from the reference
                        editcode = editchr
                        editindex += 1
                        chrA = refsegment[match_len]
                        chrB = editcode
                        match_len += len(editcode)

                    elif editchr == '-':
                        # deletion in the read (gap on the read side)
                        editcode = editchr
                        editindex += 1
                        chrA = refsegment[match_len]
                        chrB = editcode
                        match_len += len(editcode)
                        gap_read += 1

                    elif editchr == '(':
                        # insertion in the read, e.g. "(AC)": bases absent from the reference
                        editcode = ''

                        while editchr != ')' and editindex < len(end_editstring):
                            if editindex < len(end_editstring): editchr = end_editstring[editindex]
                            editcode += editchr
                            editindex += 1

                        editcode = editcode[1:-1]    # strip the parentheses
                        chrA = '-'*len(editcode)
                        chrB = editcode

                    else:
                        # unknown editstring character: count it and carry on
                        invalid_editstring_char += 1

                    if end_strand == '-':
                        # put this chunk back into forward-strand orientation
                        chrA = reverse_complement(chrA)
                        chrB = reverse_complement(chrB)

                    pos_line = ''    # rows for a forward-strand chunk
                    rev_line = ''    # rows for a reverse-strand chunk (prepended, so output stays in forward order)

                    for mappingIndex in range(len(chrA)):
                        # reference
                        chrAx = chrA[mappingIndex]
                        # read
                        chrBx = chrB[mappingIndex]

                        if chrAx and chrBx and chrBx.upper() != 'N':

                            if end_strand == '+':

                                chrom_loc = end_chr_start+match_len-len(chrA)+mappingIndex
                                read_loc = end_read_start+match_len-len(chrA)+mappingIndex-gap_read

                                # an inserted read base has no reference base of its own
                                if chrAx == '-': chrom_loc -= 1

                                if chrBx == '-':
                                    # read gap: no quality value exists
                                    scoreBx = '-1'
                                else:
                                    scoreBx = hits_score[readkey][str(x+1)].split()[read_loc]

                                # 1-based on chrom_loc and read_loc
                                pos_line = pos_line + '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) + '\n'

                            else:

                                chrom_loc = end_chr_end-match_len+mappingIndex
                                read_loc = end_read_start+match_len-1-mappingIndex-gap_read

                                if chrAx == '-': chrom_loc -= 1

                                if chrBx == '-':
                                    scoreBx = '-1'
                                else:
                                    scoreBx = hits_score[readkey][str(x+1)].split()[read_loc]

                                # 1-based on chrom_loc and read_loc
                                rev_line = '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) +'\n' + rev_line

                            # tally per-base coverage for the second pass
                            if chrom_cov.has_key(end_chrom):

                                if chrom_cov[end_chrom].has_key(chrom_loc):
                                    chrom_cov[end_chrom][chrom_loc] += 1
                                else:
                                    chrom_cov[end_chrom][chrom_loc] = 1

                            else:

                                chrom_cov[end_chrom] = {}
                                chrom_cov[end_chrom][chrom_loc] = 1

                    if pos_line: temp_table.write('%s\n' %(pos_line.rstrip('\r\n')))
                    if rev_line: temp_table.write('%s\n' %(rev_line.rstrip('\r\n')))

    temp_table.close()

    # chrom-wide coverage
    # second pass: re-read the 7-column rows and append the coverage column
    for i, line in enumerate(open(temp_table_name)):

        line = line.rstrip()
        if not line or line.startswith('#'): continue

        fields = line.split()
        chrom = fields[0]
        eachBp = int(fields[1])
        readname = fields[2]

        if hit_per_read == 1:
            # single-end reads keep their plain name (drop the '/1' suffix)
            fields[2] = readname.split('/')[0]

        if chrom_cov[chrom].has_key(eachBp):
            outfile.write('%s\t%d\n' %('\t'.join(fields), chrom_cov[chrom][eachBp]))
        else:
            outfile.write('%s\t%d\n' %('\t'.join(fields), 0))

    outfile.close()

    if os.path.exists(temp_table_name): os.remove(temp_table_name)

    if invalid_editstring_char:
        print 'Skip ', invalid_editstring_char, ' invalid characters in editstrings'

    return True
---|
376 | |
---|
def convert_fastqsolexa_to_fasta_qual(infile_name, query_fasta, query_qual):
    """Split a fastqsolexa file into a FASTA sequence file and a quality file.

    Parameters:
        infile_name -- input fastq (4 lines per record; blank/'#' lines skipped)
        query_fasta -- output path for the FASTA sequences
        query_qual  -- output path for the space-separated integer qualities

    Quality lines may already be space-separated integers (copied through) or
    an ascii-encoded string (decoded with the Solexa offset 64, or with the
    first character as the offset when the line is one char longer than the
    read).  Returns True on success; calls stop_err() on malformed input.

    Fixes vs. the original: the integer-probe used a bare `except:` (now
    `except ValueError:`), the Py2-only `file()` call is now `open()`, and the
    quality string is built with join instead of quadratic concatenation.
    """
    outfile_seq = open( query_fasta, 'w' )
    outfile_score = open( query_qual, 'w' )

    # first character of the title lines ('@' / '+'), learned from the input
    seq_title_startswith = ''
    qual_title_startswith = ''

    default_coding_value = 64  # Solexa ascii offset
    fastq_block_lines = 0      # position within the 4-line record (1..3, 0)

    for i, line in enumerate( open( infile_name ) ):
        line = line.rstrip()
        if not line or line.startswith( '#' ): continue

        fastq_block_lines = ( fastq_block_lines + 1 ) % 4
        line_startswith = line[0:1]

        if fastq_block_lines == 1:
            # first line is @title_of_seq
            if not seq_title_startswith:
                seq_title_startswith = line_startswith

            if line_startswith != seq_title_startswith:
                outfile_seq.close()
                outfile_score.close()
                stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )

            read_title = line[1:]
            outfile_seq.write( '>%s\n' % line[1:] )

        elif fastq_block_lines == 2:
            # second line is nucleotides
            read_length = len( line )
            outfile_seq.write( '%s\n' % line )

        elif fastq_block_lines == 3:
            # third line is +title_of_qualityscore ( might be skipped )
            if not qual_title_startswith:
                qual_title_startswith = line_startswith

            if line_startswith != qual_title_startswith:
                outfile_seq.close()
                outfile_score.close()
                stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )

            quality_title = line[1:]
            if quality_title and read_title != quality_title:
                outfile_seq.close()
                outfile_score.close()
                stop_err( 'Invalid fastqsolexa format at line %d: sequence title "%s" differes from score title "%s".' % ( i + 1, read_title, quality_title ) )

            # a bare '+' inherits the sequence title
            if not quality_title:
                outfile_score.write( '>%s\n' % read_title )
            else:
                outfile_score.write( '>%s\n' % line[1:] )

        else:
            # fourth line is quality scores
            # peek at the first token: integer qualities or ascii-encoded?
            try:
                int( line.split()[0] )
                fastq_integer = True
            except ValueError:
                fastq_integer = False

            if fastq_integer:
                # already space-separated digits: copy through unchanged
                qual = line
            else:
                # ascii-encoded: determine the offset, then decode per character
                quality_score_length = len( line )
                if quality_score_length == read_length + 1:
                    # first char encodes the offset itself
                    qual_score_startswith = ord( line[0:1] )
                    line = line[1:]
                elif quality_score_length == read_length:
                    qual_score_startswith = default_coding_value
                else:
                    stop_err( 'Invalid fastqsolexa format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( i + 1, quality_score_length, read_length ) )

                # trailing space kept for compatibility with the original output
                qual = ' '.join( [ str( ord( char ) - qual_score_startswith ) for char in line ] ) + ' '

            outfile_score.write( '%s\n' % qual )

    outfile_seq.close()
    outfile_score.close()

    return True
---|
471 | |
---|
def __main__():
    """Drive the pipeline: fastq -> fasta+qual, run SHRiMP, build the table.

    argv: target_fasta shrimp_outfile table_outfile reads_spec [15 SHRiMP params]
    where reads_spec is "fastq" (single-end) or "fastq1,fastq2,insert_size"
    (paired-end).
    """
    # SHRiMP path (resolved via $PATH)
    shrimp = 'rmapper-ls'

    # I/O
    input_target_file = sys.argv[1] # fasta
    shrimp_outfile = sys.argv[2] # shrimp output
    table_outfile = sys.argv[3] # table output
    single_or_paired = sys.argv[4].split(',')

    insertion_size = 600

    if len(single_or_paired) == 1: # single or paired
        type_of_reads = 'single'
        hit_per_read = 1
        input_query = single_or_paired[0]
        query_fasta = tempfile.NamedTemporaryFile().name
        query_qual = tempfile.NamedTemporaryFile().name

    else: # paired-end
        type_of_reads = 'paired'
        hit_per_read = 2
        input_query_end1 = single_or_paired[0]
        input_query_end2 = single_or_paired[1]
        insertion_size = int(single_or_paired[2])
        query_fasta_end1 = tempfile.NamedTemporaryFile().name
        query_fasta_end2 = tempfile.NamedTemporaryFile().name
        query_qual_end1 = tempfile.NamedTemporaryFile().name
        query_qual_end2 = tempfile.NamedTemporaryFile().name

    # SHRiMP parameters: total = 15, default values
    spaced_seed = '111111011111'
    seed_matches_per_window = '2'
    seed_hit_taboo_length = '4'
    seed_generation_taboo_length = '0'
    seed_window_length = '115.0'
    max_hits_per_read = '100'
    max_read_length = '1000'
    kmer = '-1'
    sw_match_value = '100'
    sw_mismatch_value = '-150'
    sw_gap_open_ref = '-400'
    sw_gap_open_query = '-400'
    sw_gap_ext_ref = '-70'
    sw_gap_ext_query = '-70'
    sw_hit_threshold = '68.0'

    # TODO: put the threshold on each of these parameters
    if len(sys.argv) > 5:

        # NOTE(review): isdigit() accepts any digits, not only 0/1, and cannot
        # raise here, so the except branch looks unreachable -- confirm intent.
        try:
            if sys.argv[5].isdigit():
                spaced_seed = sys.argv[5]
            else:
                stop_err('Error in assigning parameter: Spaced seed.')
        except:
            stop_err('Spaced seed must be a combination of 1s and 0s.')

        # remaining 14 parameters are taken verbatim from the command line
        seed_matches_per_window = sys.argv[6]
        seed_hit_taboo_length = sys.argv[7]
        seed_generation_taboo_length = sys.argv[8]
        seed_window_length = sys.argv[9]
        max_hits_per_read = sys.argv[10]
        max_read_length = sys.argv[11]
        kmer = sys.argv[12]
        sw_match_value = sys.argv[13]
        sw_mismatch_value = sys.argv[14]
        sw_gap_open_ref = sys.argv[15]
        sw_gap_open_query = sys.argv[16]
        sw_gap_ext_ref = sys.argv[17]
        sw_gap_ext_query = sys.argv[18]
        sw_hit_threshold = sys.argv[19]

    # temp file for shrimp log file
    shrimp_log = tempfile.NamedTemporaryFile().name

    # convert fastq to fasta and quality score files
    if type_of_reads == 'single':
        return_value = convert_fastqsolexa_to_fasta_qual(input_query, query_fasta, query_qual)
    else:
        return_value = convert_fastqsolexa_to_fasta_qual(input_query_end1, query_fasta_end1, query_qual_end1)
        return_value = convert_fastqsolexa_to_fasta_qual(input_query_end2, query_fasta_end2, query_qual_end2)

    # SHRiMP command (run through the shell; stdout -> shrimp_outfile, stderr -> shrimp_log)
    if type_of_reads == 'single':
        command = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta, input_target_file, '>', shrimp_outfile, '2>', shrimp_log])

        try:
            os.system(command)
        except Exception, e:
            # clean up temp files before aborting
            if os.path.exists(query_fasta): os.remove(query_fasta)
            if os.path.exists(query_qual): os.remove(query_qual)
            stop_err(str(e))

    else: # paired
        # the second end appends ('>>') to the same output and log files
        command_end1 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end1, input_target_file, '>', shrimp_outfile, '2>', shrimp_log])
        command_end2 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end2, input_target_file, '>>', shrimp_outfile, '2>>', shrimp_log])

        try:
            os.system(command_end1)
            os.system(command_end2)
        except Exception, e:
            if os.path.exists(query_fasta_end1): os.remove(query_fasta_end1)
            if os.path.exists(query_fasta_end2): os.remove(query_fasta_end2)
            if os.path.exists(query_qual_end1): os.remove(query_qual_end1)
            if os.path.exists(query_qual_end2): os.remove(query_qual_end2)
            stop_err(str(e))

    # check SHRiMP output: count number of lines
    num_hits = 0
    if shrimp_outfile:
        for i, line in enumerate(file(shrimp_outfile)):
            line = line.rstrip('\r\n')
            if not line or line.startswith('#'): continue
            try:
                fields = line.split()
                num_hits += 1
            except Exception, e:
                stop_err(str(e))

    if num_hits == 0: # no hits generated
        # mine the SHRiMP log for a reason before giving up
        err_msg = ''
        if shrimp_log:
            for i, line in enumerate(file(shrimp_log)):
                if line.startswith('error'): # deal with memory error:
                    err_msg += line # error: realloc failed: Cannot allocate memory
                if re.search('Reads Matched', line): # deal with zero hits
                    if int(line[8:].split()[2]) == 0:
                        err_msg = 'Zero hits found.\n'
        stop_err('SHRiMP Failed due to:\n' + err_msg)

    # convert to table
    if type_of_reads == 'single':
        return_value = generate_sub_table(shrimp_outfile, input_target_file, query_qual, table_outfile, hit_per_read, insertion_size)
    else:
        return_value = generate_sub_table(shrimp_outfile, input_target_file, query_qual_end1+','+query_qual_end2, table_outfile, hit_per_read, insertion_size)

    # remove temp. files
    if type_of_reads == 'single':
        if os.path.exists(query_fasta): os.remove(query_fasta)
        if os.path.exists(query_qual): os.remove(query_qual)
    else:
        if os.path.exists(query_fasta_end1): os.remove(query_fasta_end1)
        if os.path.exists(query_fasta_end2): os.remove(query_fasta_end2)
        if os.path.exists(query_qual_end1): os.remove(query_qual_end1)
        if os.path.exists(query_qual_end2): os.remove(query_qual_end2)

    if os.path.exists(shrimp_log): os.remove(shrimp_log)
---|
621 | |
---|
622 | |
---|
# Standard script entry point.
if __name__ == '__main__': __main__()
---|
624 | |
---|