[2] | 1 | #!/usr/bin/env python |
---|
| 2 | |
---|
| 3 | """ |
---|
| 4 | TODO |
---|
| 5 | 1. decrease memory usage |
---|
| 6 | 2. multi-fasta fastq file, ex. 454 |
---|
| 7 | 3. split reads into small chunks? |
---|
| 8 | |
---|
| 9 | SHRiMP wrapper |
---|
| 10 | |
---|
| 11 | Inputs: |
---|
| 12 | 1. reference seq |
---|
| 13 | 2. reads |
---|
| 14 | |
---|
| 15 | Outputs: |
---|
| 16 | 1. table of 8 columns: |
---|
| 17 | chrom ref_loc read_id read_loc ref_nuc read_nuc quality coverage |
---|
| 18 | 2. SHRiMP output |
---|
| 19 | |
---|
| 20 | Parameters: |
---|
| 21 | -s Spaced Seed (default: 111111011111) |
---|
| 22 | -n Seed Matches per Window (default: 2) |
---|
| 23 | -t Seed Hit Taboo Length (default: 4) |
---|
| 24 | -9 Seed Generation Taboo Length (default: 0) |
---|
| 25 | -w Seed Window Length (default: 115.00%) |
---|
| 26 | -o Maximum Hits per Read (default: 100) |
---|
| 27 | -r Maximum Read Length (default: 1000) |
---|
| 28 | -d Kmer Std. Deviation Limit (default: -1 [None]) |
---|
| 29 | |
---|
| 30 | -m S-W Match Value (default: 100) |
---|
| 31 | -i S-W Mismatch Value (default: -150) |
---|
| 32 | -g S-W Gap Open Penalty (Reference) (default: -400) |
---|
| 33 | -q S-W Gap Open Penalty (Query) (default: -400) |
---|
| 34 | -e S-W Gap Extend Penalty (Reference) (default: -70) |
---|
| 35 | -f S-W Gap Extend Penalty (Query) (default: -70) |
---|
| 36 | -h S-W Hit Threshold (default: 68.00%) |
---|
| 37 | |
---|
| 38 | Command: |
---|
| 39 | %rmapper -s spaced_seed -n seed_matches_per_window -t seed_hit_taboo_length -9 seed_generation_taboo_length -w seed_window_length -o max_hits_per_read -r max_read_length -d kmer -m sw_match_value -i sw_mismatch_value -g sw_gap_open_ref -q sw_gap_open_query -e sw_gap_ext_ref -f sw_gap_ext_query -h sw_hit_threshold <query> <target> > <output> 2> <log> |
---|
| 40 | |
---|
| 41 | SHRiMP output: |
---|
| 42 | >7:2:1147:982/1 chr3 + 36586562 36586595 2 35 36 2900 3G16G13 |
---|
| 43 | >7:2:1147:982/1 chr3 + 95338194 95338225 4 35 36 2700 9T7C14 |
---|
| 44 | >7:2:587:93/1 chr3 + 14913541 14913577 1 35 36 2960 19--16 |
---|
| 45 | |
---|
| 46 | """ |
---|
| 47 | |
---|
| 48 | import os, sys, tempfile, os.path, re |
---|
| 49 | |
---|
# Require Python 2.4 or newer.  The bound must be a tuple: the previous
# "(2.4)" was a plain float, which made the comparison meaningless on
# Python 2 and a TypeError on Python 3.
assert sys.version_info[:2] >= (2, 4)
---|
| 51 | |
---|
def stop_err( msg ):
    """Report a fatal error and terminate the program.

    Writes *msg* followed by a newline to standard error and raises
    ``SystemExit`` with status 1.  (A bare ``sys.exit()`` exits with
    status 0, which tells the calling process that the run succeeded.)
    """
    sys.stderr.write( "%s\n" % msg )
    sys.exit( 1 )
---|
| 56 | |
---|
def reverse_complement(s):
    """Return the reverse complement of the nucleotide string *s*.

    Upper- and lower-case A/C/G/T are swapped with their partners, while
    'N', 'n', '.' and '-' map to themselves.  Any other character raises
    ``KeyError``.
    """
    pairing = {
        "A": "T", "T": "A", "C": "G", "G": "C",
        "a": "t", "t": "a", "c": "g", "g": "c",
        "N": "N", "n": "n", ".": ".", "-": "-",
    }
    # Complement left to right first, then flip the order.
    complemented = [pairing[base] for base in s]
    return "".join(complemented[::-1])
---|
| 65 | |
---|
def _load_reference(ref_file):
    """Read a (multi-)FASTA reference into memory.

    Returns ``(refseq, chrom_cov)``: ``refseq`` maps each title (the text
    after '>') to its sequence, ``chrom_cov`` maps the same titles to an
    empty per-position coverage dict.  Blank lines and lines starting with
    '#' are ignored, records without sequence are not stored, and the first
    record wins when a title is repeated.
    """
    refseq = {}
    chrom_cov = {}
    title = None
    chunks = []

    handle = open(ref_file)
    try:
        for line in handle:
            line = line.rstrip()
            if not line or line.startswith('#'):
                continue

            if line.startswith('>'):
                if chunks and title not in refseq:
                    refseq[title] = ''.join(chunks)
                    chrom_cov[title] = {}
                chunks = []
                title = line[1:]
            elif title is None:
                # Previously this situation crashed with an unbound-name error.
                stop_err('Invalid reference file: sequence found before the first ">" title line.')
            else:
                chunks.append(line)
    finally:
        handle.close()

    if chunks and title not in refseq:
        refseq[title] = ''.join(chunks)
        chrom_cov[title] = {}

    return refseq, chrom_cov


def _load_hits(result_file, hit_per_read):
    """Parse the SHRiMP output into ``hits[readname][endindex] -> list of hits``.

    Each hit is ``[strand, editstring, chrom_start, chrom_end, read_start,
    chrom]`` with ``chrom_start`` converted to 0-based.  For single-end data
    every hit is filed under end '1'; for paired-end data the read name is
    split on '/' into name and end index.
    """
    hits = {}

    handle = open(result_file)
    try:
        for line in handle:
            line = line.rstrip()
            if not line or line.startswith('#'):
                continue

            #FORMAT: readname contigname strand contigstart contigend readstart readend readlength score editstring
            fields = line.split('\t')
            readname = fields[0][1:]
            chrom = fields[1]
            strand = fields[2]
            chrom_start = int(fields[3]) - 1
            chrom_end = int(fields[4])
            read_start = fields[5]
            editstring = fields[9]

            if hit_per_read == 1:
                endindex = '1'
            else:
                readname, endindex = readname.split('/')

            hit = [strand, editstring, chrom_start, chrom_end, read_start, chrom]
            hits.setdefault(readname, {}).setdefault(endindex, []).append(hit)
    finally:
        handle.close()

    return hits


def _load_scores(all_score_file, hits, hit_per_read):
    """Collect the quality line of every read that has hits on all its ends.

    Returns ``hits_score[readname][endindex] -> quality line``.  Only the
    first score seen for a given read/end is kept.
    """
    hits_score = {}

    def remember(readname, endindex, score):
        if readname in hits and len(hits[readname]) == hit_per_read:
            hits_score.setdefault(readname, {}).setdefault(endindex, score)

    for score_file in all_score_file:
        readname = ''
        endindex = ''
        score = ''

        handle = open(score_file)
        try:
            for line in handle:
                line = line.rstrip()
                if not line or line.startswith('#'):
                    continue

                if line.startswith('>'):
                    # A new title closes the previous record.
                    if score:
                        remember(readname, endindex, score)
                        score = ''
                    if hit_per_read == 1:
                        readname = line[1:]
                        endindex = '1'
                    else:
                        readname, endindex = line[1:].split('/')
                else:
                    score = line
        finally:
            handle.close()

        if score:   # the last record of this file
            remember(readname, endindex, score)

    return hits_score


def _pair_hits(end1_data, end2_data, insertion_size):
    """Return the ``[end1_hit, end2_hit]`` combinations that form a valid pair.

    A pair is valid when both hits are on the same chromosome, on opposite
    strands, and the start of the '-' hit minus the start of the '+' hit
    does not exceed *insertion_size*.  Each combination is judged on its
    own; the earlier implementation kept strand/position state from
    previously examined end-2 hits and could therefore accept same-strand
    pairs.
    """
    matches = []
    for end1_hit in end1_data:
        for end2_hit in end2_data:
            if end1_hit[-1] != end2_hit[-1]:
                continue    # different chromosomes
            if end1_hit[0] == end2_hit[0]:
                continue    # same strand
            if end1_hit[0] == '+':
                plus_start, minus_start = int(end1_hit[2]), int(end2_hit[2])
            else:
                plus_start, minus_start = int(end2_hit[2]), int(end1_hit[2])
            if (minus_start - plus_start) <= insertion_size:
                matches.append([end1_hit, end2_hit])
    return matches


def _write_end_rows(readkey, end_label, end_data, refseq, hits_score, chrom_cov, temp_table):
    """Expand one hit's edit string into per-base table rows.

    Writes one 7-column row per aligned base (read bases equal to 'N' are
    skipped) to *temp_table*, increments *chrom_cov* for every row written,
    and returns the number of unrecognised edit-string characters skipped.
    """
    end_strand, end_editstring, end_chr_start, end_chr_end, end_read_start, end_chrom = end_data
    end_read_start = int(end_read_start) - 1

    # The edit string runs in read direction, so walk the reference that way too.
    refsegment = refseq[end_chrom][end_chr_start:end_chr_end]
    if end_strand == '-':
        refsegment = reverse_complement(refsegment)

    qualities = hits_score[readkey][end_label].split()
    coverage = chrom_cov.setdefault(end_chrom, {})
    row_id = readkey + '/' + end_label

    invalid_chars = 0
    match_len = 0       # reference bases consumed so far
    editindex = 0
    gap_read = 0        # bases missing from the read so far
    edit_len = len(end_editstring)

    while editindex < edit_len:

        editchr = end_editstring[editindex]
        chrA = ''       # reference side of this edit operation
        chrB = ''       # read side of this edit operation

        if editchr.isdigit():
            # run of matches: the whole number is the run length
            run_end = editindex
            while run_end < edit_len and end_editstring[run_end].isdigit():
                run_end += 1
            run_length = int(end_editstring[editindex:run_end])
            editindex = run_end

            # index base by base so an inconsistent edit string still raises IndexError
            chrA = ''.join([refsegment[match_len + offset] for offset in range(run_length)])
            chrB = chrA
            match_len += run_length

        elif editchr == 'x':
            # crossover: inserted between the appropriate two bases
            # Two sequencing errors: 4x15x6 (25 matches with 2 crossovers)
            # Treated as errors in the reads; Do nothing.
            editindex += 1

        elif editchr.isalpha():
            # mismatch: the letter is the base found in the read
            editindex += 1
            chrA = refsegment[match_len]
            chrB = editchr
            match_len += 1

        elif editchr == '-':
            # reference base missing from the read
            editindex += 1
            chrA = refsegment[match_len]
            chrB = editchr
            match_len += 1
            gap_read += 1

        elif editchr == '(':
            # insertion in the read: "(BASES)"
            close_at = end_editstring.find(')', editindex)
            if close_at == -1:
                token = end_editstring[editindex:]
                editindex = edit_len
            else:
                token = end_editstring[editindex:close_at + 1]
                editindex = close_at + 1
            inserted = token[1:-1]
            chrA = '-' * len(inserted)
            chrB = inserted

        else:
            invalid_chars += 1
            # Step over the character; without this the loop never terminated.
            editindex += 1

        if end_strand == '-':
            chrA = reverse_complement(chrA)
            chrB = reverse_complement(chrB)

        rows = []
        for mappingIndex in range(len(chrA)):
            chrAx = chrA[mappingIndex]      # reference
            chrBx = chrB[mappingIndex]      # read

            if chrBx.upper() == 'N':
                continue

            if end_strand == '+':
                chrom_loc = end_chr_start + match_len - len(chrA) + mappingIndex
                read_loc = end_read_start + match_len - len(chrA) + mappingIndex - gap_read
            else:
                chrom_loc = end_chr_end - match_len + mappingIndex
                read_loc = end_read_start + match_len - 1 - mappingIndex - gap_read

            if chrAx == '-':
                chrom_loc -= 1

            if chrBx == '-':
                scoreBx = '-1'
            else:
                scoreBx = qualities[read_loc]

            # 1-based on chrom_loc and read_loc
            rows.append('\t'.join([end_chrom, str(chrom_loc + 1), row_id, str(read_loc + 1), chrAx, chrBx, scoreBx]))

            # coverage is keyed by the 0-based reference position
            coverage[chrom_loc] = coverage.get(chrom_loc, 0) + 1

        if rows:
            if end_strand == '-':
                # minus-strand rows are emitted last-generated first
                rows.reverse()
            temp_table.write('\n'.join(rows) + '\n')

    return invalid_chars


def generate_sub_table(result_file, ref_file, score_files, table_outfile, hit_per_read, insertion_size):
    """Convert SHRiMP hits into a per-base table with coverage.

    Parameters:
      result_file    -- SHRiMP output (tab separated, one hit per line)
      ref_file       -- reference sequences in FASTA format
      score_files    -- comma separated quality-score file names, one per end
      table_outfile  -- path of the table to write
      hit_per_read   -- 1 for single-end reads, 2 for paired-end reads
      insertion_size -- largest accepted distance between paired hits

    Only reads with exactly one acceptable mapping are reported.  Each output
    row has 8 tab separated columns:
      chrom ref_loc read_id read_loc ref_nuc read_nuc quality coverage

    Returns True.  Calls stop_err() when the number of score files does not
    match hit_per_read.
    """
    all_score_file = score_files.split(',')

    if len(all_score_file) != hit_per_read:
        stop_err('One or more query files is missing. Please check your dataset.')

    # reference seq: not a single fasta seq
    refseq, chrom_cov = _load_reference(ref_file)

    # find hits : one end and/or the other
    hits = _load_hits(result_file, hit_per_read)

    # find score : one end and the other end
    hits_score = _load_scores(all_score_file, hits, hit_per_read)

    invalid_editstring_char = 0

    # mkstemp creates the file atomically; NamedTemporaryFile().name only
    # yields the name of a file that is deleted again right away.
    temp_fd, temp_table_name = tempfile.mkstemp()
    try:
        temp_table = os.fdopen(temp_fd, 'w')
        try:
            # call to all mappings
            for readkey in hits:
                ends = hits[readkey]
                if len(ends) != hit_per_read:
                    continue

                if hit_per_read == 1:
                    matches = []
                    if len(ends['1']) == 1:
                        matches = [ends['1']]
                else:
                    matches = _pair_hits(ends['1'], ends['2'], insertion_size)

                # ambiguous or unmapped reads are not reported
                if len(matches) != 1:
                    continue

                for x, end_data in enumerate(matches[0]):
                    invalid_editstring_char += _write_end_rows(
                        readkey, str(x + 1), end_data, refseq, hits_score, chrom_cov, temp_table)
        finally:
            temp_table.close()

        # chrom-wide coverage: needs the totals, hence the second pass
        outfile = open(table_outfile, 'w')
        try:
            temp_rows = open(temp_table_name)
            try:
                for line in temp_rows:
                    line = line.rstrip()
                    if not line or line.startswith('#'):
                        continue

                    fields = line.split('\t')
                    chrom = fields[0]
                    eachBp = int(fields[1])

                    if hit_per_read == 1:
                        fields[2] = fields[2].split('/')[0]

                    # the table position is 1-based, the coverage keys are 0-based
                    coverage = chrom_cov[chrom].get(eachBp - 1, 0)
                    outfile.write('%s\t%d\n' % ('\t'.join(fields), coverage))
            finally:
                temp_rows.close()
        finally:
            outfile.close()
    finally:
        if os.path.exists(temp_table_name):
            os.remove(temp_table_name)

    if invalid_editstring_char:
        sys.stdout.write('Skip  %d  invalid characters in editstrings\n' % invalid_editstring_char)

    return True
---|
| 376 | |
---|
def _convert_fastq_records(infile, outfile_seq, outfile_score):
    """Copy FASTQ records from *infile* to a FASTA and a quality stream.

    Blank lines and lines starting with '#' are skipped; the remaining
    lines are consumed in groups of four (title, bases, score title,
    scores).  Calls stop_err() on malformed input.
    """
    default_coding_value = 64   # Solexa ascii-code

    seq_title_startswith = ''
    qual_title_startswith = ''
    fastq_block_lines = 0
    read_title = ''
    read_length = 0

    for i, line in enumerate( infile ):
        line = line.rstrip()
        if not line or line.startswith( '#' ):
            continue

        fastq_block_lines = ( fastq_block_lines + 1 ) % 4
        line_startswith = line[0:1]

        if fastq_block_lines == 1:
            # first line is @title_of_seq; every title must start with
            # the same character as the first one
            if not seq_title_startswith:
                seq_title_startswith = line_startswith

            if line_startswith != seq_title_startswith:
                stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )

            read_title = line[1:]
            outfile_seq.write( '>%s\n' % read_title )

        elif fastq_block_lines == 2:
            # second line is nucleotides
            read_length = len( line )
            outfile_seq.write( '%s\n' % line )

        elif fastq_block_lines == 3:
            # third line is +title_of_qualityscore ( might be skipped )
            if not qual_title_startswith:
                qual_title_startswith = line_startswith

            if line_startswith != qual_title_startswith:
                stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )

            quality_title = line[1:]
            if quality_title and read_title != quality_title:
                stop_err( 'Invalid fastqsolexa format at line %d: sequence title "%s" differes from score title "%s".' % ( i + 1, read_title, quality_title ) )

            # an empty score title inherits the sequence title; a non-empty
            # one was just checked to be identical
            outfile_score.write( '>%s\n' % read_title )

        else:
            # fourth line is quality scores
            # peek: ascii or digits?
            try:
                int( line.split()[0] )
                fastq_integer = True
            except ValueError:
                fastq_integer = False

            if fastq_integer:
                # digits: already space separated numbers
                qual = line
            else:
                # ascii
                quality_score_length = len( line )
                if quality_score_length == read_length + 1:
                    # first char is qual_score_startswith
                    qual_score_startswith = ord( line[0:1] )
                    line = line[1:]
                elif quality_score_length == read_length:
                    qual_score_startswith = default_coding_value
                else:
                    stop_err( 'Invalid fastqsolexa format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( i + 1, quality_score_length, read_length ) )

                # every score is followed by one space, including the last
                qual = ''.join( [ '%d ' % ( ord( char ) - qual_score_startswith ) for char in line ] )

            outfile_score.write( '%s\n' % qual )


def convert_fastqsolexa_to_fasta_qual(infile_name, query_fasta, query_qual):
    """Split a fastqsolexa file into a FASTA file and a quality-score file.

    Parameters:
      infile_name -- FASTQ input path
      query_fasta -- FASTA output path ('>title' followed by the bases)
      query_qual  -- quality output path ('>title' followed by space
                     separated integer scores)

    ASCII encoded scores are decoded with offset 64 unless the score line is
    one character longer than the read, in which case its first character
    supplies the offset.  Integer scores are copied unchanged.

    Returns True.  Calls stop_err() on malformed input; all three files are
    closed on every exit path.
    """
    handles = []
    try:
        outfile_seq = open( query_fasta, 'w' )
        handles.append( outfile_seq )
        outfile_score = open( query_qual, 'w' )
        handles.append( outfile_score )
        infile = open( infile_name )
        handles.append( infile )

        _convert_fastq_records( infile, outfile_seq, outfile_score )
    finally:
        for handle in handles:
            handle.close()

    return True
---|
| 471 | |
---|
def _make_temp_path(registry):
    """Create an empty temporary file, record it in *registry*, return its path."""
    fd, path = tempfile.mkstemp()
    os.close(fd)
    registry.append(path)
    return path


def _run_shrimp(command, out_path, log_path, mode):
    """Run *command* (an argument list) with stdout and stderr sent to files.

    *mode* is 'w' to overwrite both files or 'a' to append to them.  No
    shell is involved, so file names need no quoting.  Returns the exit
    status of the program; raises OSError when it cannot be started.
    """
    import subprocess   # local import: only this helper needs it

    out_handle = open(out_path, mode)
    try:
        log_handle = open(log_path, mode)
        try:
            return subprocess.call(command, stdout=out_handle, stderr=log_handle)
        finally:
            log_handle.close()
    finally:
        out_handle.close()


def __main__():
    """Command-line entry point of the SHRiMP wrapper.

    Arguments (sys.argv):
      1      reference sequences (fasta)
      2      SHRiMP output file
      3      table output file
      4      'reads.fastq' for single-end data, or
             'end1.fastq,end2.fastq,insertion_size' for paired-end data
      5..19  optional: the 15 SHRiMP parameters in the order
             -s -n -t -9 -w -o -r -d -m -i -g -q -e -f -h

    Converts the reads to fasta/quality files, runs rmapper-ls once per end,
    and converts the hits to the table.  Every temporary file is removed on
    all exit paths, including stop_err().
    """
    # SHRiMP path
    shrimp = 'rmapper-ls'

    if len(sys.argv) < 5:
        stop_err('Usage: %s <reference.fasta> <shrimp_output> <table_output> <reads | end1,end2,insertion_size> [15 SHRiMP parameters]' % sys.argv[0])

    # I/O
    input_target_file = sys.argv[1]     # fasta
    shrimp_outfile = sys.argv[2]        # shrimp output
    table_outfile = sys.argv[3]         # table output
    single_or_paired = sys.argv[4].split(',')

    insertion_size = 600

    if len(single_or_paired) == 1:      # single
        hit_per_read = 1
        input_queries = [single_or_paired[0]]
    else:                               # paired-end
        hit_per_read = 2
        if len(single_or_paired) < 3:
            stop_err('Paired-end input must be given as end1,end2,insertion_size.')
        input_queries = single_or_paired[:2]
        try:
            insertion_size = int(single_or_paired[2])
        except ValueError:
            stop_err('Insertion size must be an integer.')

    # SHRiMP parameters: total = 15, default values
    shrimp_flags = ['-s', '-n', '-t', '-9', '-w', '-o', '-r', '-d',
                    '-m', '-i', '-g', '-q', '-e', '-f', '-h']
    shrimp_values = [
        '111111011111',     # -s spaced seed
        '2',                # -n seed matches per window
        '4',                # -t seed hit taboo length
        '0',                # -9 seed generation taboo length
        '115.0',            # -w seed window length
        '100',              # -o maximum hits per read
        '1000',             # -r maximum read length
        '-1',               # -d kmer std. deviation limit
        '100',              # -m S-W match value
        '-150',             # -i S-W mismatch value
        '-400',             # -g S-W gap open penalty (reference)
        '-400',             # -q S-W gap open penalty (query)
        '-70',              # -e S-W gap extend penalty (reference)
        '-70',              # -f S-W gap extend penalty (query)
        '68.0',             # -h S-W hit threshold
    ]

    # TODO: put the threshold on each of these parameters
    if len(sys.argv) > 5:
        if len(sys.argv) < 20:
            stop_err('All 15 SHRiMP parameters must be given when any of them is.')
        if not sys.argv[5].isdigit():
            stop_err('Error in assigning parameter: Spaced seed.\nSpaced seed must be a combination of 1s and 0s.')
        shrimp_values = sys.argv[5:20]

    shrimp_options = []
    for flag, value in zip(shrimp_flags, shrimp_values):
        shrimp_options.extend([flag, value])

    temp_paths = []
    try:
        # temp file for shrimp log file
        shrimp_log = _make_temp_path(temp_paths)

        # convert fastq to fasta and quality score files
        query_fastas = []
        query_quals = []
        for input_query in input_queries:
            query_fasta = _make_temp_path(temp_paths)
            query_qual = _make_temp_path(temp_paths)
            convert_fastqsolexa_to_fasta_qual(input_query, query_fasta, query_qual)
            query_fastas.append(query_fasta)
            query_quals.append(query_qual)

        # SHRiMP command: the first end creates output and log, the second appends
        mode = 'w'
        for query_fasta in query_fastas:
            command = [shrimp] + shrimp_options + [query_fasta, input_target_file]
            try:
                _run_shrimp(command, shrimp_outfile, shrimp_log, mode)
            except (OSError, IOError):
                stop_err(str(sys.exc_info()[1]))
            mode = 'a'

        # check SHRiMP output: count number of lines
        num_hits = 0
        if shrimp_outfile:
            handle = open(shrimp_outfile)
            try:
                for line in handle:
                    line = line.rstrip('\r\n')
                    if not line or line.startswith('#'):
                        continue
                    num_hits += 1
            finally:
                handle.close()

        if num_hits == 0:   # no hits generated
            err_msg = ''
            handle = open(shrimp_log)
            try:
                for line in handle:
                    if line.startswith('error'):        # deal with memory error:
                        err_msg += line                 # error: realloc failed: Cannot allocate memory
                    if 'Reads Matched' in line:         # deal with zero hits
                        if int(line[8:].split()[2]) == 0:
                            err_msg = 'Zero hits found.\n'
            finally:
                handle.close()
            stop_err('SHRiMP Failed due to:\n' + err_msg)

        # convert to table
        generate_sub_table(shrimp_outfile, input_target_file, ','.join(query_quals), table_outfile, hit_per_read, insertion_size)
    finally:
        # remove temp. files
        for path in temp_paths:
            if os.path.exists(path):
                os.remove(path)
---|
| 621 | |
---|
| 622 | |
---|
# Run the wrapper only when this file is executed as a script.
if __name__ == '__main__':
    __main__()
---|
| 624 | |
---|