[2] | 1 | #!/usr/bin/env python |
---|
| 2 | |
---|
| 3 | """ |
---|
| 4 | Allows user to filter out non-indels from SAM. |
---|
| 5 | |
---|
| 6 | usage: %prog [options] |
---|
| 7 | -i, --input=i: Input SAM file to be filtered |
---|
| 8 | -q, --quality_threshold=q: Minimum quality value for adjacent bases |
---|
| 9 | -a, --adjacent_bases=a: Number of adjacent bases on each size to check qualities |
---|
| 10 | -o, --output=o: Filtered output SAM file |
---|
| 11 | """ |
---|
| 12 | |
---|
| 13 | import re, sys |
---|
| 14 | from galaxy import eggs |
---|
| 15 | import pkg_resources; pkg_resources.require( "bx-python" ) |
---|
| 16 | from bx.cookbook import doc_optparse |
---|
| 17 | |
---|
| 18 | |
---|
| 19 | def stop_err( msg ): |
---|
| 20 | sys.stderr.write( '%s\n' % msg ) |
---|
| 21 | sys.exit() |
---|
| 22 | |
---|
| 23 | def __main__(): |
---|
| 24 | #Parse Command Line |
---|
| 25 | options, args = doc_optparse.parse( __doc__ ) |
---|
| 26 | # prep output file |
---|
| 27 | output = open( options.output, 'wb' ) |
---|
| 28 | # patterns |
---|
| 29 | pat = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' ) |
---|
| 30 | pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' ) |
---|
| 31 | try: |
---|
| 32 | qual_thresh = int( options.quality_threshold ) |
---|
| 33 | if qual_thresh < 0 or qual_thresh > 93: |
---|
| 34 | raise ValueError |
---|
| 35 | except ValueError: |
---|
| 36 | stop_err( 'Your quality threshold should be an integer between 0 and 93, inclusive.' ) |
---|
| 37 | try: |
---|
| 38 | adj_bases = int( options.adjacent_bases ) |
---|
| 39 | if adj_bases < 1: |
---|
| 40 | raise ValueError |
---|
| 41 | except ValueError: |
---|
| 42 | stop_err( 'The number of adjacent bases should be an integer greater than 1.' ) |
---|
| 43 | # record lines skipped because of more than one indel |
---|
| 44 | multi_indel_lines = 0 |
---|
| 45 | # go through all lines in input file |
---|
| 46 | for i,line in enumerate(open( options.input, 'rb' )): |
---|
| 47 | if line and not line.startswith( '#' ) and not line.startswith( '@' ) : |
---|
| 48 | split_line = line.split( '\t' ) |
---|
| 49 | cigar = split_line[5].strip() |
---|
| 50 | # find matches like 3M2D7M or 7M3I10M |
---|
| 51 | match = {} |
---|
| 52 | m = pat.match( cigar ) |
---|
| 53 | # if unprocessable CIGAR |
---|
| 54 | if not m: |
---|
| 55 | m = pat_multi.match( cigar ) |
---|
| 56 | # skip this line if no match |
---|
| 57 | if not m: |
---|
| 58 | continue |
---|
| 59 | # account for multiple indels or operations we don't process |
---|
| 60 | else: |
---|
| 61 | multi_indel_lines += 1 |
---|
| 62 | # otherwise get matching parts |
---|
| 63 | else: |
---|
| 64 | match = m.groupdict() |
---|
| 65 | # process for indels |
---|
| 66 | if match: |
---|
| 67 | left = int( match[ 'lmatch' ] ) |
---|
| 68 | right = int( match[ 'rmatch' ] ) |
---|
| 69 | if match[ 'ins_del' ] == 'I': |
---|
| 70 | middle = int( match[ 'ins_del_width' ] ) |
---|
| 71 | else: |
---|
| 72 | middle = 0 |
---|
| 73 | # if there are enough adjacent bases to check, then do so |
---|
| 74 | if left >= adj_bases and right >= adj_bases: |
---|
| 75 | quals = split_line[10] |
---|
| 76 | eligible_quals = quals[ left - adj_bases : left + middle + adj_bases ] |
---|
| 77 | qual_thresh_met = True |
---|
| 78 | for q in eligible_quals: |
---|
| 79 | if ord( q ) - 33 < qual_thresh: |
---|
| 80 | qual_thresh_met = False |
---|
| 81 | break |
---|
| 82 | # if filter reqs met, output line |
---|
| 83 | if qual_thresh_met: |
---|
| 84 | output.write( line ) |
---|
| 85 | # close out file |
---|
| 86 | output.close() |
---|
| 87 | # if skipped lines because of more than one indel, output message |
---|
| 88 | if multi_indel_lines > 0: |
---|
| 89 | sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines ) |
---|
| 90 | |
---|
| 91 | if __name__=="__main__": __main__() |
---|