1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | """ |
---|
4 | Allows user to filter out non-indels from SAM. |
---|
5 | |
---|
6 | usage: %prog [options] |
---|
7 | -i, --input=i: Input SAM file to be filtered |
---|
8 | -q, --quality_threshold=q: Minimum quality value for adjacent bases |
---|
9 | -a, --adjacent_bases=a: Number of adjacent bases on each size to check qualities |
---|
10 | -o, --output=o: Filtered output SAM file |
---|
11 | """ |
---|
12 | |
---|
13 | import re, sys |
---|
14 | from galaxy import eggs |
---|
15 | import pkg_resources; pkg_resources.require( "bx-python" ) |
---|
16 | from bx.cookbook import doc_optparse |
---|
17 | |
---|
18 | |
---|
19 | def stop_err( msg ): |
---|
20 | sys.stderr.write( '%s\n' % msg ) |
---|
21 | sys.exit() |
---|
22 | |
---|
23 | def __main__(): |
---|
24 | #Parse Command Line |
---|
25 | options, args = doc_optparse.parse( __doc__ ) |
---|
26 | # prep output file |
---|
27 | output = open( options.output, 'wb' ) |
---|
28 | # patterns |
---|
29 | pat = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' ) |
---|
30 | pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' ) |
---|
31 | try: |
---|
32 | qual_thresh = int( options.quality_threshold ) |
---|
33 | if qual_thresh < 0 or qual_thresh > 93: |
---|
34 | raise ValueError |
---|
35 | except ValueError: |
---|
36 | stop_err( 'Your quality threshold should be an integer between 0 and 93, inclusive.' ) |
---|
37 | try: |
---|
38 | adj_bases = int( options.adjacent_bases ) |
---|
39 | if adj_bases < 1: |
---|
40 | raise ValueError |
---|
41 | except ValueError: |
---|
42 | stop_err( 'The number of adjacent bases should be an integer greater than 1.' ) |
---|
43 | # record lines skipped because of more than one indel |
---|
44 | multi_indel_lines = 0 |
---|
45 | # go through all lines in input file |
---|
46 | for i,line in enumerate(open( options.input, 'rb' )): |
---|
47 | if line and not line.startswith( '#' ) and not line.startswith( '@' ) : |
---|
48 | split_line = line.split( '\t' ) |
---|
49 | cigar = split_line[5].strip() |
---|
50 | # find matches like 3M2D7M or 7M3I10M |
---|
51 | match = {} |
---|
52 | m = pat.match( cigar ) |
---|
53 | # if unprocessable CIGAR |
---|
54 | if not m: |
---|
55 | m = pat_multi.match( cigar ) |
---|
56 | # skip this line if no match |
---|
57 | if not m: |
---|
58 | continue |
---|
59 | # account for multiple indels or operations we don't process |
---|
60 | else: |
---|
61 | multi_indel_lines += 1 |
---|
62 | # otherwise get matching parts |
---|
63 | else: |
---|
64 | match = m.groupdict() |
---|
65 | # process for indels |
---|
66 | if match: |
---|
67 | left = int( match[ 'lmatch' ] ) |
---|
68 | right = int( match[ 'rmatch' ] ) |
---|
69 | if match[ 'ins_del' ] == 'I': |
---|
70 | middle = int( match[ 'ins_del_width' ] ) |
---|
71 | else: |
---|
72 | middle = 0 |
---|
73 | # if there are enough adjacent bases to check, then do so |
---|
74 | if left >= adj_bases and right >= adj_bases: |
---|
75 | quals = split_line[10] |
---|
76 | eligible_quals = quals[ left - adj_bases : left + middle + adj_bases ] |
---|
77 | qual_thresh_met = True |
---|
78 | for q in eligible_quals: |
---|
79 | if ord( q ) - 33 < qual_thresh: |
---|
80 | qual_thresh_met = False |
---|
81 | break |
---|
82 | # if filter reqs met, output line |
---|
83 | if qual_thresh_met: |
---|
84 | output.write( line ) |
---|
85 | # close out file |
---|
86 | output.close() |
---|
87 | # if skipped lines because of more than one indel, output message |
---|
88 | if multi_indel_lines > 0: |
---|
89 | sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines ) |
---|
90 | |
---|
91 | if __name__=="__main__": __main__() |
---|