root/galaxy-central/tools/indels/sam_indel_filter.py @ 2

リビジョン 2, 3.5 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python
2
3"""
4Allows user to filter out non-indels from SAM.
5
6usage: %prog [options]
7   -i, --input=i: Input SAM file to be filtered
8   -q, --quality_threshold=q: Minimum quality value for adjacent bases
9   -a, --adjacent_bases=a: Number of adjacent bases on each size to check qualities
10   -o, --output=o: Filtered output SAM file
11"""
12
13import re, sys
14from galaxy import eggs
15import pkg_resources; pkg_resources.require( "bx-python" )
16from bx.cookbook import doc_optparse
17
18
19def stop_err( msg ):
20    sys.stderr.write( '%s\n' % msg )
21    sys.exit()
22
23def __main__():
24    #Parse Command Line
25    options, args = doc_optparse.parse( __doc__ )
26    # prep output file
27    output = open( options.output, 'wb' )
28    # patterns
29    pat = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' )
30    pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )
31    try:
32        qual_thresh = int( options.quality_threshold )
33        if qual_thresh < 0 or qual_thresh > 93:
34            raise ValueError
35    except ValueError:
36        stop_err( 'Your quality threshold should be an integer between 0 and 93, inclusive.' )
37    try:
38        adj_bases = int( options.adjacent_bases )
39        if adj_bases < 1:
40            raise ValueError
41    except ValueError:
42        stop_err( 'The number of adjacent bases should be an integer greater than 1.' )
43    # record lines skipped because of more than one indel
44    multi_indel_lines = 0
45    # go through all lines in input file
46    for i,line in enumerate(open( options.input, 'rb' )):
47        if line and not line.startswith( '#' ) and not line.startswith( '@' ) :
48            split_line = line.split( '\t' )
49            cigar = split_line[5].strip()
50            # find matches like 3M2D7M or 7M3I10M
51            match = {}
52            m = pat.match( cigar )
53            # if unprocessable CIGAR
54            if not m:
55                m = pat_multi.match( cigar )
56                # skip this line if no match
57                if not m:
58                    continue
59                # account for multiple indels or operations we don't process
60                else:
61                    multi_indel_lines += 1
62            # otherwise get matching parts
63            else:
64                match = m.groupdict()
65            # process for indels
66            if match:
67                left = int( match[ 'lmatch' ] )
68                right = int( match[ 'rmatch' ] )
69                if match[ 'ins_del' ] == 'I':
70                    middle = int( match[ 'ins_del_width' ] )
71                else:
72                    middle = 0
73                # if there are enough adjacent bases to check, then do so
74                if left >= adj_bases and right >= adj_bases:
75                    quals = split_line[10]
76                    eligible_quals = quals[ left - adj_bases : left + middle + adj_bases ]
77                    qual_thresh_met = True
78                    for q in eligible_quals:
79                        if ord( q ) - 33 < qual_thresh:
80                            qual_thresh_met = False
81                            break
82                    # if filter reqs met, output line
83                    if qual_thresh_met:
84                        output.write( line )
85    # close out file
86    output.close()
87    # if skipped lines because of more than one indel, output message
88    if multi_indel_lines > 0:
89        sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines )
90
91if __name__=="__main__": __main__()
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。