root/galaxy-central/tools/indels/indel_analysis.py @ 2

リビジョン 2, 9.8 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python
2
3"""
4Given an input sam file, provides analysis of the indels..
5
6usage: %prog [options] [input3 sum3[ input4 sum4[ input5 sum5[...]]]]
7   -i, --input=i: The sam file to analyze
8   -t, --threshold=t: The deletion frequency threshold
9   -I, --out_ins=I: The interval output file showing insertions
10   -D, --out_del=D: The interval output file showing deletions
11"""
12
13import re, sys
14from galaxy import eggs
15import pkg_resources; pkg_resources.require( "bx-python" )
16from bx.cookbook import doc_optparse
17
18
19def stop_err( msg ):
20    sys.stderr.write( '%s\n' % msg )
21    sys.exit()
22
23def add_to_mis_matches( mis_matches, pos, bases ):
24    """
25    Adds the bases and counts to the mis_matches dict
26    """
27    for j, base in enumerate( bases ):
28        try:
29            mis_matches[ pos + j ][ base ] += 1
30        except KeyError:
31            try:
32                mis_matches[ pos + j ][ base ] = 1
33            except KeyError:
34                mis_matches[ pos + j ] = { base: 1 }
35
36def __main__():
37    #Parse Command Line
38    options, args = doc_optparse.parse( __doc__ )
39    # prep output files
40    out_ins = open( options.out_ins, 'wb' )
41    out_del = open( options.out_del, 'wb' )
42    # patterns
43    pat = re.compile( '^((?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M)$|((?P<match_width>\d+)M)$' )
44    pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' )
45    # for tracking occurences at each pos of ref
46    mis_matches = {}
47    indels = {}
48    multi_indel_lines = 0
49    # go through all lines in input file
50    for i,line in enumerate( open( options.input, 'rb' ) ):
51        if line.strip() and not line.startswith( '#' ) and not line.startswith( '@' ) :
52            split_line = line.split( '\t' )
53            chrom = split_line[2].strip()
54            pos = int( split_line[3].strip() )
55            cigar = split_line[5].strip()
56            bases = split_line[9].strip()
57            # if not an indel or match, exit
58            if chrom == '*':
59                continue
60            # find matches like 3M2D7M or 7M3I10M
61            match = {}
62            m = pat.match( cigar )
63            # unprocessable CIGAR
64            if not m:
65                m = pat_multi.match( cigar )
66                # skip this line if no match
67                if not m:
68                    continue
69                # account for multiple indels or operations we don't process
70                else:
71                    multi_indel_lines += 1
72            # get matching parts for the indel or full match if matching
73            else:
74                if not mis_matches.has_key( chrom ):
75                    mis_matches[ chrom ] = {}
76                    indels[ chrom ] = { 'D': {}, 'I': {} }
77                parts = m.groupdict()
78                if parts[ 'match_width' ] or ( parts[ 'lmatch' ] and parts[ 'ins_del_width' ] and parts[ 'rmatch' ] ):
79                    match = parts
80            # see if matches meet filter requirements
81            if match:
82                # match/mismatch
83                if parts[ 'match_width' ]:
84                    add_to_mis_matches( mis_matches[ chrom ], pos, bases )
85                # indel
86                else:
87                    # pieces of CIGAR string
88                    left = int( match[ 'lmatch' ] )
89                    middle = int( match[ 'ins_del_width' ] )
90                    right = int( match[ 'rmatch' ] )
91                    left_bases = bases[ : left ]
92                    if match[ 'ins_del' ] == 'I':
93                        middle_bases = bases[ left : left + middle ]
94                    else:
95                        middle_bases = ''
96                    right_bases = bases[ -right : ]
97                    start = pos + left
98                    # add data to ref_pos dict for match/mismatch bases on left and on right
99                    add_to_mis_matches( mis_matches[ chrom ], pos, left_bases )
100                    if match[ 'ins_del' ] == 'I':
101                        add_to_mis_matches( mis_matches[ chrom ], start, right_bases )
102                    else:
103                        add_to_mis_matches( mis_matches[ chrom ], start + middle, right_bases )
104                    # for insertions, count instances of particular inserted bases
105                    if match[ 'ins_del' ] == 'I':
106                        if indels[ chrom ][ 'I' ].has_key( start ):
107                            try:
108                                indels[ chrom ][ 'I' ][ start ][ middle_bases ] += 1
109                            except KeyError:
110                                indels[ chrom ][ 'I' ][ start ][ middle_bases ] = 1
111                        else:
112                            indels[ chrom ][ 'I' ][ start ] = { middle_bases: 1 }
113                    # for deletions, count number of deletions bases
114                    else:
115                        if indels[ chrom ][ 'D' ].has_key( start ):
116                            try:
117                                indels[ chrom ][ 'D' ][ start ][ middle ] += 1
118                            except KeyError:
119                                indels[ chrom ][ 'D' ][ start ][ middle ] = 1
120                        else:
121                            indels[ chrom ][ 'D' ][ start ] = { middle: 1 }
122    # compute deletion frequencies and insertion frequencies for checking against threshold
123    freqs = {}
124    ins_freqs = {}
125    chroms = mis_matches.keys()
126    chroms.sort()
127    for chrom in chroms:
128        freqs[ chrom ] = {}
129        ins_freqs[ chrom ] = {}
130        poses = mis_matches[ chrom ].keys()
131        poses.extend( indels[ chrom ][ 'D' ].keys() )
132        poses.extend( indels[ chrom ][ 'I' ].keys() )
133        poses = list( set( poses ) )
134        for pos in poses:
135            # all reads touching this particular position
136            freqs[ chrom ][ pos ] = {}
137            sum_counts = 0.0
138            sum_counts_end = 0.0
139            # get basic counts (match/mismatch)
140            try:
141                sum_counts += float( sum( mis_matches[ chrom ][ pos ].values() ) )
142            except KeyError:
143                pass
144            try:
145                sum_counts_end += float( sum( mis_matches[ chrom ][ pos + 1 ].values() ) )
146            except KeyError:
147                pass
148            # add deletions also touching this position
149            try:
150                sum_counts += float( sum( indels[ chrom ][ 'D' ][ pos ].values() ) )
151            except KeyError:
152                pass
153            try:
154                sum_counts_end += float( sum( indels[ chrom ][ 'D' ][ pos + 1 ].values() ) )
155            except KeyError:
156                pass
157            freqs[ chrom ][ pos ][ 'total' ] = sum_counts
158            # calculate actual frequencies
159            # deletions
160            # frequencies for deletions
161            try:
162                for d in indels[ chrom ][ 'D' ][ pos ].keys():
163                    freqs[ chrom ][ pos ][ d ] = indels[ chrom ][ 'D' ][ pos ][ d ] / sum_counts
164            except KeyError:
165                pass
166            # frequencies for matches/mismatches
167            try:
168                for base in mis_matches[ chrom ][ pos ].keys():
169                    try:
170                        prop = float( mis_matches[ chrom ][ pos ][ base ] ) / sum_counts
171                        freqs[ chrom ][ pos ][ base ] = prop
172                    except ZeroDivisionError:
173                        freqs[ chrom ][ pos ][ base ] = 0.0
174            except KeyError:
175                pass
176            # insertions
177            try:
178                for bases in indels[ chrom ][ 'I' ][ pos ].keys():
179                    prop_start = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts )
180                    try:
181                        prop_end = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts_end )
182                    except ZeroDivisionError:
183                        prop_end = 0.0
184                    try:
185                        ins_freqs[ chrom ][ pos ][ bases ] = [ prop_start, prop_end ]
186                    except KeyError:
187                        ins_freqs[ chrom ][ pos ] = { bases: [ prop_start, prop_end ] }
188            except KeyError, e:
189                pass
190    # output to files if meet threshold requirement
191    threshold = float( options.threshold )
192    #out_del.write( '#Chrom\tStart\tEnd\t#Del\t#Reads\t%TotReads\n' )
193    #out_ins.write( '#Chrom\tStart\tEnd\tInsBases\t#Reads\t%TotReadsAtStart\t%ReadsAtEnd\n' )
194    for chrom in chroms:
195        # deletions file
196        poses = indels[ chrom ][ 'D' ].keys()
197        poses.sort()
198        for pos in poses:
199            start = pos
200            dels = indels[ chrom ][ 'D' ][ start ].keys()
201            dels.sort()
202            for d in dels:
203                end = start + d
204                prop = freqs[ chrom ][ start ][ d ]
205                if prop > threshold :
206                    out_del.write( '%s\t%s\t%s\t%s\t%.2f\n' % ( chrom, start, end, indels[ chrom ][ 'D' ][ pos ][ d ], 100.0 * prop ) )
207        # insertions file
208        poses = indels[ chrom ][ 'I' ].keys()
209        poses.sort()
210        for pos in poses:
211            start = pos
212            end = pos + 1
213            ins_bases = indels[ chrom ][ 'I' ][ start ].keys()
214            ins_bases.sort()
215            for bases in ins_bases:
216                prop_start = ins_freqs[ chrom ][ start ][ bases ][0]
217                prop_end = ins_freqs[ chrom ][ start ][ bases ][1]
218                if prop_start > threshold or prop_end > threshold:
219                    out_ins.write( '%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\n' % ( chrom, start, end, bases, indels[ chrom ][ 'I' ][ start ][ bases ], 100.0 * prop_start, 100.0 * prop_end ) )
220    # close out files
221    out_del.close()
222    out_ins.close()
223    # if skipped lines because of more than one indel, output message
224    if multi_indel_lines > 0:
225        sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines )
226
227if __name__=="__main__": __main__()
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。