1 | #!/usr/bin/env python |
---|
2 | #Guruprasad Ananda |
---|
3 | |
---|
4 | from galaxy import eggs |
---|
5 | import pkg_resources |
---|
6 | pkg_resources.require( "bx-python" ) |
---|
7 | |
---|
8 | import sys, os, tempfile |
---|
9 | import traceback |
---|
10 | import fileinput |
---|
11 | from warnings import warn |
---|
12 | |
---|
13 | from galaxy.tools.util.galaxyops import * |
---|
14 | from bx.intervals.io import * |
---|
15 | |
---|
16 | from bx.intervals.operations import quicksect |
---|
17 | |
---|
def stop_err(msg):
    """Write *msg* to standard error and abort the script.

    msg -- text written to ``sys.stderr`` exactly as given (no newline added)

    Raises SystemExit with exit status 1, so the failure is visible to the
    calling process through the exit code as well as through stderr.
    """
    sys.stderr.write(msg)
    # A bare sys.exit() would report status 0 (success) for a fatal error.
    sys.exit(1)
---|
21 | |
---|
def counter(node, start, end, sort_col):
    """Count the intervals under *node* that lie entirely inside [start, end].

    Results are accumulated in module-level globals, which the caller must
    initialise before the first call:

    * ``full``     -- incremented once for every interval with both ends in
      range
    * ``blk_list`` -- the distinct ``node.other[0]`` values met so far
    * ``blk_len``  -- sum of ``int(node.other[sort_col + 2])``, added only the
      first time a given ``node.other[0]`` value is met

    node     -- subtree root exposing ``start``, ``end``, ``left``, ``right``,
                ``maxend`` and ``other``
    start    -- lower bound of the query range (inclusive)
    end      -- upper bound of the query range (inclusive)
    sort_col -- column offset; ``sort_col + 2`` is the index read from
                ``node.other`` for the length

    NOTE(review): the pruning below assumes the tree is ordered by interval
    start (larger starts to the right) and that ``maxend`` is the largest end
    in a subtree -- confirm against bx.intervals.operations.quicksect.
    """
    global full, blk_len, blk_list
    if node.start < start:
        # This node starts too early; only the right subtree is searched.
        if node.right:
            counter(node.right, start, end, sort_col)
    elif node.start > end:
        # This node starts too late; only the left subtree is searched.
        if node.left:
            counter(node.left, start, end, sort_col)
    else:
        # The node starts inside the range; it counts only if it also ends
        # inside the range.
        if start <= node.end <= end:
            full += 1
            if node.other[0] not in blk_list:
                blk_list.append(node.other[0])
                blk_len += int(node.other[sort_col + 2])
        # Search both subtrees even when this node itself overruns `end`.
        # Previously such a node matched no branch at all, so every contained
        # interval stored beneath it was silently left out of the count.
        if node.left and node.left.maxend > start:
            counter(node.left, start, end, sort_col)
        if node.right:
            counter(node.right, start, end, sort_col)
---|
39 | |
---|
40 | |
---|
# Command-line arguments:
#   argv[1]  input indel table
#   argv[2]  output path
#   argv[3]  interval file, or the literal string "None"
#   argv[4]  build (dbkey) of the interval file      -- interval mode only
#   argv[5]  column specification for parse_cols_arg -- interval mode only
infile = sys.argv[1]
fout = open(sys.argv[2], 'w')
int_file = sys.argv[3]
if int_file != "None":  # User has specified an interval file
    try:
        # Opened only to confirm the file is readable; main() reads it again
        # by name through fileinput, so the probe handle is closed right away
        # instead of being held for the life of the process.
        fint = open(int_file, 'r')
        fint.close()
        dbkey_i = sys.argv[4]
        chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg(sys.argv[5])
    except Exception:
        # `except Exception` rather than a bare except, so that
        # KeyboardInterrupt and SystemExit are not reported as a file error.
        stop_err("Unable to open input Interval file")
---|
51 | |
---|
_BAD_INPUT_MSG = "This tool only works on tabular data output by 'Fetch Indels from 3-way alignments' tool. The data in your input dataset is either missing or not formatted properly."


def _iter_lines(path):
    """Yield each line of the file at *path* without its line ending; close the file afterwards."""
    handle = open(path)
    try:
        for line in handle:
            yield line.rstrip('\r\n')
    finally:
        handle.close()


def _is_indel_row(elems):
    """Return True when *elems* has 18 fields and its first field is a non-zero integer."""
    if len(elems) != 18:
        return False
    try:
        # NOTE(review): a first field of 0 is rejected, as the original
        # `assert int(elems[0])` did -- confirm block numbers never start at 0.
        return int(elems[0]) != 0
    except ValueError:
        return False


def _format_rates(counts, lengths):
    """Return count/length for each pair in '%.2e' notation, or '0' where the length is zero."""
    rates = []
    for count, length in zip(counts, lengths):
        if length:
            rates.append("%.2e" % (count / length))
        else:
            rates.append('0')
    return rates


def _write_block_row(block_num, inserts, deletes, lengths):
    """Write the insertion and deletion rates of one block as a tab-separated row to fout."""
    fout.write("%s\t%s\t%s\n" % (block_num,
                                 '\t'.join(_format_rates(inserts, lengths)),
                                 '\t'.join(_format_rates(deletes, lengths))))


def _report_rates_per_block(species):
    """Write one row of per-species insertion and deletion rates for every block of infile.

    species -- the three species names, in the column order of the input
    """
    # Local import: subprocess is needed only here and is not among the
    # file-level imports.
    import subprocess

    sorted_infile = tempfile.NamedTemporaryFile()
    try:
        # Sort numerically on the first column so rows of one block are
        # adjacent. An argument list keeps unusual characters in the paths
        # away from a shell, and the exit status is checked because a failed
        # sort does not raise.
        try:
            status = subprocess.call(["sort", "-n", "-k1", "-o", sorted_infile.name, infile])
        except OSError:
            status = -1
        if status != 0:
            stop_err("Encountered error while sorting the input file.")

        header = "#Block\t%s_InsRate\t%s_InsRate\t%s_InsRate\t%s_DelRate\t%s_DelRate\t%s_DelRate"
        fout.write(header % tuple(species + species) + "\n")

        prev_bnum = None
        inserts = deletes = blen = None
        for line in _iter_lines(sorted_infile.name):
            elems = line.split('\t')
            if not _is_indel_row(elems):
                continue
            new_bnum = int(elems[0])
            # Parse everything that can fail before touching the running
            # totals, so a skipped row never leaves them half updated.
            try:
                sp_ind = species.index(elems[1].split('.')[0])
                if new_bnum != prev_bnum:
                    new_blen = [int(elems[col]) for col in (6, 11, 16)]
            except ValueError:
                continue
            if new_bnum != prev_bnum:
                if prev_bnum is not None:
                    _write_block_row(prev_bnum, inserts, deletes, blen)
                inserts = [0.0, 0.0, 0.0]
                deletes = [0.0, 0.0, 0.0]
                blen = new_blen
            if elems[1].endswith('insert'):
                inserts[sp_ind] += 1
            elif elems[1].endswith('delete'):
                deletes[sp_ind] += 1
            prev_bnum = new_bnum
        # Emit the last block, unless no usable row was seen at all.
        if prev_bnum is not None:
            _write_block_row(prev_bnum, inserts, deletes, blen)
    finally:
        sorted_infile.close()


def _contained_rate(tree, chrom_key, start, end, sort_col):
    """Return events per unit of block length for *chrom_key* within [start, end], or 0."""
    global full, blk_len, blk_list
    if chrom_key not in tree.chroms:
        return 0
    # counter() accumulates into these module-level globals.
    full = 0.0
    blk_len = 0
    blk_list = []
    counter(tree.chroms[chrom_key], start, end, sort_col)
    if blk_len:
        return full / blk_len
    return 0


def _report_rates_per_interval(sort_col):
    """Append insertion and deletion rates to every interval of int_file and write it to fout.

    sort_col -- index of the start column of the species matching dbkey_i
    """
    # Text mode so that plain strings can be written to the temporary file.
    sp_file = tempfile.NamedTemporaryFile(mode='w+')
    try:
        # Copy the first contiguous run of valid rows whose second field
        # mentions the build of the interval file.
        started = False
        for line in _iter_lines(infile):
            elems = line.split('\t')
            if not _is_indel_row(elems):
                continue
            if dbkey_i in elems[1]:
                sp_file.write(line + "\n")
                started = True
            elif started:
                break
        # The file is read back by name below, so the data must be on disk.
        sp_file.flush()

        win = NiceReaderWrapper(fileinput.FileInput(int_file),
                                chrom_col=chr_col_i,
                                start_col=start_col_i,
                                end_col=end_col_i,
                                strand_col=strand_col_i,
                                fix_strand=True)

        indel = NiceReaderWrapper(fileinput.FileInput(sp_file.name),
                                  chrom_col=1,
                                  start_col=sort_col,
                                  end_col=sort_col + 1,
                                  strand_col=-1,
                                  fix_strand=True)

        indelTree = quicksect.IntervalTree()
        for item in indel:
            if type(item) is GenomicInterval:
                indelTree.insert(item, indel.linenum, item.fields)
    finally:
        sp_file.close()

    for interval in win:
        # Headers, comments and anything else that is not a plain interval
        # produce no output.
        if type(interval) is not GenomicInterval:
            continue
        chrom = interval.chrom
        start = int(interval.start)
        end = int(interval.end)
        if start > end:
            warn("Interval start after end!")
        ins_chr = "%s.%s_insert" % (dbkey_i, chrom)
        del_chr = "%s.%s_delete" % (dbkey_i, chrom)
        irate = _contained_rate(indelTree, ins_chr, start, end, sort_col)
        drate = _contained_rate(indelTree, del_chr, start, end, sort_col)
        interval.fields.append("%.2e" % irate)
        interval.fields.append("%.2e" % drate)
        fout.write("\t".join(interval.fields) + "\n")
    fout.flush()


def main():
    """Compute indel rates from the table named on the command line.

    Uses the module-level settings parsed from ``sys.argv`` (``infile``,
    ``fout``, ``int_file`` and, in interval mode, ``dbkey_i`` and the column
    indices).

    * Without an interval file (``int_file == "None"``) one row of rates is
      written per block, after which the script exits through ``sys.exit()``.
    * With an interval file every interval is written back with two extra
      columns: the insertion rate and the deletion rate.

    Raises SystemExit (through ``stop_err``) when the input is not an
    18-column indel table or does not mention the build of the interval file.
    """
    # Look at the first data line within the first 31 lines to check the
    # column count. Starting from an empty list means an empty or
    # comment-only file gets the proper error message instead of a NameError.
    first_fields = []
    for i, line in enumerate(_iter_lines(infile)):
        if len(line) > 0 and not line.startswith('#'):
            first_fields = line.split('\t')
            break
        if i == 30:
            break  # Hopefully we'll never get here...
    if len(first_fields) != 18:
        stop_err(_BAD_INPUT_MSG)

    # The first valid row decides which columns to work on.
    sort_col = None
    species = []
    for line in _iter_lines(infile):
        elems = line.split('\t')
        if not _is_indel_row(elems):
            continue
        if int_file == "None":
            species = [elems[col].split('.')[0] for col in (3, 8, 13)]
            sort_col = 0  # Based on block numbers
        elif dbkey_i in elems[3]:
            sort_col = 4
        elif dbkey_i in elems[8]:
            sort_col = 9
        elif dbkey_i in elems[13]:
            sort_col = 14
        else:
            # Called outside any try block, so the exit is no longer
            # swallowed by a bare except and the run really stops here.
            stop_err("The species build corresponding to your interval file is not present in the Indel file.")
        break
    if sort_col is None:
        # No usable row at all.
        stop_err(_BAD_INPUT_MSG)

    if int_file == "None":
        _report_rates_per_block(species)
        fout.flush()
        sys.exit()

    _report_rates_per_interval(sort_col)
---|
247 | |
---|
# Call main() only when this file is executed as a script.
if __name__ == "__main__":
    main()
---|