#!/usr/bin/env python
#Guruprasad Ananda

| 4 | from galaxy import eggs |
---|
| 5 | import pkg_resources |
---|
| 6 | pkg_resources.require( "bx-python" ) |
---|
| 7 | |
---|
| 8 | import sys, os, tempfile |
---|
| 9 | import traceback |
---|
| 10 | import fileinput |
---|
| 11 | from warnings import warn |
---|
| 12 | |
---|
| 13 | from galaxy.tools.util.galaxyops import * |
---|
| 14 | from bx.intervals.io import * |
---|
| 15 | |
---|
| 16 | from bx.intervals.operations import quicksect |
---|
| 17 | |
---|
def stop_err(msg):
    """Write *msg* to standard error and terminate the script.

    The message is written exactly as given (no newline is appended).
    The process exits with status 1 so that callers inspecting the exit
    code see a failure; a bare ``sys.exit()`` would report success.

    Raises:
        SystemExit: always.
    """
    sys.stderr.write(msg)
    sys.exit(1)

def counter(node, start, end, sort_col):
    """Tally the indels under *node* that lie entirely inside [start, end].

    Results are accumulated in module-level globals, which the caller must
    initialise before the call:

    * ``full``     - incremented once per contained indel.
    * ``blk_list`` - identifiers (``node.other[0]``) of the alignment blocks
      already seen, so that each block is measured only once.
    * ``blk_len``  - sum of ``int(node.other[sort_col + 2])`` over the
      distinct blocks in ``blk_list``.

    Args:
        node: tree node exposing ``start``, ``end``, ``left``, ``right``,
            ``maxend`` and ``other`` (the original row fields).
        start, end: inclusive bounds of the query window.
        sort_col: index of the start column used to build the tree; the
            block length is read two columns after it.

    NOTE(review): the pruning assumes the tree is ordered by ``start``
    (left subtree starts no later than the node, right subtree no earlier),
    as in bx-python's quicksect - confirm against the installed version.
    """
    global full, blk_len, blk_list
    if node.start < start:
        # Node begins before the window: only later-starting nodes can fit.
        if node.right:
            counter(node.right, start, end, sort_col)
    elif node.start > end:
        # Node begins after the window: only earlier-starting nodes can fit.
        if node.left:
            counter(node.left, start, end, sort_col)
    else:
        # Node begins inside the window.  It is counted only when it also
        # ends inside the window, but its children are searched either way;
        # previously a node overhanging `end` cut off both of its subtrees.
        if start <= node.end <= end:
            full += 1
            block_id = node.other[0]
            if block_id not in blk_list:
                blk_list.append(block_id)
                blk_len += int(node.other[sort_col + 2])
        if node.left and node.left.maxend > start:
            counter(node.left, start, end, sort_col)
        if node.right:
            counter(node.right, start, end, sort_col)


# Command-line arguments, read at module level and consumed by main():
#   argv[1]  indel table to analyse
#   argv[2]  output file (opened for writing here)
#   argv[3]  interval file, or the literal string "None" when absent
#   argv[4]  build (dbkey) of the interval file     - interval mode only
#   argv[5]  column specification for parse_cols_arg - interval mode only
infile = sys.argv[1]
fout = open(sys.argv[2], 'w')
int_file = sys.argv[3]
if int_file != "None":  # User has specified an interval file
    try:
        # Opened only to verify that the file is readable; main() re-opens
        # it by name, so the probe handle is closed straight away.
        fint = open(int_file, 'r')
        fint.close()
    except (IOError, OSError):
        stop_err("Unable to open input Interval file")
    try:
        dbkey_i = sys.argv[4]
        chr_col_i, start_col_i, end_col_i, strand_col_i = parse_cols_arg(sys.argv[5])
    except Exception:
        # Kept separate from the open() failure above so the message names
        # the real problem (missing argument or unparsable column list).
        stop_err("Unable to read the build or column settings for the input Interval file")

_FORMAT_ERROR = "This tool only works on tabular data output by 'Fetch Indels from 3-way alignments' tool. The data in your input dataset is either missing or not formatted properly."


def _split_indel_line(line):
    """Return the 18 tab-separated fields of an indel data line, else None.

    A data line has exactly 18 fields and an integer block number in the
    first field.
    NOTE(review): block number 0 is rejected, matching the historical
    ``assert int(elems[0])`` check - confirm that blocks are numbered from 1.
    """
    elems = line.rstrip('\r\n').split('\t')
    if len(elems) != 18:
        return None
    try:
        block_num = int(elems[0])
    except ValueError:
        return None
    if block_num == 0:
        return None
    return elems


def _has_indel_layout(path):
    """Check that the first non-empty, non-comment line has 18 columns.

    Only the first 31 lines are inspected; an empty file or one with no
    data line in that range yields False.
    """
    handle = open(path, 'r')
    try:
        for i, line in enumerate(handle):
            line = line.rstrip('\r\n')
            if line and not line.startswith('#'):
                return len(line.split('\t')) == 18
            if i == 30:
                break
    finally:
        handle.close()
    return False


def _first_data_fields(path):
    """Return the fields of the first valid indel line in *path*, or None."""
    handle = open(path, 'r')
    try:
        for line in handle:
            elems = _split_indel_line(line)
            if elems is not None:
                return elems
    finally:
        handle.close()
    return None


def _format_rates(counts, lengths):
    """Format count/length pairs as '%.2e' strings ('0' for a zero length)."""
    rates = []
    for count, length in zip(counts, lengths):
        if length:
            rates.append("%.2e" % (count / length))
        else:
            rates.append('0')
    return rates


def _write_block_rates(out, path, species):
    """Write per-block insertion and deletion rates for the three species.

    Rows are grouped by block number in-process and emitted in ascending
    block order.  Block lengths (columns 6, 11 and 16) are taken from the
    first accepted row of each block.  Rows that are malformed, or whose
    species prefix in column 1 is not one of *species*, are skipped.
    """
    blocks = {}  # block number -> (lengths, insert counts, delete counts)
    handle = open(path, 'r')
    try:
        for line in handle:
            elems = _split_indel_line(line)
            if elems is None:
                continue
            kind = elems[1]
            line_sp = kind.split('.')[0]
            if line_sp not in species:
                continue
            bnum = int(elems[0])
            if bnum not in blocks:
                try:
                    lengths = [int(elems[6]), int(elems[11]), int(elems[16])]
                except ValueError:
                    continue
                blocks[bnum] = (lengths, [0.0, 0.0, 0.0], [0.0, 0.0, 0.0])
            lengths, inserts, deletes = blocks[bnum]
            sp_ind = species.index(line_sp)
            if kind.endswith('insert'):
                inserts[sp_ind] += 1
            elif kind.endswith('delete'):
                deletes[sp_ind] += 1
    finally:
        handle.close()

    out.write("#Block\t%s_InsRate\t%s_InsRate\t%s_InsRate\t%s_DelRate\t%s_DelRate\t%s_DelRate\n" % (species[0], species[1], species[2], species[0], species[1], species[2]))
    for bnum in sorted(blocks):
        lengths, inserts, deletes = blocks[bnum]
        irate = _format_rates(inserts, lengths)
        drate = _format_rates(deletes, lengths)
        out.write("%s\t%s\t%s\n" % (bnum, '\t'.join(irate), '\t'.join(drate)))


def _copy_species_run(path, dbkey, dest):
    """Copy the first contiguous run of rows for *dbkey* from *path* to *dest*.

    A row belongs to the run when *dbkey* occurs in its second column.
    Malformed rows are ignored; copying stops at the first valid row for
    another build once the run has started.
    """
    started = False
    handle = open(path, 'r')
    try:
        for line in handle:
            elems = _split_indel_line(line)
            if elems is None:
                continue
            if dbkey in elems[1]:
                dest.write(line.rstrip('\r\n') + "\n")
                started = True
            elif started:
                break
    finally:
        handle.close()


def main():
    """Compute indel rates from a 3-way indel table.

    Reads the module-level settings ``infile``, ``fout`` and ``int_file``
    (plus ``dbkey_i`` and the ``*_col_i`` values in interval mode).

    * Without an interval file (``int_file == "None"``) one row per
      alignment block is written: the block number followed by insertion
      and deletion rates for each of the three species.
    * With an interval file every interval is echoed with two extra
      columns: the insertion and the deletion rate of the indels that lie
      entirely inside the interval, normalised by the total length of the
      alignment blocks those indels belong to.

    Terminates through ``stop_err`` when the input is missing, is not an
    18-column indel table, or does not contain the interval file's build.
    """
    global full, blk_len, blk_list

    if not _has_indel_layout(infile):
        stop_err(_FORMAT_ERROR)
    first = _first_data_fields(infile)
    if first is None:
        stop_err(_FORMAT_ERROR)

    if int_file == "None":
        # Species names are the build prefixes of the three source columns.
        species = [first[3].split('.')[0],
                   first[8].split('.')[0],
                   first[13].split('.')[0]]
        _write_block_rates(fout, infile, species)
        fout.flush()
        return

    # Pick the coordinate columns of the species matching the interval build.
    if dbkey_i in first[3]:
        sort_col = 4
    elif dbkey_i in first[8]:
        sort_col = 9
    elif dbkey_i in first[13]:
        sort_col = 14
    else:
        stop_err("The species build corresponding to your interval file is not present in the Indel file.")

    win = NiceReaderWrapper(fileinput.FileInput(int_file),
                            chrom_col=chr_col_i,
                            start_col=start_col_i,
                            end_col=end_col_i,
                            strand_col=strand_col_i,
                            fix_strand=True)

    indelTree = quicksect.IntervalTree()
    sp_file = tempfile.NamedTemporaryFile(mode='w+')
    try:
        _copy_species_run(infile, dbkey_i, sp_file)
        # The reader below re-opens the file by name, so push the data out.
        sp_file.flush()
        indel = NiceReaderWrapper(fileinput.FileInput(sp_file.name),
                                  chrom_col=1,
                                  start_col=sort_col,
                                  end_col=sort_col + 1,
                                  strand_col=-1,
                                  fix_strand=True)
        for item in indel:
            if isinstance(item, GenomicInterval):
                indelTree.insert(item, indel.linenum, item.fields)
    finally:
        sp_file.close()

    for interval in win:
        if not isinstance(interval, GenomicInterval):
            continue  # headers and comments are not echoed
        chrom = interval.chrom
        start = int(interval.start)
        end = int(interval.end)
        if start > end:
            warn("Interval start after end!")
        for suffix in ("insert", "delete"):
            tree_key = "%s.%s_%s" % (dbkey_i, chrom, suffix)
            rate = 0
            if tree_key in indelTree.chroms:
                # counter() reports through these module-level globals.
                full = 0.0
                blk_len = 0
                blk_list = []
                counter(indelTree.chroms[tree_key], start, end, sort_col)
                if blk_len:
                    rate = full / blk_len
            interval.fields.append("%.2e" % rate)
        fout.write("\t".join(interval.fields) + "\n")
    fout.flush()


if __name__ == "__main__":
    main()