1 | #!/usr/bin/env python |
---|
2 | #Guruprasad Ananda |
---|
3 | """ |
---|
4 | Calculate count and coverage of one query on another, and append the Coverage and counts to |
---|
5 | the last four columns as bases covered, percent coverage, number of completely present features, number of partially present/overlapping features. |
---|
6 | |
---|
7 | usage: %prog bed_file_1 bed_file_2 out_file |
---|
8 | -1, --cols1=N,N,N,N: Columns for chr, start, end, strand in first file |
---|
9 | -2, --cols2=N,N,N,N: Columns for chr, start, end, strand in second file |
---|
10 | """ |
---|
11 | from galaxy import eggs |
---|
12 | import pkg_resources |
---|
13 | pkg_resources.require( "bx-python" ) |
---|
14 | import sys, traceback, fileinput |
---|
15 | from warnings import warn |
---|
16 | from bx.intervals.io import * |
---|
17 | from bx.cookbook import doc_optparse |
---|
18 | from bx.intervals.operations import quicksect |
---|
19 | from galaxy.tools.util.galaxyops import * |
---|
20 | |
---|
21 | assert sys.version_info[:2] >= ( 2, 4 ) |
---|
22 | |
---|
23 | def stop_err(msg): |
---|
24 | sys.stderr.write(msg) |
---|
25 | sys.exit() |
---|
26 | |
---|
27 | def counter(node, start, end): |
---|
28 | global full, partial |
---|
29 | if node.start <= start and node.maxend > start: |
---|
30 | if node.end >= end or (node.start == start and end > node.end > start): |
---|
31 | full += 1 |
---|
32 | elif end > node.end > start: |
---|
33 | partial += 1 |
---|
34 | if node.left and node.left.maxend > start: |
---|
35 | counter(node.left, start, end) |
---|
36 | if node.right: |
---|
37 | counter(node.right, start, end) |
---|
38 | elif start < node.start < end: |
---|
39 | if node.end <= end: |
---|
40 | full += 1 |
---|
41 | else: |
---|
42 | partial += 1 |
---|
43 | if node.left and node.left.maxend > start: |
---|
44 | counter(node.left, start, end) |
---|
45 | if node.right: |
---|
46 | counter(node.right, start, end) |
---|
47 | else: |
---|
48 | if node.left: |
---|
49 | counter(node.left, start, end) |
---|
50 | |
---|
51 | def count_coverage( readers, comments=True ): |
---|
52 | primary = readers[0] |
---|
53 | secondary = readers[1] |
---|
54 | secondary_copy = readers[2] |
---|
55 | |
---|
56 | rightTree = quicksect.IntervalTree() |
---|
57 | for item in secondary: |
---|
58 | if type( item ) is GenomicInterval: |
---|
59 | rightTree.insert( item, secondary.linenum, item.fields ) |
---|
60 | |
---|
61 | bitsets = secondary_copy.binned_bitsets() |
---|
62 | |
---|
63 | global full, partial |
---|
64 | |
---|
65 | for interval in primary: |
---|
66 | if type( interval ) is Header: |
---|
67 | yield interval |
---|
68 | if type( interval ) is Comment and comments: |
---|
69 | yield interval |
---|
70 | elif type( interval ) == GenomicInterval: |
---|
71 | chrom = interval.chrom |
---|
72 | start = int(interval.start) |
---|
73 | end = int(interval.end) |
---|
74 | full = 0 |
---|
75 | partial = 0 |
---|
76 | if chrom not in bitsets: |
---|
77 | bases_covered = 0 |
---|
78 | percent = 0.0 |
---|
79 | full = 0 |
---|
80 | partial = 0 |
---|
81 | else: |
---|
82 | bases_covered = bitsets[ chrom ].count_range( start, end-start ) |
---|
83 | if (end - start) == 0: |
---|
84 | percent = 0 |
---|
85 | else: |
---|
86 | percent = float(bases_covered) / float(end - start) |
---|
87 | if bases_covered: |
---|
88 | root = rightTree.chroms[chrom] #root node for the chrom tree |
---|
89 | counter(root, start, end) |
---|
90 | interval.fields.append(str(bases_covered)) |
---|
91 | interval.fields.append(str(percent)) |
---|
92 | interval.fields.append(str(full)) |
---|
93 | interval.fields.append(str(partial)) |
---|
94 | yield interval |
---|
95 | |
---|
96 | def main(): |
---|
97 | options, args = doc_optparse.parse( __doc__ ) |
---|
98 | |
---|
99 | try: |
---|
100 | chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) |
---|
101 | chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) |
---|
102 | in1_fname, in2_fname, out_fname = args |
---|
103 | except: |
---|
104 | stop_err( "Data issue: click the pencil icon in the history item to correct the metadata attributes." ) |
---|
105 | |
---|
106 | g1 = NiceReaderWrapper( fileinput.FileInput( in1_fname ), |
---|
107 | chrom_col=chr_col_1, |
---|
108 | start_col=start_col_1, |
---|
109 | end_col=end_col_1, |
---|
110 | strand_col=strand_col_1, |
---|
111 | fix_strand=True ) |
---|
112 | g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ), |
---|
113 | chrom_col=chr_col_2, |
---|
114 | start_col=start_col_2, |
---|
115 | end_col=end_col_2, |
---|
116 | strand_col=strand_col_2, |
---|
117 | fix_strand=True ) |
---|
118 | g2_copy = NiceReaderWrapper( fileinput.FileInput( in2_fname ), |
---|
119 | chrom_col=chr_col_2, |
---|
120 | start_col=start_col_2, |
---|
121 | end_col=end_col_2, |
---|
122 | strand_col=strand_col_2, |
---|
123 | fix_strand=True ) |
---|
124 | |
---|
125 | |
---|
126 | out_file = open( out_fname, "w" ) |
---|
127 | |
---|
128 | try: |
---|
129 | for line in count_coverage([g1,g2,g2_copy]): |
---|
130 | if type( line ) is GenomicInterval: |
---|
131 | out_file.write( "%s\n" % "\t".join( line.fields ) ) |
---|
132 | else: |
---|
133 | out_file.write( "%s\n" % line ) |
---|
134 | except ParseError, exc: |
---|
135 | out_file.close() |
---|
136 | fail( str( exc ) ) |
---|
137 | |
---|
138 | out_file.close() |
---|
139 | |
---|
140 | if g1.skipped > 0: |
---|
141 | print skipped( g1, filedesc=" of 1st dataset" ) |
---|
142 | if g2.skipped > 0: |
---|
143 | print skipped( g2, filedesc=" of 2nd dataset" ) |
---|
144 | elif g2_copy.skipped > 0: |
---|
145 | print skipped( g2_copy, filedesc=" of 2nd dataset" ) |
---|
146 | |
---|
147 | if __name__ == "__main__": |
---|
148 | main() |
---|