[2] | 1 | #!/usr/bin/env python |
---|
| 2 | #Guruprasad Ananda |
---|
| 3 | """ |
---|
| 4 | Calculate count and coverage of one query on another, and append the Coverage and counts to |
---|
| 5 | the last four columns as bases covered, percent coverage, number of completely present features, number of partially present/overlapping features. |
---|
| 6 | |
---|
| 7 | usage: %prog bed_file_1 bed_file_2 out_file |
---|
| 8 | -1, --cols1=N,N,N,N: Columns for chr, start, end, strand in first file |
---|
| 9 | -2, --cols2=N,N,N,N: Columns for chr, start, end, strand in second file |
---|
| 10 | """ |
---|
| 11 | from galaxy import eggs |
---|
| 12 | import pkg_resources |
---|
| 13 | pkg_resources.require( "bx-python" ) |
---|
| 14 | import sys, traceback, fileinput |
---|
| 15 | from warnings import warn |
---|
| 16 | from bx.intervals.io import * |
---|
| 17 | from bx.cookbook import doc_optparse |
---|
| 18 | from bx.intervals.operations import quicksect |
---|
| 19 | from galaxy.tools.util.galaxyops import * |
---|
| 20 | |
---|
| 21 | assert sys.version_info[:2] >= ( 2, 4 ) |
---|
| 22 | |
---|
| 23 | def stop_err(msg): |
---|
| 24 | sys.stderr.write(msg) |
---|
| 25 | sys.exit() |
---|
| 26 | |
---|
| 27 | def counter(node, start, end): |
---|
| 28 | global full, partial |
---|
| 29 | if node.start <= start and node.maxend > start: |
---|
| 30 | if node.end >= end or (node.start == start and end > node.end > start): |
---|
| 31 | full += 1 |
---|
| 32 | elif end > node.end > start: |
---|
| 33 | partial += 1 |
---|
| 34 | if node.left and node.left.maxend > start: |
---|
| 35 | counter(node.left, start, end) |
---|
| 36 | if node.right: |
---|
| 37 | counter(node.right, start, end) |
---|
| 38 | elif start < node.start < end: |
---|
| 39 | if node.end <= end: |
---|
| 40 | full += 1 |
---|
| 41 | else: |
---|
| 42 | partial += 1 |
---|
| 43 | if node.left and node.left.maxend > start: |
---|
| 44 | counter(node.left, start, end) |
---|
| 45 | if node.right: |
---|
| 46 | counter(node.right, start, end) |
---|
| 47 | else: |
---|
| 48 | if node.left: |
---|
| 49 | counter(node.left, start, end) |
---|
| 50 | |
---|
| 51 | def count_coverage( readers, comments=True ): |
---|
| 52 | primary = readers[0] |
---|
| 53 | secondary = readers[1] |
---|
| 54 | secondary_copy = readers[2] |
---|
| 55 | |
---|
| 56 | rightTree = quicksect.IntervalTree() |
---|
| 57 | for item in secondary: |
---|
| 58 | if type( item ) is GenomicInterval: |
---|
| 59 | rightTree.insert( item, secondary.linenum, item.fields ) |
---|
| 60 | |
---|
| 61 | bitsets = secondary_copy.binned_bitsets() |
---|
| 62 | |
---|
| 63 | global full, partial |
---|
| 64 | |
---|
| 65 | for interval in primary: |
---|
| 66 | if type( interval ) is Header: |
---|
| 67 | yield interval |
---|
| 68 | if type( interval ) is Comment and comments: |
---|
| 69 | yield interval |
---|
| 70 | elif type( interval ) == GenomicInterval: |
---|
| 71 | chrom = interval.chrom |
---|
| 72 | start = int(interval.start) |
---|
| 73 | end = int(interval.end) |
---|
| 74 | full = 0 |
---|
| 75 | partial = 0 |
---|
| 76 | if chrom not in bitsets: |
---|
| 77 | bases_covered = 0 |
---|
| 78 | percent = 0.0 |
---|
| 79 | full = 0 |
---|
| 80 | partial = 0 |
---|
| 81 | else: |
---|
| 82 | bases_covered = bitsets[ chrom ].count_range( start, end-start ) |
---|
| 83 | if (end - start) == 0: |
---|
| 84 | percent = 0 |
---|
| 85 | else: |
---|
| 86 | percent = float(bases_covered) / float(end - start) |
---|
| 87 | if bases_covered: |
---|
| 88 | root = rightTree.chroms[chrom] #root node for the chrom tree |
---|
| 89 | counter(root, start, end) |
---|
| 90 | interval.fields.append(str(bases_covered)) |
---|
| 91 | interval.fields.append(str(percent)) |
---|
| 92 | interval.fields.append(str(full)) |
---|
| 93 | interval.fields.append(str(partial)) |
---|
| 94 | yield interval |
---|
| 95 | |
---|
| 96 | def main(): |
---|
| 97 | options, args = doc_optparse.parse( __doc__ ) |
---|
| 98 | |
---|
| 99 | try: |
---|
| 100 | chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) |
---|
| 101 | chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) |
---|
| 102 | in1_fname, in2_fname, out_fname = args |
---|
| 103 | except: |
---|
| 104 | stop_err( "Data issue: click the pencil icon in the history item to correct the metadata attributes." ) |
---|
| 105 | |
---|
| 106 | g1 = NiceReaderWrapper( fileinput.FileInput( in1_fname ), |
---|
| 107 | chrom_col=chr_col_1, |
---|
| 108 | start_col=start_col_1, |
---|
| 109 | end_col=end_col_1, |
---|
| 110 | strand_col=strand_col_1, |
---|
| 111 | fix_strand=True ) |
---|
| 112 | g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ), |
---|
| 113 | chrom_col=chr_col_2, |
---|
| 114 | start_col=start_col_2, |
---|
| 115 | end_col=end_col_2, |
---|
| 116 | strand_col=strand_col_2, |
---|
| 117 | fix_strand=True ) |
---|
| 118 | g2_copy = NiceReaderWrapper( fileinput.FileInput( in2_fname ), |
---|
| 119 | chrom_col=chr_col_2, |
---|
| 120 | start_col=start_col_2, |
---|
| 121 | end_col=end_col_2, |
---|
| 122 | strand_col=strand_col_2, |
---|
| 123 | fix_strand=True ) |
---|
| 124 | |
---|
| 125 | |
---|
| 126 | out_file = open( out_fname, "w" ) |
---|
| 127 | |
---|
| 128 | try: |
---|
| 129 | for line in count_coverage([g1,g2,g2_copy]): |
---|
| 130 | if type( line ) is GenomicInterval: |
---|
| 131 | out_file.write( "%s\n" % "\t".join( line.fields ) ) |
---|
| 132 | else: |
---|
| 133 | out_file.write( "%s\n" % line ) |
---|
| 134 | except ParseError, exc: |
---|
| 135 | out_file.close() |
---|
| 136 | fail( str( exc ) ) |
---|
| 137 | |
---|
| 138 | out_file.close() |
---|
| 139 | |
---|
| 140 | if g1.skipped > 0: |
---|
| 141 | print skipped( g1, filedesc=" of 1st dataset" ) |
---|
| 142 | if g2.skipped > 0: |
---|
| 143 | print skipped( g2, filedesc=" of 2nd dataset" ) |
---|
| 144 | elif g2_copy.skipped > 0: |
---|
| 145 | print skipped( g2_copy, filedesc=" of 2nd dataset" ) |
---|
| 146 | |
---|
| 147 | if __name__ == "__main__": |
---|
| 148 | main() |
---|