[2] | 1 | #!/usr/bin/env python |
---|
| 2 | """ |
---|
| 3 | Converter to generate 3 (or 4) column base-pair coverage from an interval file. |
---|
| 4 | |
---|
| 5 | usage: %prog bed_file out_file |
---|
| 6 | -1, --cols1=N,N,N,N: Columns for chrom, start, end, strand in interval file |
---|
| 7 | -2, --cols2=N,N,N,N: Columns for chrom, start, end, strand in coverage file |
---|
| 8 | """ |
---|
| 9 | import sys |
---|
| 10 | from galaxy import eggs |
---|
| 11 | import pkg_resources; pkg_resources.require( "bx-python" ) |
---|
| 12 | from bx.intervals import io |
---|
| 13 | from bx.cookbook import doc_optparse |
---|
| 14 | import psyco_full |
---|
| 15 | import commands |
---|
| 16 | import os |
---|
| 17 | from os import environ |
---|
| 18 | import tempfile |
---|
| 19 | from bisect import bisect |
---|
| 20 | |
---|
| 21 | INTERVAL_METADATA = ('chromCol', |
---|
| 22 | 'startCol', |
---|
| 23 | 'endCol', |
---|
| 24 | 'strandCol',) |
---|
| 25 | |
---|
| 26 | COVERAGE_METADATA = ('chromCol', |
---|
| 27 | 'positionCol', |
---|
| 28 | 'forwardCol', |
---|
| 29 | 'reverseCol',) |
---|
| 30 | |
---|
| 31 | def main( interval, coverage ): |
---|
| 32 | """ |
---|
| 33 | Uses a sliding window of partitions to count coverages. |
---|
| 34 | Every interval record adds its start and end to the partitions. The result |
---|
| 35 | is a list of partitions, or every position that has a (maybe) different |
---|
| 36 | number of basepairs covered. We don't worry about merging because we pop |
---|
| 37 | as the sorted intervals are read in. As the input start positions exceed |
---|
| 38 | the partition positions in partitions, coverages are kicked out in bulk. |
---|
| 39 | """ |
---|
| 40 | partitions = [] |
---|
| 41 | forward_covs = [] |
---|
| 42 | reverse_covs = [] |
---|
| 43 | offset = 0 |
---|
| 44 | chrom = None |
---|
| 45 | lastchrom = None |
---|
| 46 | for record in interval: |
---|
| 47 | chrom = record.chrom |
---|
| 48 | if lastchrom and not lastchrom == chrom and partitions: |
---|
| 49 | for partition in xrange(0, len(partitions)-1): |
---|
| 50 | forward = forward_covs[partition] |
---|
| 51 | reverse = reverse_covs[partition] |
---|
| 52 | if forward+reverse > 0: |
---|
| 53 | coverage.write(chrom=chrom, position=xrange(partitions[partition],partitions[partition+1]), |
---|
| 54 | forward=forward, reverse=reverse) |
---|
| 55 | partitions = [] |
---|
| 56 | forward_covs = [] |
---|
| 57 | reverse_covs = [] |
---|
| 58 | |
---|
| 59 | start_index = bisect(partitions, record.start) |
---|
| 60 | forward = int(record.strand == "+") |
---|
| 61 | reverse = int(record.strand == "-") |
---|
| 62 | forward_base = 0 |
---|
| 63 | reverse_base = 0 |
---|
| 64 | if start_index > 0: |
---|
| 65 | forward_base = forward_covs[start_index-1] |
---|
| 66 | reverse_base = reverse_covs[start_index-1] |
---|
| 67 | partitions.insert(start_index, record.start) |
---|
| 68 | forward_covs.insert(start_index, forward_base) |
---|
| 69 | reverse_covs.insert(start_index, reverse_base) |
---|
| 70 | end_index = bisect(partitions, record.end) |
---|
| 71 | for index in xrange(start_index, end_index): |
---|
| 72 | forward_covs[index] += forward |
---|
| 73 | reverse_covs[index] += reverse |
---|
| 74 | partitions.insert(end_index, record.end) |
---|
| 75 | forward_covs.insert(end_index, forward_covs[end_index-1] - forward ) |
---|
| 76 | reverse_covs.insert(end_index, reverse_covs[end_index-1] - reverse ) |
---|
| 77 | |
---|
| 78 | if partitions: |
---|
| 79 | for partition in xrange(0, start_index): |
---|
| 80 | forward = forward_covs[partition] |
---|
| 81 | reverse = reverse_covs[partition] |
---|
| 82 | if forward+reverse > 0: |
---|
| 83 | coverage.write(chrom=chrom, position=xrange(partitions[partition],partitions[partition+1]), |
---|
| 84 | forward=forward, reverse=reverse) |
---|
| 85 | partitions = partitions[start_index:] |
---|
| 86 | forward_covs = forward_covs[start_index:] |
---|
| 87 | reverse_covs = reverse_covs[start_index:] |
---|
| 88 | |
---|
| 89 | lastchrom = chrom |
---|
| 90 | |
---|
| 91 | # Finish the last chromosome |
---|
| 92 | if partitions: |
---|
| 93 | for partition in xrange(0, len(partitions)-1): |
---|
| 94 | forward = forward_covs[partition] |
---|
| 95 | reverse = reverse_covs[partition] |
---|
| 96 | if forward+reverse > 0: |
---|
| 97 | coverage.write(chrom=chrom, position=xrange(partitions[partition],partitions[partition+1]), |
---|
| 98 | forward=forward, reverse=reverse) |
---|
| 99 | |
---|
| 100 | class CoverageWriter( object ): |
---|
| 101 | def __init__( self, out_stream=None, chromCol=0, positionCol=1, forwardCol=2, reverseCol=3 ): |
---|
| 102 | self.out_stream = out_stream |
---|
| 103 | self.reverseCol = reverseCol |
---|
| 104 | self.nlines = 0 |
---|
| 105 | positions = {str(chromCol):'%(chrom)s', |
---|
| 106 | str(positionCol):'%(position)d', |
---|
| 107 | str(forwardCol):'%(forward)d', |
---|
| 108 | str(reverseCol):'%(reverse)d'} |
---|
| 109 | if reverseCol < 0: |
---|
| 110 | self.template = "%(0)s\t%(1)s\t%(2)s\n" % positions |
---|
| 111 | else: |
---|
| 112 | self.template = "%(0)s\t%(1)s\t%(2)s\t%(3)s\n" % positions |
---|
| 113 | |
---|
| 114 | def write(self, **kwargs ): |
---|
| 115 | if self.reverseCol < 0: kwargs['forward'] += kwargs['reverse'] |
---|
| 116 | posgen = kwargs['position'] |
---|
| 117 | for position in posgen: |
---|
| 118 | kwargs['position'] = position |
---|
| 119 | self.out_stream.write(self.template % kwargs) |
---|
| 120 | |
---|
| 121 | def close(self): |
---|
| 122 | self.out_stream.flush() |
---|
| 123 | self.out_stream.close() |
---|
| 124 | |
---|
| 125 | if __name__ == "__main__": |
---|
| 126 | options, args = doc_optparse.parse( __doc__ ) |
---|
| 127 | try: |
---|
| 128 | chr_col_1, start_col_1, end_col_1, strand_col_1 = [int(x)-1 for x in options.cols1.split(',')] |
---|
| 129 | chr_col_2, position_col_2, forward_col_2, reverse_col_2 = [int(x)-1 for x in options.cols2.split(',')] |
---|
| 130 | in_fname, out_fname = args |
---|
| 131 | except: |
---|
| 132 | doc_optparse.exception() |
---|
| 133 | |
---|
| 134 | # Sort through a tempfile first |
---|
| 135 | temp_file = tempfile.NamedTemporaryFile(mode="r") |
---|
| 136 | environ['LC_ALL'] = 'POSIX' |
---|
| 137 | commandline = "sort -f -n -k %d -k %d -k %d -o %s %s" % (chr_col_1+1,start_col_1+1,end_col_1+1, temp_file.name, in_fname) |
---|
| 138 | errorcode, stdout = commands.getstatusoutput(commandline) |
---|
| 139 | |
---|
| 140 | coverage = CoverageWriter( out_stream = open(out_fname, "a"), |
---|
| 141 | chromCol = chr_col_2, positionCol = position_col_2, |
---|
| 142 | forwardCol = forward_col_2, reverseCol = reverse_col_2, ) |
---|
| 143 | temp_file.seek(0) |
---|
| 144 | interval = io.NiceReaderWrapper( temp_file, |
---|
| 145 | chrom_col=chr_col_1, |
---|
| 146 | start_col=start_col_1, |
---|
| 147 | end_col=end_col_1, |
---|
| 148 | strand_col=strand_col_1, |
---|
| 149 | fix_strand=True ) |
---|
| 150 | main( interval, coverage ) |
---|
| 151 | temp_file.close() |
---|
| 152 | coverage.close() |
---|