1 | #!/usr/bin/env python |
---|
2 | """ |
---|
3 | Converter to generate 3 (or 4) column base-pair coverage from an interval file. |
---|
4 | |
---|
5 | usage: %prog bed_file out_file |
---|
6 | -1, --cols1=N,N,N,N: Columns for chrom, start, end, strand in interval file |
---|
7 | -2, --cols2=N,N,N,N: Columns for chrom, start, end, strand in coverage file |
---|
8 | """ |
---|
9 | import sys |
---|
10 | from galaxy import eggs |
---|
11 | import pkg_resources; pkg_resources.require( "bx-python" ) |
---|
12 | from bx.intervals import io |
---|
13 | from bx.cookbook import doc_optparse |
---|
14 | import psyco_full |
---|
15 | import commands |
---|
16 | import os |
---|
17 | from os import environ |
---|
18 | import tempfile |
---|
19 | from bisect import bisect |
---|
20 | |
---|
21 | INTERVAL_METADATA = ('chromCol', |
---|
22 | 'startCol', |
---|
23 | 'endCol', |
---|
24 | 'strandCol',) |
---|
25 | |
---|
26 | COVERAGE_METADATA = ('chromCol', |
---|
27 | 'positionCol', |
---|
28 | 'forwardCol', |
---|
29 | 'reverseCol',) |
---|
30 | |
---|
31 | def main( interval, coverage ): |
---|
32 | """ |
---|
33 | Uses a sliding window of partitions to count coverages. |
---|
34 | Every interval record adds its start and end to the partitions. The result |
---|
35 | is a list of partitions, or every position that has a (maybe) different |
---|
36 | number of basepairs covered. We don't worry about merging because we pop |
---|
37 | as the sorted intervals are read in. As the input start positions exceed |
---|
38 | the partition positions in partitions, coverages are kicked out in bulk. |
---|
39 | """ |
---|
40 | partitions = [] |
---|
41 | forward_covs = [] |
---|
42 | reverse_covs = [] |
---|
43 | offset = 0 |
---|
44 | chrom = None |
---|
45 | lastchrom = None |
---|
46 | for record in interval: |
---|
47 | chrom = record.chrom |
---|
48 | if lastchrom and not lastchrom == chrom and partitions: |
---|
49 | for partition in xrange(0, len(partitions)-1): |
---|
50 | forward = forward_covs[partition] |
---|
51 | reverse = reverse_covs[partition] |
---|
52 | if forward+reverse > 0: |
---|
53 | coverage.write(chrom=chrom, position=xrange(partitions[partition],partitions[partition+1]), |
---|
54 | forward=forward, reverse=reverse) |
---|
55 | partitions = [] |
---|
56 | forward_covs = [] |
---|
57 | reverse_covs = [] |
---|
58 | |
---|
59 | start_index = bisect(partitions, record.start) |
---|
60 | forward = int(record.strand == "+") |
---|
61 | reverse = int(record.strand == "-") |
---|
62 | forward_base = 0 |
---|
63 | reverse_base = 0 |
---|
64 | if start_index > 0: |
---|
65 | forward_base = forward_covs[start_index-1] |
---|
66 | reverse_base = reverse_covs[start_index-1] |
---|
67 | partitions.insert(start_index, record.start) |
---|
68 | forward_covs.insert(start_index, forward_base) |
---|
69 | reverse_covs.insert(start_index, reverse_base) |
---|
70 | end_index = bisect(partitions, record.end) |
---|
71 | for index in xrange(start_index, end_index): |
---|
72 | forward_covs[index] += forward |
---|
73 | reverse_covs[index] += reverse |
---|
74 | partitions.insert(end_index, record.end) |
---|
75 | forward_covs.insert(end_index, forward_covs[end_index-1] - forward ) |
---|
76 | reverse_covs.insert(end_index, reverse_covs[end_index-1] - reverse ) |
---|
77 | |
---|
78 | if partitions: |
---|
79 | for partition in xrange(0, start_index): |
---|
80 | forward = forward_covs[partition] |
---|
81 | reverse = reverse_covs[partition] |
---|
82 | if forward+reverse > 0: |
---|
83 | coverage.write(chrom=chrom, position=xrange(partitions[partition],partitions[partition+1]), |
---|
84 | forward=forward, reverse=reverse) |
---|
85 | partitions = partitions[start_index:] |
---|
86 | forward_covs = forward_covs[start_index:] |
---|
87 | reverse_covs = reverse_covs[start_index:] |
---|
88 | |
---|
89 | lastchrom = chrom |
---|
90 | |
---|
91 | # Finish the last chromosome |
---|
92 | if partitions: |
---|
93 | for partition in xrange(0, len(partitions)-1): |
---|
94 | forward = forward_covs[partition] |
---|
95 | reverse = reverse_covs[partition] |
---|
96 | if forward+reverse > 0: |
---|
97 | coverage.write(chrom=chrom, position=xrange(partitions[partition],partitions[partition+1]), |
---|
98 | forward=forward, reverse=reverse) |
---|
99 | |
---|
100 | class CoverageWriter( object ): |
---|
101 | def __init__( self, out_stream=None, chromCol=0, positionCol=1, forwardCol=2, reverseCol=3 ): |
---|
102 | self.out_stream = out_stream |
---|
103 | self.reverseCol = reverseCol |
---|
104 | self.nlines = 0 |
---|
105 | positions = {str(chromCol):'%(chrom)s', |
---|
106 | str(positionCol):'%(position)d', |
---|
107 | str(forwardCol):'%(forward)d', |
---|
108 | str(reverseCol):'%(reverse)d'} |
---|
109 | if reverseCol < 0: |
---|
110 | self.template = "%(0)s\t%(1)s\t%(2)s\n" % positions |
---|
111 | else: |
---|
112 | self.template = "%(0)s\t%(1)s\t%(2)s\t%(3)s\n" % positions |
---|
113 | |
---|
114 | def write(self, **kwargs ): |
---|
115 | if self.reverseCol < 0: kwargs['forward'] += kwargs['reverse'] |
---|
116 | posgen = kwargs['position'] |
---|
117 | for position in posgen: |
---|
118 | kwargs['position'] = position |
---|
119 | self.out_stream.write(self.template % kwargs) |
---|
120 | |
---|
121 | def close(self): |
---|
122 | self.out_stream.flush() |
---|
123 | self.out_stream.close() |
---|
124 | |
---|
125 | if __name__ == "__main__": |
---|
126 | options, args = doc_optparse.parse( __doc__ ) |
---|
127 | try: |
---|
128 | chr_col_1, start_col_1, end_col_1, strand_col_1 = [int(x)-1 for x in options.cols1.split(',')] |
---|
129 | chr_col_2, position_col_2, forward_col_2, reverse_col_2 = [int(x)-1 for x in options.cols2.split(',')] |
---|
130 | in_fname, out_fname = args |
---|
131 | except: |
---|
132 | doc_optparse.exception() |
---|
133 | |
---|
134 | # Sort through a tempfile first |
---|
135 | temp_file = tempfile.NamedTemporaryFile(mode="r") |
---|
136 | environ['LC_ALL'] = 'POSIX' |
---|
137 | commandline = "sort -f -n -k %d -k %d -k %d -o %s %s" % (chr_col_1+1,start_col_1+1,end_col_1+1, temp_file.name, in_fname) |
---|
138 | errorcode, stdout = commands.getstatusoutput(commandline) |
---|
139 | |
---|
140 | coverage = CoverageWriter( out_stream = open(out_fname, "a"), |
---|
141 | chromCol = chr_col_2, positionCol = position_col_2, |
---|
142 | forwardCol = forward_col_2, reverseCol = reverse_col_2, ) |
---|
143 | temp_file.seek(0) |
---|
144 | interval = io.NiceReaderWrapper( temp_file, |
---|
145 | chrom_col=chr_col_1, |
---|
146 | start_col=start_col_1, |
---|
147 | end_col=end_col_1, |
---|
148 | strand_col=strand_col_1, |
---|
149 | fix_strand=True ) |
---|
150 | main( interval, coverage ) |
---|
151 | temp_file.close() |
---|
152 | coverage.close() |
---|