1 | #!/usr/bin/env python |
---|
2 | """ |
---|
3 | Subtract one set of genomic intervals from another (base-by-base or whole |
---|
4 | intervals). The returned GenomicIntervals will be in the order |
---|
5 | of the first set of intervals passed in, with the corresponding |
---|
6 | meta-data. |
---|
7 | """ |
---|
8 | |
---|
9 | import traceback |
---|
10 | import fileinput |
---|
11 | from warnings import warn |
---|
12 | |
---|
13 | from bx.intervals.io import * |
---|
14 | from bx.intervals.operations import * |
---|
15 | |
---|
16 | def subtract(readers, mincols=1, upstream_pad=0, downstream_pad=0, pieces=True, lens={}, comments=True): |
---|
17 | # The incoming lens dictionary is a dictionary of chromosome lengths which are used to initialize the bitsets. |
---|
18 | # Read all but first into bitsets and union to one (if confused, read DeMorgan's...) |
---|
19 | primary = readers[0] |
---|
20 | union = readers[1:] |
---|
21 | # Handle any ValueError, IndexError and OverflowError exceptions that may be thrown when |
---|
22 | # the bitsets are being created by skipping the problem lines |
---|
23 | union[0] = BitsetSafeReaderWrapper( union[0], lens=lens ) |
---|
24 | bitsets = union[0].binned_bitsets( upstream_pad=upstream_pad, downstream_pad=downstream_pad, lens=lens ) |
---|
25 | union = union[1:] |
---|
26 | for andset in union: |
---|
27 | bitset2 = andset.binned_bitsets(upstream_pad = upstream_pad, downstream_pad = downstream_pad, lens = lens) |
---|
28 | for chrom in bitset2: |
---|
29 | if chrom not in bitsets: |
---|
30 | bitsets[chrom] = bitset2[chrom] |
---|
31 | else: |
---|
32 | bitsets[chrom].ior(bitset2[chrom]) |
---|
33 | |
---|
34 | # Read remaining intervals and subtract |
---|
35 | for interval in primary: |
---|
36 | if type( interval ) is Header: |
---|
37 | yield interval |
---|
38 | if type( interval ) is Comment and comments: |
---|
39 | yield interval |
---|
40 | elif type( interval ) == GenomicInterval: |
---|
41 | chrom = interval.chrom |
---|
42 | if chrom not in bitsets: |
---|
43 | yield interval |
---|
44 | else: |
---|
45 | start = int(interval.start) |
---|
46 | end = int(interval.end) |
---|
47 | if start > end: warn( "Interval start after end!" ) |
---|
48 | out_intervals = [] |
---|
49 | # Find the intervals that meet the criteria (for the three sensible |
---|
50 | # permutations of reverse and pieces) |
---|
51 | try: |
---|
52 | if bitsets[ chrom ].count_range( start, end-start ) >= mincols: |
---|
53 | if pieces: |
---|
54 | out_intervals = bits_clear_in_range( bitsets[chrom], start, end ) |
---|
55 | else: |
---|
56 | out_intervals = [ ( start, end ) ] |
---|
57 | # Write the intervals |
---|
58 | for start, end in out_intervals: |
---|
59 | new_interval = interval.copy() |
---|
60 | new_interval.start = start |
---|
61 | new_interval.end = end |
---|
62 | yield new_interval |
---|
63 | except IndexError, e: |
---|
64 | try: |
---|
65 | # This will work only if primary is a NiceReaderWrapper |
---|
66 | primary.skipped += 1 |
---|
67 | # no reason to stuff an entire bad file into memmory |
---|
68 | if primary.skipped < 10: |
---|
69 | primary.skipped_lines.append( ( primary.linenum, primary.current_line, str( e ) ) ) |
---|
70 | except: |
---|
71 | pass |
---|
72 | continue |
---|