1 | """ |
---|
2 | Compute the intersection of two sets of genomic intervals, either base-by-base |
---|
3 | or at the interval level. The returned GenomicIntervalReader will be in |
---|
4 | the order of the first set of intervals passed in, with the corresponding |
---|
5 | additional fields. |
---|
6 | """ |
---|
7 | |
---|
8 | import traceback |
---|
9 | import fileinput |
---|
10 | from warnings import warn |
---|
11 | |
---|
12 | from bx.intervals.io import * |
---|
13 | from bx.intervals.operations import * |
---|
14 | |
---|
15 | def intersect(readers, mincols=1, upstream_pad=0, downstream_pad=0, pieces=True, lens={}, comments=True): |
---|
16 | # The incoming lens dictionary is a dictionary of chromosome lengths which are used to initialize the bitsets. |
---|
17 | # Read all but first into bitsets and intersect to one |
---|
18 | primary = readers[0] |
---|
19 | intersect = readers[1:] |
---|
20 | # Handle any ValueError, IndexError and OverflowError exceptions that may be thrown when |
---|
21 | # the bitsets are being created by skipping the problem lines |
---|
22 | intersect[0] = BitsetSafeReaderWrapper( intersect[0], lens=lens ) |
---|
23 | bitsets = intersect[0].binned_bitsets( upstream_pad=upstream_pad, downstream_pad=downstream_pad, lens=lens ) |
---|
24 | intersect = intersect[1:] |
---|
25 | for andset in intersect: |
---|
26 | bitset2 = andset.binned_bitsets(upstream_pad = upstream_pad, downstream_pad = downstream_pad, lens = lens) |
---|
27 | for chrom in bitsets: |
---|
28 | if chrom not in bitset2: continue |
---|
29 | bitsets[chrom].iand(bitset2[chrom]) |
---|
30 | intersect = intersect[1:] |
---|
31 | |
---|
32 | # Read remaining intervals and intersect |
---|
33 | for interval in primary: |
---|
34 | if type( interval ) is Header: |
---|
35 | yield interval |
---|
36 | if type( interval ) is Comment and comments: |
---|
37 | yield interval |
---|
38 | elif type( interval ) == GenomicInterval: |
---|
39 | chrom = interval.chrom |
---|
40 | start = int( interval.start ) |
---|
41 | end = int( interval.end ) |
---|
42 | if chrom not in bitsets: |
---|
43 | continue |
---|
44 | if start > end: |
---|
45 | try: |
---|
46 | # This will only work if primary is a NiceReaderWrapper |
---|
47 | primary.skipped += 1 |
---|
48 | # no reason to stuff an entire bad file into memmory |
---|
49 | if primary.skipped < 10: |
---|
50 | primary.skipped_lines.append( ( primary.linenum, primary.current_line, "Interval start after end!" ) ) |
---|
51 | except: |
---|
52 | pass |
---|
53 | continue |
---|
54 | out_intervals = [] |
---|
55 | # Intersect or Overlap |
---|
56 | try: |
---|
57 | if bitsets[ chrom ].count_range( start, end-start ) >= mincols: |
---|
58 | if pieces: |
---|
59 | out_intervals = bits_set_in_range( bitsets[chrom], start, end ) |
---|
60 | else: |
---|
61 | out_intervals = [ ( start, end ) ] |
---|
62 | # Write the intervals |
---|
63 | for start, end in out_intervals: |
---|
64 | new_interval = interval.copy() |
---|
65 | new_interval.start = start |
---|
66 | new_interval.end = end |
---|
67 | yield new_interval |
---|
68 | except IndexError, e: |
---|
69 | try: |
---|
70 | # This will only work if primary is a NiceReaderWrapper |
---|
71 | primary.skipped += 1 |
---|
72 | # no reason to stuff an entire bad file into memmory |
---|
73 | if primary.skipped < 10: |
---|
74 | primary.skipped_lines.append( ( primary.linenum, primary.current_line, str( e ) ) ) |
---|
75 | except: |
---|
76 | pass |
---|
77 | continue |
---|