[3] | 1 | """ |
---|
| 2 | Compute the intersection of two sets of genomic intervals, either base-by-base |
---|
| 3 | or at the interval level. The returned GenomicIntervalReader will be in |
---|
| 4 | the order of the first set of intervals passed in, with the corresponding |
---|
| 5 | additional fields. |
---|
| 6 | """ |
---|
| 7 | |
---|
| 8 | import traceback |
---|
| 9 | import fileinput |
---|
| 10 | from warnings import warn |
---|
| 11 | |
---|
| 12 | from bx.intervals.io import * |
---|
| 13 | from bx.intervals.operations import * |
---|
| 14 | |
---|
| 15 | def intersect(readers, mincols=1, upstream_pad=0, downstream_pad=0, pieces=True, lens={}, comments=True): |
---|
| 16 | # The incoming lens dictionary is a dictionary of chromosome lengths which are used to initialize the bitsets. |
---|
| 17 | # Read all but first into bitsets and intersect to one |
---|
| 18 | primary = readers[0] |
---|
| 19 | intersect = readers[1:] |
---|
| 20 | # Handle any ValueError, IndexError and OverflowError exceptions that may be thrown when |
---|
| 21 | # the bitsets are being created by skipping the problem lines |
---|
| 22 | intersect[0] = BitsetSafeReaderWrapper( intersect[0], lens=lens ) |
---|
| 23 | bitsets = intersect[0].binned_bitsets( upstream_pad=upstream_pad, downstream_pad=downstream_pad, lens=lens ) |
---|
| 24 | intersect = intersect[1:] |
---|
| 25 | for andset in intersect: |
---|
| 26 | bitset2 = andset.binned_bitsets(upstream_pad = upstream_pad, downstream_pad = downstream_pad, lens = lens) |
---|
| 27 | for chrom in bitsets: |
---|
| 28 | if chrom not in bitset2: continue |
---|
| 29 | bitsets[chrom].iand(bitset2[chrom]) |
---|
| 30 | intersect = intersect[1:] |
---|
| 31 | |
---|
| 32 | # Read remaining intervals and intersect |
---|
| 33 | for interval in primary: |
---|
| 34 | if type( interval ) is Header: |
---|
| 35 | yield interval |
---|
| 36 | if type( interval ) is Comment and comments: |
---|
| 37 | yield interval |
---|
| 38 | elif type( interval ) == GenomicInterval: |
---|
| 39 | chrom = interval.chrom |
---|
| 40 | start = int( interval.start ) |
---|
| 41 | end = int( interval.end ) |
---|
| 42 | if chrom not in bitsets: |
---|
| 43 | continue |
---|
| 44 | if start > end: |
---|
| 45 | try: |
---|
| 46 | # This will only work if primary is a NiceReaderWrapper |
---|
| 47 | primary.skipped += 1 |
---|
| 48 | # no reason to stuff an entire bad file into memmory |
---|
| 49 | if primary.skipped < 10: |
---|
| 50 | primary.skipped_lines.append( ( primary.linenum, primary.current_line, "Interval start after end!" ) ) |
---|
| 51 | except: |
---|
| 52 | pass |
---|
| 53 | continue |
---|
| 54 | out_intervals = [] |
---|
| 55 | # Intersect or Overlap |
---|
| 56 | try: |
---|
| 57 | if bitsets[ chrom ].count_range( start, end-start ) >= mincols: |
---|
| 58 | if pieces: |
---|
| 59 | out_intervals = bits_set_in_range( bitsets[chrom], start, end ) |
---|
| 60 | else: |
---|
| 61 | out_intervals = [ ( start, end ) ] |
---|
| 62 | # Write the intervals |
---|
| 63 | for start, end in out_intervals: |
---|
| 64 | new_interval = interval.copy() |
---|
| 65 | new_interval.start = start |
---|
| 66 | new_interval.end = end |
---|
| 67 | yield new_interval |
---|
| 68 | except IndexError, e: |
---|
| 69 | try: |
---|
| 70 | # This will only work if primary is a NiceReaderWrapper |
---|
| 71 | primary.skipped += 1 |
---|
| 72 | # no reason to stuff an entire bad file into memmory |
---|
| 73 | if primary.skipped < 10: |
---|
| 74 | primary.skipped_lines.append( ( primary.linenum, primary.current_line, str( e ) ) ) |
---|
| 75 | except: |
---|
| 76 | pass |
---|
| 77 | continue |
---|