| 1 | #!/usr/bin/env python | 
|---|
| 2 | """ | 
|---|
| 3 | Subtract one set of genomic intervals from another (base-by-base or whole | 
|---|
| 4 | intervals). The returned GenomicIntervals will be in the order | 
|---|
| 5 | of the first set of intervals passed in, with the corresponding | 
|---|
| 6 | meta-data. | 
|---|
| 7 | """ | 
|---|
| 8 |  | 
|---|
| 9 | import traceback | 
|---|
| 10 | import fileinput | 
|---|
| 11 | from warnings import warn | 
|---|
| 12 |  | 
|---|
| 13 | from bx.intervals.io import * | 
|---|
| 14 | from bx.intervals.operations import * | 
|---|
| 15 |  | 
|---|
| 16 | def subtract(readers, mincols=1, upstream_pad=0, downstream_pad=0, pieces=True, lens={}, comments=True): | 
|---|
| 17 |     # The incoming lens dictionary is a dictionary of chromosome lengths which are used to initialize the bitsets. | 
|---|
| 18 |     # Read all but first into bitsets and union to one (if confused, read DeMorgan's...) | 
|---|
| 19 |     primary = readers[0] | 
|---|
| 20 |     union = readers[1:] | 
|---|
| 21 |     # Handle any ValueError, IndexError and OverflowError exceptions that may be thrown when | 
|---|
| 22 |     # the bitsets are being created by skipping the problem lines | 
|---|
| 23 |     union[0] = BitsetSafeReaderWrapper( union[0], lens=lens ) | 
|---|
| 24 |     bitsets = union[0].binned_bitsets( upstream_pad=upstream_pad, downstream_pad=downstream_pad, lens=lens ) | 
|---|
| 25 |     union = union[1:] | 
|---|
| 26 |     for andset in union: | 
|---|
| 27 |         bitset2 = andset.binned_bitsets(upstream_pad = upstream_pad, downstream_pad = downstream_pad, lens = lens) | 
|---|
| 28 |         for chrom in bitset2: | 
|---|
| 29 |             if chrom not in bitsets: | 
|---|
| 30 |                 bitsets[chrom] = bitset2[chrom] | 
|---|
| 31 |             else: | 
|---|
| 32 |                 bitsets[chrom].ior(bitset2[chrom]) | 
|---|
| 33 |      | 
|---|
| 34 |     # Read remaining intervals and subtract | 
|---|
| 35 |     for interval in primary: | 
|---|
| 36 |         if type( interval ) is Header: | 
|---|
| 37 |             yield interval | 
|---|
| 38 |         if type( interval ) is Comment and comments: | 
|---|
| 39 |             yield interval | 
|---|
| 40 |         elif type( interval ) == GenomicInterval: | 
|---|
| 41 |             chrom = interval.chrom | 
|---|
| 42 |             if chrom not in bitsets: | 
|---|
| 43 |                 yield interval | 
|---|
| 44 |             else: | 
|---|
| 45 |                 start = int(interval.start) | 
|---|
| 46 |                 end = int(interval.end) | 
|---|
| 47 |                 if start > end: warn( "Interval start after end!" ) | 
|---|
| 48 |                 out_intervals = [] | 
|---|
| 49 |                 # Find the intervals that meet the criteria (for the three sensible | 
|---|
| 50 |                 # permutations of reverse and pieces) | 
|---|
| 51 |                 try: | 
|---|
| 52 |                     if bitsets[ chrom ].count_range( start, end-start ) >= mincols:                 | 
|---|
| 53 |                         if pieces: | 
|---|
| 54 |                             out_intervals = bits_clear_in_range( bitsets[chrom], start, end ) | 
|---|
| 55 |                     else: | 
|---|
| 56 |                         out_intervals = [ ( start, end ) ] | 
|---|
| 57 |                     # Write the intervals | 
|---|
| 58 |                     for start, end in out_intervals: | 
|---|
| 59 |                         new_interval = interval.copy() | 
|---|
| 60 |                         new_interval.start = start | 
|---|
| 61 |                         new_interval.end = end | 
|---|
| 62 |                         yield new_interval | 
|---|
| 63 |                 except IndexError, e: | 
|---|
| 64 |                     try: | 
|---|
| 65 |                         # This will work only if primary is a NiceReaderWrapper | 
|---|
| 66 |                         primary.skipped += 1 | 
|---|
| 67 |                         # no reason to stuff an entire bad file into memmory | 
|---|
| 68 |                         if primary.skipped < 10: | 
|---|
| 69 |                             primary.skipped_lines.append( ( primary.linenum, primary.current_line, str( e ) ) ) | 
|---|
| 70 |                     except: | 
|---|
| 71 |                         pass | 
|---|
| 72 |                     continue | 
|---|