| 1 | """ |
|---|
| 2 | Join two sets of intervals using their overlap as the key. The |
|---|
| 3 | intervals MUST be sorted by chrom(lexicographically), |
|---|
| 4 | start(arithmetically) and end(arithmetically). This works by simply |
|---|
| 5 | walking through the inputs in O(n) time. |
|---|
| 6 | """ |
|---|
| 7 | |
|---|
| 8 | import psyco_full |
|---|
| 9 | |
|---|
| 10 | import math |
|---|
| 11 | import traceback |
|---|
| 12 | import fileinput |
|---|
| 13 | from warnings import warn |
|---|
| 14 | |
|---|
| 15 | from bx.intervals.io import * |
|---|
| 16 | from bx.intervals.operations import * |
|---|
| 17 | from quicksect import IntervalTree |
|---|
| 18 | |
|---|
| 19 | def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True): |
|---|
| 20 | # Read rightSet into memory: |
|---|
| 21 | rightlen = 0 |
|---|
| 22 | leftlen = 0 |
|---|
| 23 | rightTree = IntervalTree() |
|---|
| 24 | for item in rightSet: |
|---|
| 25 | if type( item ) is GenomicInterval: |
|---|
| 26 | rightTree.insert( item, rightSet.linenum, item.fields ) |
|---|
| 27 | if rightlen == 0: rightlen = item.nfields |
|---|
| 28 | |
|---|
| 29 | for interval in leftSet: |
|---|
| 30 | if leftlen == 0 and type( interval ) is GenomicInterval: |
|---|
| 31 | leftlen = interval.nfields |
|---|
| 32 | if not (type( interval ) is GenomicInterval): |
|---|
| 33 | yield interval |
|---|
| 34 | else: |
|---|
| 35 | result = [] |
|---|
| 36 | rightTree.intersect( interval, lambda node: result.append( node ) ) |
|---|
| 37 | overlap_not_met = 0 |
|---|
| 38 | for item in result: |
|---|
| 39 | if item.start in range(interval.start,interval.end+1) and item.end not in range(interval.start,interval.end+1): |
|---|
| 40 | overlap = interval.end-item.start |
|---|
| 41 | elif item.end in range(interval.start,interval.end+1) and item.start not in range(interval.start,interval.end+1): |
|---|
| 42 | overlap = item.end-interval.start |
|---|
| 43 | elif item.start in range(interval.start,interval.end+1) and item.end in range(interval.start,interval.end+1): |
|---|
| 44 | overlap = item.end-item.start |
|---|
| 45 | else: #the intersecting item's start and end are outside the interval range |
|---|
| 46 | overlap = interval.end-interval.start |
|---|
| 47 | if overlap < mincols: |
|---|
| 48 | overlap_not_met += 1 |
|---|
| 49 | continue |
|---|
| 50 | outfields = list(interval) |
|---|
| 51 | map(outfields.append, item.other) |
|---|
| 52 | setattr( item, "visited", True ) |
|---|
| 53 | yield outfields |
|---|
| 54 | if (len(result) == 0 or overlap_not_met == len(result)) and rightfill: |
|---|
| 55 | outfields = list(interval) |
|---|
| 56 | for x in range(rightlen): outfields.append(".") |
|---|
| 57 | yield outfields |
|---|
| 58 | |
|---|
| 59 | if leftfill: |
|---|
| 60 | def report_unvisited( node, results ): |
|---|
| 61 | if not hasattr(node, "visited"): |
|---|
| 62 | results.append( node ) |
|---|
| 63 | results = [] |
|---|
| 64 | rightTree.traverse( lambda x: report_unvisited( x, results ) ) |
|---|
| 65 | for item in results: |
|---|
| 66 | outfields = list() |
|---|
| 67 | for x in range(leftlen): outfields.append(".") |
|---|
| 68 | map(outfields.append, item.other) |
|---|
| 69 | yield outfields |
|---|
| 70 | |
|---|
| 71 | |
|---|
| 72 | def interval_cmp(a, b): |
|---|
| 73 | interval1 = a[0] |
|---|
| 74 | interval2 = b[0] |
|---|
| 75 | if not (type( interval1 ) is GenomicInterval and type( interval2 ) is GenomicInterval): |
|---|
| 76 | return 0 |
|---|
| 77 | # Both are intervals |
|---|
| 78 | if interval1.chrom == interval2.chrom: |
|---|
| 79 | center1 = interval1.start + ((interval1.end - interval1.start) / 2) |
|---|
| 80 | center2 = interval2.start + ((interval2.end - interval2.start) / 2) |
|---|
| 81 | return center1 - center2 |
|---|
| 82 | else: |
|---|
| 83 | if interval1.chrom > interval2.chrom: |
|---|
| 84 | return 1 |
|---|
| 85 | else: |
|---|
| 86 | return -1 |
|---|
| 87 | |
|---|
| 88 | return 0 |
|---|
| 89 | |
|---|
| 90 | def findintersect(interval, sortedlist, mincols): |
|---|
| 91 | # find range of intervals that intersect via a binary search |
|---|
| 92 | # find lower bound |
|---|
| 93 | x = len(sortedlist) / 2 |
|---|
| 94 | n = int(math.pow(2,math.ceil(math.log(len(sortedlist),2)))) |
|---|
| 95 | |
|---|
| 96 | not_found = True |
|---|
| 97 | not_done = True |
|---|
| 98 | while not_found and not_done: |
|---|
| 99 | n = n / 2 |
|---|
| 100 | if n == 0: |
|---|
| 101 | n = 1 |
|---|
| 102 | not_done = False |
|---|
| 103 | if x >= len(sortedlist): |
|---|
| 104 | x -= n |
|---|
| 105 | elif x < 0: |
|---|
| 106 | x += n |
|---|
| 107 | else: |
|---|
| 108 | if findoverlap(sortedlist[x][0], interval) >= mincols: |
|---|
| 109 | not_found = False |
|---|
| 110 | else: |
|---|
| 111 | comp = interval_cmp(sortedlist[x], [interval, 0]) |
|---|
| 112 | if comp > 0: |
|---|
| 113 | x -= n |
|---|
| 114 | else: |
|---|
| 115 | x += n |
|---|
| 116 | |
|---|
| 117 | print "\t".join(sortedlist[x][0].fields) |
|---|
| 118 | print "not_found = " + str(not_found) |
|---|
| 119 | if not_found: |
|---|
| 120 | return 0,-1 |
|---|
| 121 | |
|---|
| 122 | lowerbound = x |
|---|
| 123 | middlebound = x |
|---|
| 124 | upperbound = x |
|---|
| 125 | while (lowerbound > -1) and (findoverlap(sortedlist[lowerbound-1][0],interval) >= mincols): |
|---|
| 126 | lowerbound -= 1 |
|---|
| 127 | while (upperbound+1 < len(sortedlist)) and (findoverlap(sortedlist[upperbound+1][0],interval) >= mincols): |
|---|
| 128 | upperbound += 1 |
|---|
| 129 | |
|---|
| 130 | return lowerbound, upperbound |
|---|
| 131 | |
|---|
| 132 | def findoverlap(a, b): |
|---|
| 133 | # overlapping |
|---|
| 134 | if a.chrom == b.chrom: |
|---|
| 135 | return min(a.end, b.end) - max(a.start, b.start) |
|---|
| 136 | else: |
|---|
| 137 | return 0 |
|---|