1 | """ |
---|
2 | Join two sets of intervals using their overlap as the key. The |
---|
3 | intervals MUST be sorted by chrom(lexicographically), |
---|
4 | start(arithmetically) and end(arithmetically). This works by simply |
---|
5 | walking through the inputs in O(n) time. |
---|
6 | """ |
---|
7 | |
---|
8 | import psyco_full |
---|
9 | |
---|
10 | import math |
---|
11 | import traceback |
---|
12 | import fileinput |
---|
13 | from warnings import warn |
---|
14 | |
---|
15 | from bx.intervals.io import * |
---|
16 | from bx.intervals.operations import * |
---|
17 | from quicksect import IntervalTree |
---|
18 | |
---|
19 | def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True): |
---|
20 | # Read rightSet into memory: |
---|
21 | rightlen = 0 |
---|
22 | leftlen = 0 |
---|
23 | rightTree = IntervalTree() |
---|
24 | for item in rightSet: |
---|
25 | if type( item ) is GenomicInterval: |
---|
26 | rightTree.insert( item, rightSet.linenum, item.fields ) |
---|
27 | if rightlen == 0: rightlen = item.nfields |
---|
28 | |
---|
29 | for interval in leftSet: |
---|
30 | if leftlen == 0 and type( interval ) is GenomicInterval: |
---|
31 | leftlen = interval.nfields |
---|
32 | if not (type( interval ) is GenomicInterval): |
---|
33 | yield interval |
---|
34 | else: |
---|
35 | result = [] |
---|
36 | rightTree.intersect( interval, lambda node: result.append( node ) ) |
---|
37 | overlap_not_met = 0 |
---|
38 | for item in result: |
---|
39 | if item.start in range(interval.start,interval.end+1) and item.end not in range(interval.start,interval.end+1): |
---|
40 | overlap = interval.end-item.start |
---|
41 | elif item.end in range(interval.start,interval.end+1) and item.start not in range(interval.start,interval.end+1): |
---|
42 | overlap = item.end-interval.start |
---|
43 | elif item.start in range(interval.start,interval.end+1) and item.end in range(interval.start,interval.end+1): |
---|
44 | overlap = item.end-item.start |
---|
45 | else: #the intersecting item's start and end are outside the interval range |
---|
46 | overlap = interval.end-interval.start |
---|
47 | if overlap < mincols: |
---|
48 | overlap_not_met += 1 |
---|
49 | continue |
---|
50 | outfields = list(interval) |
---|
51 | map(outfields.append, item.other) |
---|
52 | setattr( item, "visited", True ) |
---|
53 | yield outfields |
---|
54 | if (len(result) == 0 or overlap_not_met == len(result)) and rightfill: |
---|
55 | outfields = list(interval) |
---|
56 | for x in range(rightlen): outfields.append(".") |
---|
57 | yield outfields |
---|
58 | |
---|
59 | if leftfill: |
---|
60 | def report_unvisited( node, results ): |
---|
61 | if not hasattr(node, "visited"): |
---|
62 | results.append( node ) |
---|
63 | results = [] |
---|
64 | rightTree.traverse( lambda x: report_unvisited( x, results ) ) |
---|
65 | for item in results: |
---|
66 | outfields = list() |
---|
67 | for x in range(leftlen): outfields.append(".") |
---|
68 | map(outfields.append, item.other) |
---|
69 | yield outfields |
---|
70 | |
---|
71 | |
---|
72 | def interval_cmp(a, b): |
---|
73 | interval1 = a[0] |
---|
74 | interval2 = b[0] |
---|
75 | if not (type( interval1 ) is GenomicInterval and type( interval2 ) is GenomicInterval): |
---|
76 | return 0 |
---|
77 | # Both are intervals |
---|
78 | if interval1.chrom == interval2.chrom: |
---|
79 | center1 = interval1.start + ((interval1.end - interval1.start) / 2) |
---|
80 | center2 = interval2.start + ((interval2.end - interval2.start) / 2) |
---|
81 | return center1 - center2 |
---|
82 | else: |
---|
83 | if interval1.chrom > interval2.chrom: |
---|
84 | return 1 |
---|
85 | else: |
---|
86 | return -1 |
---|
87 | |
---|
88 | return 0 |
---|
89 | |
---|
90 | def findintersect(interval, sortedlist, mincols): |
---|
91 | # find range of intervals that intersect via a binary search |
---|
92 | # find lower bound |
---|
93 | x = len(sortedlist) / 2 |
---|
94 | n = int(math.pow(2,math.ceil(math.log(len(sortedlist),2)))) |
---|
95 | |
---|
96 | not_found = True |
---|
97 | not_done = True |
---|
98 | while not_found and not_done: |
---|
99 | n = n / 2 |
---|
100 | if n == 0: |
---|
101 | n = 1 |
---|
102 | not_done = False |
---|
103 | if x >= len(sortedlist): |
---|
104 | x -= n |
---|
105 | elif x < 0: |
---|
106 | x += n |
---|
107 | else: |
---|
108 | if findoverlap(sortedlist[x][0], interval) >= mincols: |
---|
109 | not_found = False |
---|
110 | else: |
---|
111 | comp = interval_cmp(sortedlist[x], [interval, 0]) |
---|
112 | if comp > 0: |
---|
113 | x -= n |
---|
114 | else: |
---|
115 | x += n |
---|
116 | |
---|
117 | print "\t".join(sortedlist[x][0].fields) |
---|
118 | print "not_found = " + str(not_found) |
---|
119 | if not_found: |
---|
120 | return 0,-1 |
---|
121 | |
---|
122 | lowerbound = x |
---|
123 | middlebound = x |
---|
124 | upperbound = x |
---|
125 | while (lowerbound > -1) and (findoverlap(sortedlist[lowerbound-1][0],interval) >= mincols): |
---|
126 | lowerbound -= 1 |
---|
127 | while (upperbound+1 < len(sortedlist)) and (findoverlap(sortedlist[upperbound+1][0],interval) >= mincols): |
---|
128 | upperbound += 1 |
---|
129 | |
---|
130 | return lowerbound, upperbound |
---|
131 | |
---|
132 | def findoverlap(a, b): |
---|
133 | # overlapping |
---|
134 | if a.chrom == b.chrom: |
---|
135 | return min(a.end, b.end) - max(a.start, b.start) |
---|
136 | else: |
---|
137 | return 0 |
---|