1 | """ |
---|
2 | Intersects ... faster. Suports GenomicInterval datatype and multiple |
---|
3 | chromosomes. |
---|
4 | """ |
---|
5 | import math |
---|
6 | import time |
---|
7 | import sys |
---|
8 | import random |
---|
9 | |
---|
10 | class IntervalTree( object ): |
---|
11 | def __init__( self ): |
---|
12 | self.chroms = {} |
---|
13 | def insert( self, interval, linenum=0, other=None ): |
---|
14 | chrom = interval.chrom |
---|
15 | start = interval.start |
---|
16 | end = interval.end |
---|
17 | if interval.chrom in self.chroms: |
---|
18 | self.chroms[chrom] = self.chroms[chrom].insert( start, end, linenum, other ) |
---|
19 | else: |
---|
20 | self.chroms[chrom] = IntervalNode( start, end, linenum, other ) |
---|
21 | def intersect( self, interval, report_func ): |
---|
22 | chrom = interval.chrom |
---|
23 | start = interval.start |
---|
24 | end = interval.end |
---|
25 | if chrom in self.chroms: |
---|
26 | self.chroms[chrom].intersect( start, end, report_func ) |
---|
27 | def traverse( self, func ): |
---|
28 | for item in self.chroms.itervalues(): |
---|
29 | item.traverse( func ) |
---|
30 | |
---|
31 | class IntervalNode( object ): |
---|
32 | def __init__( self, start, end, linenum=0, other=None ): |
---|
33 | # Python lacks the binomial distribution, so we convert a |
---|
34 | # uniform into a binomial because it naturally scales with |
---|
35 | # tree size. Also, python's uniform is perfect since the |
---|
36 | # upper limit is not inclusive, which gives us undefined here. |
---|
37 | self.priority = math.ceil( (-1.0 / math.log(.5)) * math.log( -1.0 / (random.uniform(0,1) - 1))) |
---|
38 | self.start = start |
---|
39 | self.end = end |
---|
40 | self.maxend = self.end |
---|
41 | self.minend = self.end |
---|
42 | self.left = None |
---|
43 | self.right = None |
---|
44 | self.linenum = linenum |
---|
45 | self.other = other |
---|
46 | def insert( self, start, end, linenum=0, other=None ): |
---|
47 | root = self |
---|
48 | if start > self.start: |
---|
49 | # insert to right tree |
---|
50 | if self.right: |
---|
51 | self.right = self.right.insert( start, end, linenum, other ) |
---|
52 | else: |
---|
53 | self.right = IntervalNode(start, end, linenum, other ) |
---|
54 | # rebalance tree |
---|
55 | if self.priority < self.right.priority: |
---|
56 | root = self.rotateleft() |
---|
57 | else: |
---|
58 | # insert to left tree |
---|
59 | if self.left: |
---|
60 | self.left = self.left.insert( start, end, linenum, other ) |
---|
61 | else: |
---|
62 | self.left = IntervalNode(start, end, linenum, other ) |
---|
63 | # rebalance tree |
---|
64 | if self.priority < self.left.priority: |
---|
65 | root = self.rotateright() |
---|
66 | if root.right and root.left: |
---|
67 | root.maxend = max( root.end, root.right.maxend, root.left.maxend ) |
---|
68 | root.minend = min( root.end, root.right.minend, root.left.minend ) |
---|
69 | elif root.right: |
---|
70 | root.maxend = max( root.end, root.right.maxend ) |
---|
71 | root.minend = min( root.end, root.right.minend ) |
---|
72 | elif root.left: |
---|
73 | root.maxend = max( root.end, root.left.maxend ) |
---|
74 | root.minend = min( root.end, root.left.minend ) |
---|
75 | return root |
---|
76 | |
---|
77 | def rotateright( self ): |
---|
78 | root = self.left |
---|
79 | self.left = self.left.right |
---|
80 | root.right = self |
---|
81 | if self.right and self.left: |
---|
82 | self.maxend = max(self.end, self.right.maxend, self.left.maxend) |
---|
83 | self.minend = min(self.end, self.right.minend, self.left.minend ) |
---|
84 | elif self.right: |
---|
85 | self.maxend = max(self.end, self.right.maxend) |
---|
86 | self.minend = min(self.end, self.right.minend) |
---|
87 | elif self.left: |
---|
88 | self.maxend = max(self.end, self.left.maxend) |
---|
89 | self.minend = min(self.end, self.left.minend ) |
---|
90 | return root |
---|
91 | |
---|
92 | def rotateleft( self ): |
---|
93 | root = self.right |
---|
94 | self.right = self.right.left |
---|
95 | root.left = self |
---|
96 | if self.right and self.left: |
---|
97 | self.maxend = max(self.end, self.right.maxend, self.left.maxend) |
---|
98 | self.minend = min(self.end, self.right.minend, self.left.minend ) |
---|
99 | elif self.right: |
---|
100 | self.maxend = max(self.end, self.right.maxend) |
---|
101 | self.minend = min(self.end, self.right.minend) |
---|
102 | elif self.left: |
---|
103 | self.maxend = max(self.end, self.left.maxend) |
---|
104 | self.minend = min(self.end, self.left.minend ) |
---|
105 | return root |
---|
106 | |
---|
107 | def intersect( self, start, end, report_func ): |
---|
108 | if start < self.end and end > self.start: report_func( self ) |
---|
109 | if self.left and start < self.left.maxend: |
---|
110 | self.left.intersect( start, end, report_func ) |
---|
111 | if self.right and end > self.start: |
---|
112 | self.right.intersect( start, end, report_func ) |
---|
113 | |
---|
114 | def traverse( self, func ): |
---|
115 | if self.left: self.left.traverse( func ) |
---|
116 | func( self ) |
---|
117 | if self.right: self.right.traverse( func ) |
---|
118 | |
---|
119 | def main(): |
---|
120 | test = None |
---|
121 | intlist = [] |
---|
122 | for x in range(20000): |
---|
123 | start = random.randint(0,1000000) |
---|
124 | end = start + random.randint(1, 1000) |
---|
125 | if test: test = test.insert( start, end ) |
---|
126 | else: test = IntervalNode( start, end ) |
---|
127 | intlist.append( (start, end) ) |
---|
128 | starttime = time.clock() |
---|
129 | for x in range(5000): |
---|
130 | start = random.randint(0, 10000000) |
---|
131 | end = start + random.randint(1, 1000) |
---|
132 | result = [] |
---|
133 | test.intersect( start, end, lambda x: result.append(x.linenum) ) |
---|
134 | print "%f for tree method" % (time.clock() - starttime) |
---|
135 | starttime = time.clock() |
---|
136 | for x in range(5000): |
---|
137 | start = random.randint(0, 10000000) |
---|
138 | end = start + random.randint(1, 1000) |
---|
139 | bad_sect( intlist, start, end) |
---|
140 | print "%f for linear (bad) method" % (time.clock() - starttime) |
---|
141 | |
---|
142 | def test_func( node ): |
---|
143 | print "[%d, %d), %d" % (node.start, node.end, node.maxend) |
---|
144 | |
---|
145 | def bad_sect( lst, int_start, int_end ): |
---|
146 | intersection = [] |
---|
147 | for start, end in lst: |
---|
148 | if int_start < end and int_end > start: |
---|
149 | intersection.append( (start, end) ) |
---|
150 | return intersection |
---|
151 | |
---|
152 | if __name__ == "__main__": |
---|
153 | main() |
---|
154 | |
---|
155 | |
---|