| 1 | """ |
|---|
| 2 | Intersects ... faster. Suports GenomicInterval datatype and multiple |
|---|
| 3 | chromosomes. |
|---|
| 4 | """ |
|---|
| 5 | import math |
|---|
| 6 | import time |
|---|
| 7 | import sys |
|---|
| 8 | import random |
|---|
| 9 | |
|---|
| 10 | class IntervalTree( object ): |
|---|
| 11 | def __init__( self ): |
|---|
| 12 | self.chroms = {} |
|---|
| 13 | def insert( self, interval, linenum=0, other=None ): |
|---|
| 14 | chrom = interval.chrom |
|---|
| 15 | start = interval.start |
|---|
| 16 | end = interval.end |
|---|
| 17 | if interval.chrom in self.chroms: |
|---|
| 18 | self.chroms[chrom] = self.chroms[chrom].insert( start, end, linenum, other ) |
|---|
| 19 | else: |
|---|
| 20 | self.chroms[chrom] = IntervalNode( start, end, linenum, other ) |
|---|
| 21 | def intersect( self, interval, report_func ): |
|---|
| 22 | chrom = interval.chrom |
|---|
| 23 | start = interval.start |
|---|
| 24 | end = interval.end |
|---|
| 25 | if chrom in self.chroms: |
|---|
| 26 | self.chroms[chrom].intersect( start, end, report_func ) |
|---|
| 27 | def traverse( self, func ): |
|---|
| 28 | for item in self.chroms.itervalues(): |
|---|
| 29 | item.traverse( func ) |
|---|
| 30 | |
|---|
| 31 | class IntervalNode( object ): |
|---|
| 32 | def __init__( self, start, end, linenum=0, other=None ): |
|---|
| 33 | # Python lacks the binomial distribution, so we convert a |
|---|
| 34 | # uniform into a binomial because it naturally scales with |
|---|
| 35 | # tree size. Also, python's uniform is perfect since the |
|---|
| 36 | # upper limit is not inclusive, which gives us undefined here. |
|---|
| 37 | self.priority = math.ceil( (-1.0 / math.log(.5)) * math.log( -1.0 / (random.uniform(0,1) - 1))) |
|---|
| 38 | self.start = start |
|---|
| 39 | self.end = end |
|---|
| 40 | self.maxend = self.end |
|---|
| 41 | self.minend = self.end |
|---|
| 42 | self.left = None |
|---|
| 43 | self.right = None |
|---|
| 44 | self.linenum = linenum |
|---|
| 45 | self.other = other |
|---|
| 46 | def insert( self, start, end, linenum=0, other=None ): |
|---|
| 47 | root = self |
|---|
| 48 | if start > self.start: |
|---|
| 49 | # insert to right tree |
|---|
| 50 | if self.right: |
|---|
| 51 | self.right = self.right.insert( start, end, linenum, other ) |
|---|
| 52 | else: |
|---|
| 53 | self.right = IntervalNode(start, end, linenum, other ) |
|---|
| 54 | # rebalance tree |
|---|
| 55 | if self.priority < self.right.priority: |
|---|
| 56 | root = self.rotateleft() |
|---|
| 57 | else: |
|---|
| 58 | # insert to left tree |
|---|
| 59 | if self.left: |
|---|
| 60 | self.left = self.left.insert( start, end, linenum, other ) |
|---|
| 61 | else: |
|---|
| 62 | self.left = IntervalNode(start, end, linenum, other ) |
|---|
| 63 | # rebalance tree |
|---|
| 64 | if self.priority < self.left.priority: |
|---|
| 65 | root = self.rotateright() |
|---|
| 66 | if root.right and root.left: |
|---|
| 67 | root.maxend = max( root.end, root.right.maxend, root.left.maxend ) |
|---|
| 68 | root.minend = min( root.end, root.right.minend, root.left.minend ) |
|---|
| 69 | elif root.right: |
|---|
| 70 | root.maxend = max( root.end, root.right.maxend ) |
|---|
| 71 | root.minend = min( root.end, root.right.minend ) |
|---|
| 72 | elif root.left: |
|---|
| 73 | root.maxend = max( root.end, root.left.maxend ) |
|---|
| 74 | root.minend = min( root.end, root.left.minend ) |
|---|
| 75 | return root |
|---|
| 76 | |
|---|
| 77 | def rotateright( self ): |
|---|
| 78 | root = self.left |
|---|
| 79 | self.left = self.left.right |
|---|
| 80 | root.right = self |
|---|
| 81 | if self.right and self.left: |
|---|
| 82 | self.maxend = max(self.end, self.right.maxend, self.left.maxend) |
|---|
| 83 | self.minend = min(self.end, self.right.minend, self.left.minend ) |
|---|
| 84 | elif self.right: |
|---|
| 85 | self.maxend = max(self.end, self.right.maxend) |
|---|
| 86 | self.minend = min(self.end, self.right.minend) |
|---|
| 87 | elif self.left: |
|---|
| 88 | self.maxend = max(self.end, self.left.maxend) |
|---|
| 89 | self.minend = min(self.end, self.left.minend ) |
|---|
| 90 | return root |
|---|
| 91 | |
|---|
| 92 | def rotateleft( self ): |
|---|
| 93 | root = self.right |
|---|
| 94 | self.right = self.right.left |
|---|
| 95 | root.left = self |
|---|
| 96 | if self.right and self.left: |
|---|
| 97 | self.maxend = max(self.end, self.right.maxend, self.left.maxend) |
|---|
| 98 | self.minend = min(self.end, self.right.minend, self.left.minend ) |
|---|
| 99 | elif self.right: |
|---|
| 100 | self.maxend = max(self.end, self.right.maxend) |
|---|
| 101 | self.minend = min(self.end, self.right.minend) |
|---|
| 102 | elif self.left: |
|---|
| 103 | self.maxend = max(self.end, self.left.maxend) |
|---|
| 104 | self.minend = min(self.end, self.left.minend ) |
|---|
| 105 | return root |
|---|
| 106 | |
|---|
| 107 | def intersect( self, start, end, report_func ): |
|---|
| 108 | if start < self.end and end > self.start: report_func( self ) |
|---|
| 109 | if self.left and start < self.left.maxend: |
|---|
| 110 | self.left.intersect( start, end, report_func ) |
|---|
| 111 | if self.right and end > self.start: |
|---|
| 112 | self.right.intersect( start, end, report_func ) |
|---|
| 113 | |
|---|
| 114 | def traverse( self, func ): |
|---|
| 115 | if self.left: self.left.traverse( func ) |
|---|
| 116 | func( self ) |
|---|
| 117 | if self.right: self.right.traverse( func ) |
|---|
| 118 | |
|---|
| 119 | def main(): |
|---|
| 120 | test = None |
|---|
| 121 | intlist = [] |
|---|
| 122 | for x in range(20000): |
|---|
| 123 | start = random.randint(0,1000000) |
|---|
| 124 | end = start + random.randint(1, 1000) |
|---|
| 125 | if test: test = test.insert( start, end ) |
|---|
| 126 | else: test = IntervalNode( start, end ) |
|---|
| 127 | intlist.append( (start, end) ) |
|---|
| 128 | starttime = time.clock() |
|---|
| 129 | for x in range(5000): |
|---|
| 130 | start = random.randint(0, 10000000) |
|---|
| 131 | end = start + random.randint(1, 1000) |
|---|
| 132 | result = [] |
|---|
| 133 | test.intersect( start, end, lambda x: result.append(x.linenum) ) |
|---|
| 134 | print "%f for tree method" % (time.clock() - starttime) |
|---|
| 135 | starttime = time.clock() |
|---|
| 136 | for x in range(5000): |
|---|
| 137 | start = random.randint(0, 10000000) |
|---|
| 138 | end = start + random.randint(1, 1000) |
|---|
| 139 | bad_sect( intlist, start, end) |
|---|
| 140 | print "%f for linear (bad) method" % (time.clock() - starttime) |
|---|
| 141 | |
|---|
| 142 | def test_func( node ): |
|---|
| 143 | print "[%d, %d), %d" % (node.start, node.end, node.maxend) |
|---|
| 144 | |
|---|
| 145 | def bad_sect( lst, int_start, int_end ): |
|---|
| 146 | intersection = [] |
|---|
| 147 | for start, end in lst: |
|---|
| 148 | if int_start < end and int_end > start: |
|---|
| 149 | intersection.append( (start, end) ) |
|---|
| 150 | return intersection |
|---|
| 151 | |
|---|
| 152 | if __name__ == "__main__": |
|---|
| 153 | main() |
|---|
| 154 | |
|---|
| 155 | |
|---|