[2] | 1 | ''' |
---|
| 2 | 2010, Kanwei Li |
---|
| 3 | Summary tree data structure for aggregation |
---|
| 4 | |
---|
| 5 | 10/20/2010: Changed version to 2 as we no longer look at bottom level, for better performance |
---|
| 6 | ''' |
---|
| 7 | |
---|
| 8 | import sys, os |
---|
| 9 | import cPickle |
---|
| 10 | |
---|
| 11 | VERSION = 2 |
---|
| 12 | MIN_LEVEL = 2 |
---|
| 13 | |
---|
| 14 | class SummaryTree: |
---|
| 15 | def __init__(self, block_size, levels, draw_cutoff, detail_cutoff): |
---|
| 16 | self.version = VERSION |
---|
| 17 | self.chrom_blocks = {} |
---|
| 18 | self.levels = levels |
---|
| 19 | self.draw_cutoff = draw_cutoff |
---|
| 20 | self.detail_cutoff = detail_cutoff |
---|
| 21 | self.block_size = block_size |
---|
| 22 | self.chrom_stats = {} |
---|
| 23 | |
---|
| 24 | def find_block(self, num, level): |
---|
| 25 | return (num / self.block_size ** level) |
---|
| 26 | |
---|
| 27 | def insert_range(self, chrom, start, end): |
---|
| 28 | if chrom in self.chrom_blocks: |
---|
| 29 | blocks = self.chrom_blocks[chrom] |
---|
| 30 | else: |
---|
| 31 | blocks = self.chrom_blocks[chrom] = {} |
---|
| 32 | self.chrom_stats[chrom] = {} |
---|
| 33 | for level in range(MIN_LEVEL, self.levels+1): |
---|
| 34 | blocks[level] = {} |
---|
| 35 | |
---|
| 36 | |
---|
| 37 | for level in range(MIN_LEVEL, self.levels+1): |
---|
| 38 | block_level = blocks[level] |
---|
| 39 | starting_block = self.find_block(start, level) |
---|
| 40 | ending_block = self.find_block(end, level) |
---|
| 41 | for block in range(starting_block, ending_block+1): |
---|
| 42 | if block in block_level: |
---|
| 43 | block_level[block] += 1 |
---|
| 44 | else: |
---|
| 45 | block_level[block] = 1 |
---|
| 46 | |
---|
| 47 | def finish(self): |
---|
| 48 | ''' Checks for cutoff and only stores levels above it ''' |
---|
| 49 | for chrom, blocks in self.chrom_blocks.iteritems(): |
---|
| 50 | cur_best = 999 |
---|
| 51 | for level in range(self.levels, MIN_LEVEL-1, -1): |
---|
| 52 | max_val = max(blocks[level].values()) |
---|
| 53 | if max_val < self.draw_cutoff: |
---|
| 54 | if "draw_level" not in self.chrom_stats[chrom]: |
---|
| 55 | self.chrom_stats[chrom]["draw_level"] = level |
---|
| 56 | elif max_val < self.detail_cutoff: |
---|
| 57 | self.chrom_stats[chrom]["detail_level"] = level |
---|
| 58 | break |
---|
| 59 | else: |
---|
| 60 | self.chrom_stats[chrom][level] = {} |
---|
| 61 | self.chrom_stats[chrom][level]["delta"] = self.block_size ** level |
---|
| 62 | self.chrom_stats[chrom][level]["max"] = max_val |
---|
| 63 | self.chrom_stats[chrom][level]["avg"] = float(max_val) / len(blocks[level]) |
---|
| 64 | cur_best = level |
---|
| 65 | |
---|
| 66 | self.chrom_blocks[chrom] = dict([ (key, value) for key, value in blocks.iteritems() if key >= cur_best ]) |
---|
| 67 | |
---|
| 68 | def query(self, chrom, start, end, level): |
---|
| 69 | if chrom in self.chrom_blocks: |
---|
| 70 | stats = self.chrom_stats[chrom] |
---|
| 71 | if "detail_level" in stats and level <= stats["detail_level"]: |
---|
| 72 | return "detail" |
---|
| 73 | elif "draw_level" in stats and level <= stats["draw_level"]: |
---|
| 74 | return "draw" |
---|
| 75 | blocks = self.chrom_blocks[chrom] |
---|
| 76 | results = [] |
---|
| 77 | multiplier = self.block_size ** level |
---|
| 78 | starting_block = self.find_block(start, level) |
---|
| 79 | ending_block = self.find_block(end, level) |
---|
| 80 | for block in range(starting_block, ending_block+1): |
---|
| 81 | if block in blocks[level]: |
---|
| 82 | results.append( (block * multiplier, blocks[level][block]) ) |
---|
| 83 | return results |
---|
| 84 | |
---|
| 85 | return None |
---|
| 86 | |
---|
| 87 | def write(self, filename): |
---|
| 88 | self.finish() |
---|
| 89 | cPickle.dump(self, open(filename, 'wb'), 2) |
---|
| 90 | |
---|
| 91 | def summary_tree_from_file(filename): |
---|
| 92 | return cPickle.load(open(filename, "r")) |
---|
| 93 | |
---|