| 1 | ''' |
|---|
| 2 | 2010, Kanwei Li |
|---|
| 3 | Summary tree data structure for aggregation |
|---|
| 4 | |
|---|
| 5 | 10/20/2010: Changed version to 2 as we no longer look at bottom level, for better performance |
|---|
| 6 | ''' |
|---|
| 7 | |
|---|
| 8 | import sys, os |
|---|
| 9 | import cPickle |
|---|
| 10 | |
|---|
| 11 | VERSION = 2 |
|---|
| 12 | MIN_LEVEL = 2 |
|---|
| 13 | |
|---|
| 14 | class SummaryTree: |
|---|
| 15 | def __init__(self, block_size, levels, draw_cutoff, detail_cutoff): |
|---|
| 16 | self.version = VERSION |
|---|
| 17 | self.chrom_blocks = {} |
|---|
| 18 | self.levels = levels |
|---|
| 19 | self.draw_cutoff = draw_cutoff |
|---|
| 20 | self.detail_cutoff = detail_cutoff |
|---|
| 21 | self.block_size = block_size |
|---|
| 22 | self.chrom_stats = {} |
|---|
| 23 | |
|---|
| 24 | def find_block(self, num, level): |
|---|
| 25 | return (num / self.block_size ** level) |
|---|
| 26 | |
|---|
| 27 | def insert_range(self, chrom, start, end): |
|---|
| 28 | if chrom in self.chrom_blocks: |
|---|
| 29 | blocks = self.chrom_blocks[chrom] |
|---|
| 30 | else: |
|---|
| 31 | blocks = self.chrom_blocks[chrom] = {} |
|---|
| 32 | self.chrom_stats[chrom] = {} |
|---|
| 33 | for level in range(MIN_LEVEL, self.levels+1): |
|---|
| 34 | blocks[level] = {} |
|---|
| 35 | |
|---|
| 36 | |
|---|
| 37 | for level in range(MIN_LEVEL, self.levels+1): |
|---|
| 38 | block_level = blocks[level] |
|---|
| 39 | starting_block = self.find_block(start, level) |
|---|
| 40 | ending_block = self.find_block(end, level) |
|---|
| 41 | for block in range(starting_block, ending_block+1): |
|---|
| 42 | if block in block_level: |
|---|
| 43 | block_level[block] += 1 |
|---|
| 44 | else: |
|---|
| 45 | block_level[block] = 1 |
|---|
| 46 | |
|---|
| 47 | def finish(self): |
|---|
| 48 | ''' Checks for cutoff and only stores levels above it ''' |
|---|
| 49 | for chrom, blocks in self.chrom_blocks.iteritems(): |
|---|
| 50 | cur_best = 999 |
|---|
| 51 | for level in range(self.levels, MIN_LEVEL-1, -1): |
|---|
| 52 | max_val = max(blocks[level].values()) |
|---|
| 53 | if max_val < self.draw_cutoff: |
|---|
| 54 | if "draw_level" not in self.chrom_stats[chrom]: |
|---|
| 55 | self.chrom_stats[chrom]["draw_level"] = level |
|---|
| 56 | elif max_val < self.detail_cutoff: |
|---|
| 57 | self.chrom_stats[chrom]["detail_level"] = level |
|---|
| 58 | break |
|---|
| 59 | else: |
|---|
| 60 | self.chrom_stats[chrom][level] = {} |
|---|
| 61 | self.chrom_stats[chrom][level]["delta"] = self.block_size ** level |
|---|
| 62 | self.chrom_stats[chrom][level]["max"] = max_val |
|---|
| 63 | self.chrom_stats[chrom][level]["avg"] = float(max_val) / len(blocks[level]) |
|---|
| 64 | cur_best = level |
|---|
| 65 | |
|---|
| 66 | self.chrom_blocks[chrom] = dict([ (key, value) for key, value in blocks.iteritems() if key >= cur_best ]) |
|---|
| 67 | |
|---|
| 68 | def query(self, chrom, start, end, level): |
|---|
| 69 | if chrom in self.chrom_blocks: |
|---|
| 70 | stats = self.chrom_stats[chrom] |
|---|
| 71 | if "detail_level" in stats and level <= stats["detail_level"]: |
|---|
| 72 | return "detail" |
|---|
| 73 | elif "draw_level" in stats and level <= stats["draw_level"]: |
|---|
| 74 | return "draw" |
|---|
| 75 | blocks = self.chrom_blocks[chrom] |
|---|
| 76 | results = [] |
|---|
| 77 | multiplier = self.block_size ** level |
|---|
| 78 | starting_block = self.find_block(start, level) |
|---|
| 79 | ending_block = self.find_block(end, level) |
|---|
| 80 | for block in range(starting_block, ending_block+1): |
|---|
| 81 | if block in blocks[level]: |
|---|
| 82 | results.append( (block * multiplier, blocks[level][block]) ) |
|---|
| 83 | return results |
|---|
| 84 | |
|---|
| 85 | return None |
|---|
| 86 | |
|---|
| 87 | def write(self, filename): |
|---|
| 88 | self.finish() |
|---|
| 89 | cPickle.dump(self, open(filename, 'wb'), 2) |
|---|
| 90 | |
|---|
| 91 | def summary_tree_from_file(filename): |
|---|
| 92 | return cPickle.load(open(filename, "r")) |
|---|
| 93 | |
|---|