1 | ''' |
---|
2 | 2010, Kanwei Li |
---|
3 | Summary tree data structure for aggregation |
---|
4 | |
---|
5 | 10/20/2010: Changed version to 2 as we no longer look at bottom level, for better performance |
---|
6 | ''' |
---|
7 | |
---|
8 | import sys, os |
---|
9 | import cPickle |
---|
10 | |
---|
11 | VERSION = 2 |
---|
12 | MIN_LEVEL = 2 |
---|
13 | |
---|
14 | class SummaryTree: |
---|
15 | def __init__(self, block_size, levels, draw_cutoff, detail_cutoff): |
---|
16 | self.version = VERSION |
---|
17 | self.chrom_blocks = {} |
---|
18 | self.levels = levels |
---|
19 | self.draw_cutoff = draw_cutoff |
---|
20 | self.detail_cutoff = detail_cutoff |
---|
21 | self.block_size = block_size |
---|
22 | self.chrom_stats = {} |
---|
23 | |
---|
24 | def find_block(self, num, level): |
---|
25 | return (num / self.block_size ** level) |
---|
26 | |
---|
27 | def insert_range(self, chrom, start, end): |
---|
28 | if chrom in self.chrom_blocks: |
---|
29 | blocks = self.chrom_blocks[chrom] |
---|
30 | else: |
---|
31 | blocks = self.chrom_blocks[chrom] = {} |
---|
32 | self.chrom_stats[chrom] = {} |
---|
33 | for level in range(MIN_LEVEL, self.levels+1): |
---|
34 | blocks[level] = {} |
---|
35 | |
---|
36 | |
---|
37 | for level in range(MIN_LEVEL, self.levels+1): |
---|
38 | block_level = blocks[level] |
---|
39 | starting_block = self.find_block(start, level) |
---|
40 | ending_block = self.find_block(end, level) |
---|
41 | for block in range(starting_block, ending_block+1): |
---|
42 | if block in block_level: |
---|
43 | block_level[block] += 1 |
---|
44 | else: |
---|
45 | block_level[block] = 1 |
---|
46 | |
---|
47 | def finish(self): |
---|
48 | ''' Checks for cutoff and only stores levels above it ''' |
---|
49 | for chrom, blocks in self.chrom_blocks.iteritems(): |
---|
50 | cur_best = 999 |
---|
51 | for level in range(self.levels, MIN_LEVEL-1, -1): |
---|
52 | max_val = max(blocks[level].values()) |
---|
53 | if max_val < self.draw_cutoff: |
---|
54 | if "draw_level" not in self.chrom_stats[chrom]: |
---|
55 | self.chrom_stats[chrom]["draw_level"] = level |
---|
56 | elif max_val < self.detail_cutoff: |
---|
57 | self.chrom_stats[chrom]["detail_level"] = level |
---|
58 | break |
---|
59 | else: |
---|
60 | self.chrom_stats[chrom][level] = {} |
---|
61 | self.chrom_stats[chrom][level]["delta"] = self.block_size ** level |
---|
62 | self.chrom_stats[chrom][level]["max"] = max_val |
---|
63 | self.chrom_stats[chrom][level]["avg"] = float(max_val) / len(blocks[level]) |
---|
64 | cur_best = level |
---|
65 | |
---|
66 | self.chrom_blocks[chrom] = dict([ (key, value) for key, value in blocks.iteritems() if key >= cur_best ]) |
---|
67 | |
---|
68 | def query(self, chrom, start, end, level): |
---|
69 | if chrom in self.chrom_blocks: |
---|
70 | stats = self.chrom_stats[chrom] |
---|
71 | if "detail_level" in stats and level <= stats["detail_level"]: |
---|
72 | return "detail" |
---|
73 | elif "draw_level" in stats and level <= stats["draw_level"]: |
---|
74 | return "draw" |
---|
75 | blocks = self.chrom_blocks[chrom] |
---|
76 | results = [] |
---|
77 | multiplier = self.block_size ** level |
---|
78 | starting_block = self.find_block(start, level) |
---|
79 | ending_block = self.find_block(end, level) |
---|
80 | for block in range(starting_block, ending_block+1): |
---|
81 | if block in blocks[level]: |
---|
82 | results.append( (block * multiplier, blocks[level][block]) ) |
---|
83 | return results |
---|
84 | |
---|
85 | return None |
---|
86 | |
---|
87 | def write(self, filename): |
---|
88 | self.finish() |
---|
89 | cPickle.dump(self, open(filename, 'wb'), 2) |
---|
90 | |
---|
91 | def summary_tree_from_file(filename): |
---|
92 | return cPickle.load(open(filename, "r")) |
---|
93 | |
---|