1 | from UserDict import DictMixin |
---|
2 | |
---|
3 | from bx.misc.binary_file import BinaryFileReader, BinaryFileWriter |
---|
4 | import numpy |
---|
5 | import sys |
---|
6 | |
---|
def cdbhash(s):
    """
    Return the 32-bit CDB hash of the key `s`.

    Starting from 5381, each element `c` of `s` updates the hash as
    ``h = ((h << 5) + h) ^ c`` with the result truncated to 32 bits.

    `s` may be a text string (each character contributes its `ord` value)
    or a bytes-like object whose iteration yields integers (`bytes` on
    Python 3, `bytearray`). An empty key hashes to 5381.
    """
    h = 5381
    for c in s:
        # Iterating bytes / bytearray yields ints directly; iterating a
        # (byte) string yields one-character strings that need ord().
        code = c if isinstance(c, int) else ord(c)
        # (h << 5) + h == h * 33; the mask keeps the hash an unsigned 32-bit
        # value so it can be written with write_uint32.
        h = (((h << 5) + h) ^ code) & 0xffffffff
    return h
---|
9 | |
---|
class FileCDBDict(DictMixin):
    """
    For accessing a CDB structure on disk. Read only. Currently only supports
    access by key (getitem).

    Layout, as written by `to_file` and read back by `__getitem__`:

    * a fixed-size header of 256 ( subtable offset, subtable cell count )
      pairs of uint32s (2048 bytes), indexed by ``hash % 256``;
    * the records, each stored as key length and value length (uint32s)
      followed by the key and then the value;
    * the subtables, each a sequence of ( hash, record offset ) uint32 pairs
      in which a record offset of 0 marks an empty cell.

    Offsets stored in the file are the positions reported by `tell()` while
    writing and are passed unchanged to `seek()` while reading.

    NOTE: The keys method could be implemented by scanning the main table.
    """

    def __init__(self, file, is_little_endian=True):
        """
        Wrap `file` in a BinaryFileReader and load the 256-entry header,
        starting at the reader's current position.
        """
        # TODO: Deal with endianness
        self.io = BinaryFileReader(file, is_little_endian=is_little_endian)
        self.header_offset = self.io.tell()
        # Read the whole header (only 2k). Each entry is read as
        # ( subtable offset, subtable cell count ), in that order.
        self.header = []
        for i in range(256):
            self.header.append((self.io.read_uint32(), self.io.read_uint32()))

    def __getitem__(self, key):
        """
        Return the value stored for `key`.

        Raises KeyError (carrying `key`) when the key is not present.
        """
        key_hash = cdbhash(key)
        # Find position of subtable using 8 LSBs of hash
        subtable_offset, subtable_size = self.header[key_hash % 256]
        if subtable_size == 0:
            raise KeyError(key)
        # Seek into subtable and look for match, probing linearly (with
        # wrap-around) from the cell selected by the remaining hash bits
        start = key_hash >> 8
        for i in range(subtable_size):
            # Each cell is two uint32s, i.e. 8 bytes
            self.io.seek(subtable_offset + ((start + i) % subtable_size) * 8)
            cell_hash = self.io.read_uint32()
            pair_offset = self.io.read_uint32()
            # Hit an empty bin, no match for key
            if pair_offset == 0:
                raise KeyError(key)
            # Hash matches, need to check full key
            if cell_hash == key_hash:
                self.io.seek(pair_offset)
                klen = self.io.read_uint32()
                vlen = self.io.read_uint32()
                if self.io.read(klen) == key:
                    return self.io.read(vlen)
        # Visited every slot and no match (should never happen since
        # there are empty slots by construction)
        raise KeyError(key)

    @classmethod
    def to_file(Class, dict, file, is_little_endian=True):
        """
        Write the items of `dict` to `file` as a CDB structure.

        Writing starts at the current position of `file` (as reported by the
        BinaryFileWriter wrapped around it): room for the header is skipped,
        the records and then the subtables are written, and finally the
        header is filled in. The position is left at the end of the
        structure.

        NOTE: This requires the keys and values to be byte strings, support
              for dealing with encoding specific value types should be
              added to this wrapper.
        """
        io = BinaryFileWriter(file, is_little_endian=is_little_endian)
        start_offset = io.tell()
        # Header is of fixed length; skip it for now, it is written last
        io.seek(start_offset + (8 * 256))
        # For each item, key and value length (written as length prefixed
        # strings). We also calculate the subtables on this pass.
        subtables = [[] for i in range(256)]
        # Prefer the lazy iteritems() when the mapping provides it, otherwise
        # fall back to items() so any mapping can be written.
        iter_items = getattr(dict, "iteritems", None) or dict.items
        for key, value in iter_items():
            pair_offset = io.tell()
            io.write_uint32(len(key))
            io.write_uint32(len(value))
            io.write(key)
            io.write(value)
            key_hash = cdbhash(key)
            subtables[key_hash % 256].append((key_hash, pair_offset))
        # Save the offset where the subtables will start
        subtable_offset = io.tell()
        # Write subtables
        for subtable in subtables:
            if len(subtable) > 0:
                # Construct hashtable to be twice the size of the number
                # of items in the subtable, and build it in memory
                ncells = len(subtable) * 2
                cells = [(0, 0) for i in range(ncells)]
                for key_hash, pair_offset in subtable:
                    index = (key_hash >> 8) % ncells
                    # Linear probing; guaranteed to find an empty cell since
                    # there are twice as many cells as items
                    while cells[index][1] != 0:
                        index = (index + 1) % ncells
                    cells[index] = (key_hash, pair_offset)
                # Write subtable
                for key_hash, pair_offset in cells:
                    io.write_uint32(key_hash)
                    io.write_uint32(pair_offset)
        # Go back and write the header
        end_offset = io.tell()
        io.seek(start_offset)
        table_offset = subtable_offset
        for subtable in subtables:
            ncells = len(subtable) * 2
            io.write_uint32(table_offset)
            io.write_uint32(ncells)
            # For each cell in the subtable, a hash and a pointer to a value
            table_offset += ncells * 8
        # Leave fp at end of cdb
        io.seek(end_offset)
---|