| 1 | from UserDict import DictMixin |
|---|
| 2 | |
|---|
| 3 | from bx.misc.binary_file import BinaryFileReader, BinaryFileWriter |
|---|
| 4 | import numpy |
|---|
| 5 | import sys |
|---|
| 6 | |
|---|
def cdbhash( s ):
    """
    Return the 32-bit hash of `s` used to place keys in the CDB tables.

    Starting from 5381, each element of `s` updates the running value as
    ``((h << 5) + h) ^ code``, truncated to 32 bits after every step.  An
    empty `s` therefore hashes to 5381.

    `s` may be a string of characters or any sequence whose iteration yields
    small integers (e.g. ``bytes`` on Python 3, ``bytearray``); both forms
    produce the same hash for the same byte values.
    """
    h = 5381
    for c in s:
        # Iterating bytes-like objects can yield ints directly; text and
        # Python 2 byte strings yield one-character strings needing ord().
        code = c if isinstance( c, int ) else ord( c )
        h = ( ( ( h << 5 ) + h ) ^ code ) & 0xffffffff
    return h
|---|
| 9 | |
|---|
class FileCDBDict( DictMixin ):
    """
    For accessing a CDB structure on disk. Read only. Currently only supports
    access by key (getitem).

    Layout, as read and written by this class (all integers are uint32 and
    all offsets are file positions as reported by ``tell()`` at write time):

    - Header: 256 ``( subtable_offset, subtable_size )`` pairs (2048 bytes),
      indexed by ``hash % 256``; ``subtable_size`` is a number of cells.
    - Records: ``klen``, ``vlen``, then the key bytes and the value bytes.
    - Subtables: arrays of 8-byte cells ``( hash, record_offset )``, probed
      linearly starting at ``( hash >> 8 ) % subtable_size``.  A cell whose
      ``record_offset`` is 0 is empty.

    NOTE(review): this appears to follow D. J. Bernstein's cdb layout;
    confirm before relying on interoperability with other cdb tools.

    NOTE: The keys method could be implemented by scanning the main table.
    """
    def __init__( self, file, is_little_endian=True ):
        """
        Wrap `file` and read the 256-entry header from its current position.

        `file` is handed to `BinaryFileReader` together with
        `is_little_endian`; it is expected to be positioned at the start of
        the CDB structure.
        """
        # TODO: Deal with endianess
        self.io = BinaryFileReader( file, is_little_endian=is_little_endian )
        self.header_offset = self.io.tell()
        # Read the whole header (only 2k)
        self.header = []
        for _ in range( 256 ):
            subtable_offset = self.io.read_uint32()
            subtable_size = self.io.read_uint32()
            self.header.append( ( subtable_offset, subtable_size ) )

    def __getitem__( self, key ):
        """
        Return the value stored under `key`.

        Raises `KeyError( key )` when the key's subtable is empty, when an
        empty cell is reached while probing, or when every cell of the
        subtable has been visited without a match.
        """
        key_hash = cdbhash( key )
        # Find position of subtable using 8 LSBs of hash
        subtable_offset, subtable_size = self.header[ key_hash % 256 ]
        if subtable_size == 0:
            raise KeyError( key )
        # Seek into subtable and look for match, probing linearly from the
        # cell selected by the remaining hash bits
        start = key_hash >> 8
        for i in range( subtable_size ):
            cell_offset = subtable_offset + ( ( start + i ) % subtable_size ) * 8
            self.io.seek( cell_offset )
            cell_hash = self.io.read_uint32()
            record_offset = self.io.read_uint32()
            # Hit an empty bin, no match for key
            if record_offset == 0:
                raise KeyError( key )
            # Hash matches, need to check full key
            if cell_hash == key_hash:
                self.io.seek( record_offset )
                klen = self.io.read_uint32()
                vlen = self.io.read_uint32()
                if self.io.read( klen ) == key:
                    # The value immediately follows the key in the record
                    return self.io.read( vlen )
        # Visited every slot and no match (should never happen since
        # there are empty slots by construction)
        raise KeyError( key )

    @classmethod
    def to_file( cls, dict, file, is_little_endian=True ):
        """
        Write the items of `dict` to `file` as a CDB structure.

        Writing starts at the current position of `file` (wrapped in a
        `BinaryFileWriter`): space for the fixed-size header is skipped, the
        records and then the subtables are written, and finally the header
        is filled in.  The file position is left at the end of the structure.

        `dict` must provide ``iteritems()``; keys and values must be byte
        strings (they are written as-is, prefixed by their lengths).
        """
        io = BinaryFileWriter( file, is_little_endian=is_little_endian )
        start_offset = io.tell()
        # Header is of fixed length
        io.seek( start_offset + ( 8 * 256 ) )
        # For each item, key and value length (written as length prefixed
        # strings). We also calculate the subtables on this pass.
        # NOTE: This requires the key and value be byte strings, support for
        #       dealing with encoding specific value types should be
        #       added to this wrapper
        subtables = [ [] for _ in range( 256 ) ]
        for key, value in dict.iteritems():
            pair_offset = io.tell()
            io.write_uint32( len( key ) )
            io.write_uint32( len( value ) )
            io.write( key )
            io.write( value )
            key_hash = cdbhash( key )
            subtables[ key_hash % 256 ].append( ( key_hash, pair_offset ) )
        # Save the offset where the subtables will start
        subtable_offset = io.tell()
        # Write subtables
        for subtable in subtables:
            if len( subtable ) > 0:
                # Construct hashtable to be twice the size of the number
                # of items in the subtable, and build it in memory
                ncells = len( subtable ) * 2
                cells = [ ( 0, 0 ) for _ in range( ncells ) ]
                for key_hash, pair_offset in subtable:
                    index = ( key_hash >> 8 ) % ncells
                    # Linear probing; guaranteed to find an empty cell
                    # because the table has twice as many cells as items
                    while cells[index][1] != 0:
                        index = ( index + 1 ) % ncells
                    cells[index] = ( key_hash, pair_offset )
                # Write subtable
                for key_hash, pair_offset in cells:
                    io.write_uint32( key_hash )
                    io.write_uint32( pair_offset )
        # Go back and write the header
        end_offset = io.tell()
        io.seek( start_offset )
        index = subtable_offset
        for subtable in subtables:
            ncells = len( subtable ) * 2
            io.write_uint32( index )
            io.write_uint32( ncells )
            # For each cell in the subtable, a hash and a pointer to a value
            index += ncells * 8
        # Leave fp at end of cdb
        io.seek( end_offset )
|---|