root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/misc/cdb.py @ 3

リビジョン 3, 4.4 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1from UserDict import DictMixin
2
3from bx.misc.binary_file import BinaryFileReader, BinaryFileWriter
4import numpy
5import sys
6
def cdbhash( s ):
    """
    Return the 32-bit hash of the byte string `s`.

    The hash starts at 5381 and, for every character, is multiplied by 33
    (computed as ``(h << 5) + h``), XORed with the character's ordinal and
    truncated to 32 bits. An empty string hashes to 5381.

    `s` may be a text/byte string whose items are single characters, or a
    sequence that yields integers when iterated (e.g. `bytes` on Python 3).
    """
    h = 5381
    for c in s:
        # Iterating `bytes` on Python 3 already yields ints; single
        # characters need `ord` to get their numeric value.
        value = c if isinstance( c, int ) else ord( c )
        # Mask after every step so the running value stays an unsigned
        # 32-bit quantity.
        h = ( ( ( h << 5 ) + h ) ^ value ) & 0xffffffff
    return h
9
class FileCDBDict( DictMixin ):
    """
    For accessing a CDB structure on disk. Read only. Currently only supports
    access by key (getitem).

    Layout, as written by `to_file` and read back here:

    - a fixed header of 256 entries, each a pair of uint32 values
      ( subtable offset, number of cells in the subtable );
    - the records, each stored as uint32 key length, uint32 value length,
      key bytes, value bytes;
    - the subtables, each an open-addressed hash table of
      ( uint32 hash, uint32 record offset ) cells. A record offset of 0
      marks an empty cell.

    NOTE: The keys method could be implemented by scanning the main table.
    """
    def __init__( self, file, is_little_endian=True ):
        """
        Wrap `file` in a `BinaryFileReader` and load the 256-entry header.

        The header is read starting from the position `file` is at when
        the constructor is called.
        """
        # TODO: Deal with endianess
        self.io = BinaryFileReader( file, is_little_endian=is_little_endian )
        self.header_offset = self.io.tell()
        # Read the whole header (only 2k): 256 ( offset, cell count ) pairs
        self.header = []
        for _ in range( 256 ):
            self.header.append( ( self.io.read_uint32(), self.io.read_uint32() ) )

    def __getitem__( self, key ):
        """
        Return the value stored for `key`.

        Raises `KeyError( key )` when the key is not present.
        """
        key_hash = cdbhash( key )
        # Find position of subtable using 8 LSBs of hash
        subtable_offset, subtable_size = self.header[ key_hash % 256 ]
        if subtable_size == 0:
            raise KeyError( key )
        # Seek into subtable and probe linearly, wrapping around, starting
        # from the cell selected by the remaining hash bits
        start = key_hash >> 8
        for i in range( subtable_size ):
            cell_offset = subtable_offset + ( ( start + i ) % subtable_size ) * 8
            self.io.seek( cell_offset )
            cell_hash = self.io.read_uint32()
            pair_offset = self.io.read_uint32()
            # Hit an empty bin, no match for key
            if pair_offset == 0:
                raise KeyError( key )
            # Hash matches, need to check full key since different keys
            # can share a hash
            if cell_hash == key_hash:
                self.io.seek( pair_offset )
                klen = self.io.read_uint32()
                vlen = self.io.read_uint32()
                if self.io.read( klen ) == key:
                    return self.io.read( vlen )
        # Visited every slot and no match (should never happen since
        # there are empty slots by construction)
        raise KeyError( key )

    @classmethod
    def to_file( Class, dict, file, is_little_endian=True ):
        """
        Write the items of `dict` to `file` as a CDB structure.

        Writing starts at the current position of `file`; on return the
        position is left at the end of the written structure.

        NOTE: This requires the key and value be byte strings, support for
              dealing with encoding specific value types should be
              added to this wrapper
        """
        io = BinaryFileWriter( file, is_little_endian=is_little_endian )
        start_offset = io.tell()
        # Header is of fixed length ( 256 entries of two uint32 ), skip over
        # it for now; it is filled in once the subtable sizes are known
        io.seek( start_offset + ( 8 * 256 ) )
        # Write each item as length prefixed strings. We also collect the
        # ( hash, record offset ) entries of every subtable on this pass.
        subtables = [ [] for _ in range( 256 ) ]
        for key, value in dict.iteritems():
            pair_offset = io.tell()
            io.write_uint32( len( key ) )
            io.write_uint32( len( value ) )
            io.write( key )
            io.write( value )
            key_hash = cdbhash( key )
            subtables[ key_hash % 256 ].append( ( key_hash, pair_offset ) )
        # Save the offset where the subtables will start
        subtable_offset = io.tell()
        # Write subtables
        for subtable in subtables:
            if len( subtable ) > 0:
                # Construct hashtable to be twice the size of the number
                # of items in the subtable, and build it in memory
                ncells = len( subtable ) * 2
                cells = [ ( 0, 0 ) ] * ncells
                for key_hash, pair_offset in subtable:
                    index = ( key_hash >> 8 ) % ncells
                    # Linear probing; guaranteed to find an empty cell
                    # because the table is only ever half full
                    while cells[ index ][1] != 0:
                        index = ( index + 1 ) % ncells
                    cells[ index ] = ( key_hash, pair_offset )
                # Write subtable
                for key_hash, pair_offset in cells:
                    io.write_uint32( key_hash )
                    io.write_uint32( pair_offset )
        # Go back and write the header
        end_offset = io.tell()
        io.seek( start_offset )
        index = subtable_offset
        for subtable in subtables:
            ncells = len( subtable ) * 2
            io.write_uint32( index )
            io.write_uint32( ncells )
            # For each cell in the subtable, a hash and a pointer to a value
            index += ncells * 8
        # Leave fp at end of cdb
        io.seek( end_offset )
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。