Context Navigation

cdb.py @ 3

リビジョン 3, 4.4 KB (コミッタ: kohda, 14 年前)
Install Unix tools http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号
1	from UserDict import DictMixin
2
3	from bx.misc.binary_file import BinaryFileReader, BinaryFileWriter
4	import numpy
5	import sys
6
def cdbhash( s ):
    """
    Return the 32 bit hash of `s` used to place keys in the CDB tables.

    Starting from 5381, every element of `s` is folded in as
    ``h = ( ( h * 33 ) ^ c )`` truncated to 32 bits, so an empty `s`
    hashes to 5381.

    `s` may be a string (each character contributes its `ord`) or a
    sequence that already yields integers when iterated, such as Python 3
    `bytes`.
    """
    h = 5381
    for c in s:
        # Iterating Python 3 bytes yields ints, iterating a string yields
        # one-character strings that still need converting
        if not isinstance( c, int ):
            c = ord( c )
        # ( h << 5 ) + h is h * 33; the mask keeps the value within uint32
        h = ( ( ( h << 5 ) + h ) ^ c ) & 0xffffffff
    return h
9
class FileCDBDict( DictMixin ):
    """
    For accessing a CDB structure on disk. Read only. Currently only supports
    access by key (getitem).

    Layout, as produced by `to_file` and consumed by `__getitem__`:

    - a fixed size header of 256 ( subtable offset, subtable cell count )
      pairs of uint32, indexed by the 8 least significant bits of the key
      hash;
    - the records, each written as key length and value length (both
      uint32) followed by the key and then the value;
    - the subtables, open addressed hash tables made of ( hash, record
      offset ) uint32 cells, where a record offset of 0 marks an empty cell.

    Offsets are the positions reported by `io.tell()` while writing and are
    handed unchanged to `io.seek()` while reading.

    NOTE: The keys method could be implemented by scanning the main table.
    """
    def __init__( self, file, is_little_endian=True ):
        """
        Wrap `file` in a BinaryFileReader and load the 256 entry header,
        starting at the reader's current position.
        """
        # TODO: Deal with endianess
        self.io = BinaryFileReader( file, is_little_endian=is_little_endian )
        self.header_offset = self.io.tell()
        # Read the whole header (only 2k)
        self.header = []
        for i in range( 256 ):
            self.header.append( ( self.io.read_uint32(), self.io.read_uint32() ) )

    def __getitem__( self, key ):
        """
        Return the value stored for `key`.

        Raises KeyError( key ) when no record with that key is found.
        """
        key_hash = cdbhash( key )
        # Find position of subtable using 8 LSBs of hash
        subtable_offset, subtable_size = self.header[ key_hash % 256 ]
        if subtable_size == 0:
            raise KeyError( key )
        # Seek into subtable and look for match, probing linearly (with
        # wrap around) from the cell selected by the remaining hash bits
        start = ( key_hash >> 8 )
        for i in range( subtable_size ):
            offset = subtable_offset + ( ( start + i ) % subtable_size ) * 8
            self.io.seek( offset )
            cell_hash = self.io.read_uint32()
            pair_offset = self.io.read_uint32()
            # Hit an empty bin, no match for key
            if pair_offset == 0:
                raise KeyError( key )
            # Hash matches, need to check full key
            if cell_hash == key_hash:
                self.io.seek( pair_offset )
                klen = self.io.read_uint32()
                vlen = self.io.read_uint32()
                if self.io.read( klen ) == key:
                    return self.io.read( vlen )
        # Visited every slot and no match (should never happen since
        # there are empty slots by construction)
        raise KeyError( key )

    @classmethod
    def to_file( Class, dict, file, is_little_endian=True ):
        """
        Write the items of `dict` to `file` as a CDB structure, starting at
        the current position of the writer wrapped around `file`.

        The records are written first (leaving room for the header), then
        the subtables, and finally the header is filled in. The writer is
        left positioned at the end of the structure. Returns nothing.

        NOTE: This requires the key and value be byte strings, support for
              dealing with encoding specific value types should be
              added to this wrapper
        """
        io = BinaryFileWriter( file, is_little_endian=is_little_endian )
        start_offset = io.tell()
        # Header is of fixed length
        io.seek( start_offset + ( 8 * 256 ) )
        # For each item, key and value length (written as length prefixed
        # strings). We also calculate the subtables on this pass.
        subtables = [ [] for i in range(256) ]
        for key, value in dict.iteritems():
            pair_offset = io.tell()
            io.write_uint32( len( key ) )
            io.write_uint32( len( value ) )
            io.write( key )
            io.write( value )
            key_hash = cdbhash( key )
            subtables[ key_hash % 256 ].append( ( key_hash, pair_offset ) )
        # Save the offset where the subtables will start
        subtable_offset = io.tell()
        # Write subtables
        for subtable in subtables:
            if len( subtable ) > 0:
                # Construct hashtable to be twice the size of the number
                # of items in the subtable, and build it in memory
                ncells = len( subtable ) * 2
                cells = [ (0,0) for i in range( ncells ) ]
                for key_hash, pair_offset in subtable:
                    index = ( key_hash >> 8 ) % ncells
                    # Guaranteed to find an empty cell, the table is only
                    # ever half full
                    while cells[index][1] != 0:
                        index = ( index + 1 ) % ncells
                    cells[index] = ( key_hash, pair_offset )
                # Write subtable
                for cell_hash, cell_offset in cells:
                    io.write_uint32( cell_hash )
                    io.write_uint32( cell_offset )
        # Go back and write the header
        end_offset = io.tell()
        io.seek( start_offset )
        index = subtable_offset
        for subtable in subtables:
            ncells = len( subtable ) * 2
            io.write_uint32( index )
            io.write_uint32( ncells )
            # For each cell in the subtable, a hash and a pointer to a value
            index += ncells * 8
        # Leave fp at end of cdb
        io.seek( end_offset )

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/misc/cdb.py @ 3

異なるフォーマットでダウンロード: