| 1 | """ |
|---|
| 2 | Classes for index files that map genomic intervals to values. |
|---|
| 3 | |
|---|
| 4 | :Authors: James Taylor (james@bx.psu.edu), Bob Harris (rsharris@bx.psu.edu) |
|---|
| 5 | |
|---|
| 6 | An interval index file maps genomic intervals to values. |
|---|
| 7 | |
|---|
| 8 | This implementation writes version 1 file format, and reads versions 0 and 1. |
|---|
| 9 | |
|---|
| 10 | Index File Format |
|---|
| 11 | ----------------- |
|---|
| 12 | |
|---|
| 13 | All fields are in big-endian format (most significant byte first). |
|---|
| 14 | |
|---|
| 15 | All intervals are origin-zero, inclusive start, exclusive end. |
|---|
| 16 | |
|---|
| 17 | The file begins with an index file header, then is immediately followed |
|---|
| 18 | by an index table. The index table points to index headers, and index |
|---|
| 19 | headers point to bins. Index headers and bins are referenced via pointers |
|---|
| 20 | (file offsets), and can be placed more or less anywhere in the file. |
|---|
| 21 | |
|---|
| 22 | File header |
|---|
| 23 | ~~~~~~~~~~~ |
|---|
| 24 | |
|---|
| 25 | ============ =========== ================================================= |
|---|
| 26 | offset 0x00: 2C FF 80 0A magic number |
|---|
| 27 | offset 0x04: 00 00 00 01 version (00 00 00 00 is also supported) |
|---|
| 28 | offset 0x08: 00 00 00 2A (N) number of index sets |
|---|
| 29 | offset 0x0C: ... index table |
|---|
| 30 | ============ =========== ================================================= |
|---|
| 31 | |
|---|
| 32 | Index table |
|---|
| 33 | ~~~~~~~~~~~ |
|---|
| 34 | |
|---|
| 35 | The index table is a list of N index headers, packed sequentially and |
|---|
| 36 | sorted by name. The first begins at offset 0x0C. Each header describes |
|---|
| 37 | one set of intervals. |
|---|
| 38 | |
|---|
| 39 | ============ =========== ================================================= |
|---|
| 40 | offset: xx xx xx xx (L) length of index src name |
|---|
| 41 | offset+4: ... index src name (e.g. canFam1.chr1) |
|---|
| 42 | offset+4+L: xx xx xx xx offset (in this file) to index data |
|---|
| 43 | offset+8+L: xx xx xx xx (B) number of bytes in each value; for version |
|---|
| 44 | 0, this field is absent, and B is assumed to be 4 |
|---|
| 45 | ============ =========== ================================================= |
|---|
| 46 | |
|---|
| 47 | Index data |
|---|
| 48 | ~~~~~~~~~~ |
|---|
| 49 | |
|---|
| 50 | The index data (for one index table) consists of the overall range of |
|---|
| 51 | intervals followed by an array of pointers to bins. The length of the |
|---|
| 52 | array is 1+binForRange(maxEnd-1,maxEnd), where maxEnd is the maximum |
|---|
| 53 | interval end. |
|---|
| 54 | |
|---|
| 55 | ============ =========== ================================================= |
|---|
| 56 | offset: xx xx xx xx minimum interval start |
|---|
| 57 | offset+4: xx xx xx xx maximum interval end |
|---|
| 58 | offset+8: xx xx xx xx offset (in this file) to bin 0 |
|---|
| 59 | offset+12: xx xx xx xx number of intervals in bin 0 |
|---|
| 60 | offset+16: xx xx xx xx offset (in this file) to bin 1 |
|---|
| 61 | offset+20: xx xx xx xx number of intervals in bin 1 |
|---|
| 62 | ... ... ... |
|---|
| 63 | ============ =========== ================================================= |
|---|
| 64 | |
|---|
| 65 | Bin |
|---|
| 66 | ~~~ |
|---|
| 67 | |
|---|
| 68 | A bin is an array of (start,end,val), sorted by increasing start (with |
|---|
| 69 | end and val as tiebreakers). Note that bins may be empty (the number of |
|---|
| 70 | intervals indicated in the index data is zero). Note that B is determined |
|---|
| 71 | from the appropriate entry in the index table. |
|---|
| 72 | |
|---|
| 73 | ============ =========== ================================================= |
|---|
| 74 | offset: xx xx xx xx start for interval 1 |
|---|
| 75 | offset+4: xx xx xx xx end for interval 1 |
|---|
| 76 | offset+8: ... (B bytes) value for interval 1 |
|---|
| 77 | offset+8+B: xx xx xx xx start for interval 2 |
|---|
| 78 | offset+12+B: xx xx xx xx end for interval 2 |
|---|
| 79 | offset+16+B: ... (B bytes) value for interval 2 |
|---|
| 80 | ... ... ... |
|---|
| 81 | ============ =========== ================================================= |
|---|
| 82 | """ |
|---|
| 83 | |
|---|
| 84 | from bisect import * |
|---|
| 85 | from struct import * |
|---|
| 86 | |
|---|
| 87 | from bx.misc import filecache |
|---|
| 88 | |
|---|
# Optional seekable-compression support. These modules may be absent
# depending on how bx was built; when missing, the corresponding file
# types are rejected at open time (see AbstractIndexedAccess.__init__).
# Catch only ImportError -- a bare `except:` would also hide real bugs
# (syntax errors, KeyboardInterrupt) inside those modules.
try:
    from bx.misc import seekbzip2
except ImportError:
    seekbzip2 = None

try:
    from bx.misc import seeklzop
except ImportError:
    seeklzop = None
|---|
| 98 | |
|---|
| 99 | import os.path |
|---|
| 100 | |
|---|
| 101 | __all__ = [ 'Indexes', 'Index' ] |
|---|
| 102 | |
|---|
MAGIC = 0x2cff800a
VERSION = 2

# These three constants determine the structure of the default binning strategy
BIN_LEVELS = 6        # Number of levels of bins to build
BIN_FIRST_SHIFT = 17  # Number of bits for the bottom level bin
BIN_NEXT_SHIFT = 3    # Number of bits for each higher level bin

# Build offset and max size arrays for each bin level (largest level first)
BIN_OFFSETS = [ 1, 0 ]
BIN_OFFSETS_MAX = [ ( 1 << BIN_FIRST_SHIFT << BIN_NEXT_SHIFT ), ( 1 << BIN_FIRST_SHIFT ) ]
for i in range( BIN_LEVELS - 2 ):
    BIN_OFFSETS.insert( 0, ( 2 ** (3*(i+1)) ) + BIN_OFFSETS[0] )
    BIN_OFFSETS_MAX.insert( 0, ( BIN_OFFSETS_MAX[0] << BIN_NEXT_SHIFT ) )
# The maximum size for the top bin is actually bigger than the signed integers
# we use to store positions in the file, so use +infinity as an explicit
# "no limit" sentinel. (The original code stored the builtin function `max`
# here, which compared as larger-than-any-int only under Python 2's
# mixed-type ordering and breaks outright on Python 3.)
BIN_OFFSETS_MAX[ 0 ] = float( "inf" )

# Constants for the minimum and maximum size of the overall interval
MIN = 0
OLD_MAX = 512*1024*1024      # Maximum size supported by versions < 2
DEFAULT_MAX = 512*1024*1024  # Default max size to use when none is passed
MAX = ( 2 ** 31 )            # Absolute max size (limited by file format)
|---|
| 126 | |
|---|
def offsets_for_max_size( max_size ):
    """
    Return the subset of offsets needed to contain intervals over (0,max_size).

    Raises an Exception when max_size exceeds every bin level (i.e. what the
    file format can represent).
    """
    # BIN_OFFSETS_MAX is ordered largest level first; scan smallest-first
    # for the first level big enough to hold max_size.
    # (Loop variable renamed from `max`, which shadowed the builtin.)
    for i, level_max in enumerate( reversed( BIN_OFFSETS_MAX ) ):
        if max_size < level_max:
            break
    else:
        # for/else: loop finished without break, so no level is large enough
        raise Exception( "%d is larger than the maximum possible size (%d)" % ( max_size, BIN_OFFSETS_MAX[0] ) )
    return BIN_OFFSETS[ ( len(BIN_OFFSETS) - i - 1 ) : ]
|---|
| 137 | |
|---|
def bin_for_range( start, end, offsets=None ):
    """Find the smallest bin that can contain interval (start,end).

    Raises ValueError when the interval cannot be binned with `offsets`.
    """
    if offsets is None:
        offsets = BIN_OFFSETS
    # end is exclusive, so the last covered position is end - 1
    start_bin, end_bin = start, end - 1
    start_bin >>= BIN_FIRST_SHIFT
    end_bin >>= BIN_FIRST_SHIFT
    for offset in offsets:
        # When both endpoints land in the same bin at this level, that bin
        # contains the whole interval
        if start_bin == end_bin:
            return offset + start_bin
        else:
            start_bin >>= BIN_NEXT_SHIFT
            end_bin >>= BIN_NEXT_SHIFT
    # Was `raise "Interval (%d,%d) out of range"` -- a Python 2 string
    # exception whose message was never even formatted; raise properly.
    raise ValueError( "Interval (%d,%d) out of range" % ( start, end ) )
|---|
| 152 | |
|---|
class AbstractMultiIndexedAccess( object ):
    """
    Allows accessing multiple indexes / files as if they were one
    """
    # Subclasses set this to the single-file accessor class to instantiate
    indexed_access_class = None

    def __init__( self, filenames, index_filenames=None, keep_open=False, use_cache=False, **kwargs ):
        # TODO: Handle index_filenames argument
        self.indexes = []
        for fname in filenames:
            accessor = self.new_indexed_access( fname, keep_open=keep_open, use_cache=use_cache, **kwargs )
            self.indexes.append( accessor )

    def new_indexed_access( self, data_filename, index_filename=None, keep_open=False, **kwargs ):
        """Build one underlying accessor (factored out so subclasses can override)"""
        return self.indexed_access_class( data_filename, index_filename, keep_open, **kwargs )

    def get( self, src, start, end ):
        """Return every block overlapping (start,end) in src, as a list"""
        return list( self.get_as_iterator( src, start, end ) )

    def get_as_iterator( self, src, start, end ):
        """Iterate over just the blocks, discarding index and offset"""
        for block, _idx, _offset in self.get_as_iterator_with_index_and_offset( src, start, end ):
            yield block

    def get_as_iterator_with_index_and_offset( self, src, start, end ):
        """Chain the (block, index, offset) results of every underlying index"""
        for index in self.indexes:
            for block, idx, offset in index.get_as_iterator_with_index_and_offset( src, start, end ):
                yield block, idx, offset

    def close( self ):
        """Close every underlying accessor"""
        for index in self.indexes:
            index.close()
|---|
| 176 | |
|---|
class AbstractIndexedAccess( object ):
    """Indexed access to a data using overlap queries, requires an index file"""

    def __init__( self, data_filename, index_filename=None, keep_open=False, use_cache=False, **kwargs ):
        # Extra kwargs are stored and passed through to read_at_current_offset
        self.data_kwargs = kwargs
        self.data_filename = data_filename
        if data_filename.endswith( ".bz2" ):
            if seekbzip2 is None:
                raise Exception( "Trying to open .bz2 file but no seekbzip2 module found")
            # Seekable bzip2 requires a companion ".bz2t" block-table file
            table_filename = data_filename + "t"
            self.table_filename = table_filename
            if not os.path.exists( table_filename ):
                raise Exception( "Cannot find bz2t file for: " + data_filename )
            self.file_type = "bz2t"
            # Strip .bz2 from the filename before adding ".index"
            data_filename_root = data_filename[:-4]
        elif data_filename.endswith( ".lzo" ):
            if seeklzop is None:
                raise Exception( "Trying to open .lzo file but no seeklzop module found")
            # Seekable lzo likewise requires a companion ".lzot" table file
            table_filename = data_filename + "t"
            self.table_filename = table_filename
            if not os.path.exists( table_filename ):
                raise Exception( "Cannot find lzot file for: " + data_filename )
            self.file_type = "lzot"
            # Strip .lzo from the filename before adding ".index"
            data_filename_root = data_filename[:-4]
        else:
            self.file_type = "plain"
            data_filename_root = data_filename
        # Open index
        if index_filename is None:
            index_filename = data_filename_root + ".index"
        self.indexes = Indexes( filename=index_filename )
        # Use a file cache?
        self.use_cache = use_cache
        # Open now? Otherwise a fresh handle is opened per get_at_offset call
        if keep_open:
            self.f = self.open_data()
        else:
            self.f = None

    def close( self ):
        """Close the kept-open data file, if any"""
        if self.f:
            self.f.close()
            self.f = None

    def open_data( self ):
        """Open the underlying data file, wrapped in the appropriate
        seekable-decompression layer (and optional cache) for its type"""
        if self.file_type == "plain":
            # NOTE(review): text-mode open, as in the rest of this
            # Python-2-era module
            return open( self.data_filename )
        elif self.file_type == "bz2t":
            f = seekbzip2.SeekableBzip2File( self.data_filename, self.table_filename )
            if self.use_cache:
                return filecache.FileCache( f, f.size )
            else:
                return f
        elif self.file_type == "lzot":
            # seeklzop does its own block caching; 20 blocks when caching
            # is requested, 0 otherwise
            if self.use_cache:
                block_cache_size = 20
            else:
                block_cache_size = 0
            f = seeklzop.SeekableLzopFile( self.data_filename,
                                           self.table_filename,
                                           block_cache_size = block_cache_size )
            return f

    def get( self, src, start, end ):
        """Return all records overlapping (start,end) in src, as a list"""
        return [ val for val in self.get_as_iterator( src, start, end ) ]
    def get_as_iterator( self, src, start, end ):
        """Iterate over just the records, discarding index and offset"""
        for val, index, offset in self.get_as_iterator_with_index_and_offset( src, start, end ):
            yield val
    def get_as_iterator_with_index_and_offset( self, src, start, end ):
        # The indexed value `val` is the record's offset in the data file
        for val_start, val_end, val in self.indexes.find( src, start, end ):
            yield self.get_at_offset( val ), self, val

    def get_at_offset( self, offset ):
        """Read one record at `offset`, reusing the kept-open handle when
        present, otherwise opening and closing a fresh one"""
        if self.f:
            self.f.seek( offset )
            return self.read_at_current_offset( self.f, **self.data_kwargs )
        else:
            f = self.open_data()
            try:
                f.seek( offset )
                return self.read_at_current_offset( f, **self.data_kwargs )
            finally:
                f.close()

    def read_at_current_offset( self, file, **kwargs ):
        """Parse and return one record from `file` -- subclasses implement"""
        raise TypeError( "Abstract Method" )
|---|
| 265 | |
|---|
| 266 | class Indexes: |
|---|
| 267 | """A set of indexes, each identified by a unique name""" |
|---|
| 268 | |
|---|
| 269 | def __init__( self, filename=None ): |
|---|
| 270 | self.indexes = dict() |
|---|
| 271 | if filename is not None: self.open( filename ) |
|---|
| 272 | |
|---|
| 273 | def add( self, name, start, end, val, max=DEFAULT_MAX ): |
|---|
| 274 | if name not in self.indexes: |
|---|
| 275 | self.indexes[name] = Index( max=max ) |
|---|
| 276 | self.indexes[name].add( start, end, val ) |
|---|
| 277 | |
|---|
| 278 | def get( self, name ): |
|---|
| 279 | if self.indexes[name] is None: |
|---|
| 280 | offset, value_size = self.offsets[name] |
|---|
| 281 | self.indexes[name] = Index( filename=self.filename, offset=offset, value_size=value_size, version=self.version ) |
|---|
| 282 | return self.indexes[name] |
|---|
| 283 | |
|---|
| 284 | def find( self, name, start, end ): |
|---|
| 285 | if name in self.indexes: |
|---|
| 286 | return self.get( name ).find( start, end ) |
|---|
| 287 | else: |
|---|
| 288 | return [] |
|---|
| 289 | |
|---|
| 290 | def open( self, filename ): |
|---|
| 291 | self.filename = filename |
|---|
| 292 | self.offsets = dict() # (will map key to (offset,value_size)) |
|---|
| 293 | f = open( filename ) |
|---|
| 294 | magic, version, length = read_packed( f, ">3I" ) |
|---|
| 295 | if magic != MAGIC: |
|---|
| 296 | raise "File does not have expected header" |
|---|
| 297 | if version > VERSION: |
|---|
| 298 | warn( "File claims version %d, I don't known anything about versions beyond %d. Attempting to continue", version, VERSION ) |
|---|
| 299 | self.version = version |
|---|
| 300 | for i in range( length ): |
|---|
| 301 | key_len = read_packed( f, ">I" ) |
|---|
| 302 | key = f.read( key_len ) |
|---|
| 303 | offset = read_packed( f, ">I" ) |
|---|
| 304 | if version == 0: |
|---|
| 305 | value_size = 4 |
|---|
| 306 | else: |
|---|
| 307 | value_size = read_packed( f, ">I" ) |
|---|
| 308 | assert value_size % 4 == 0, "unsupported value size: %s" % value_size |
|---|
| 309 | self.indexes[ key ] = None |
|---|
| 310 | self.offsets[ key ] = (offset,value_size) |
|---|
| 311 | f.close() |
|---|
| 312 | |
|---|
| 313 | def write( self, f ): |
|---|
| 314 | keys = self.indexes.keys() |
|---|
| 315 | keys.sort() |
|---|
| 316 | # First determine the size of the header |
|---|
| 317 | base = calcsize( ">3I" ) |
|---|
| 318 | for key in keys: |
|---|
| 319 | key = str( key ) |
|---|
| 320 | base += calcsize( ">I" ) |
|---|
| 321 | base += len( key ) |
|---|
| 322 | base += calcsize( ">2I" ) |
|---|
| 323 | # Now actually write the header |
|---|
| 324 | write_packed( f, ">3I", MAGIC, VERSION, len( self.indexes ) ) |
|---|
| 325 | # And write the index table |
|---|
| 326 | for key in keys: |
|---|
| 327 | key = str( key ) |
|---|
| 328 | # Write the string prefixed by its length (pascal!) |
|---|
| 329 | write_packed( f, ">I", len( key ) ) |
|---|
| 330 | f.write( key ) |
|---|
| 331 | # Write offset |
|---|
| 332 | write_packed( f, ">I", base ) |
|---|
| 333 | base += self.indexes[key].bytes_required() |
|---|
| 334 | # Write value size |
|---|
| 335 | write_packed( f, ">I", self.indexes[key].value_size ) |
|---|
| 336 | # And finally write each index in order |
|---|
| 337 | for key in keys: |
|---|
| 338 | self.indexes[key].write( f ) |
|---|
| 339 | |
|---|
class Index:
    """
    A single interval index: intervals hashed into hierarchical bins.
    When backed by a file, bin contents are loaded lazily on first access.
    """

    def __init__( self, min=MIN, max=DEFAULT_MAX, filename=None, offset=0, value_size=None, version=None ):
        # A _value_size of None means "derive it from the largest value added"
        self._value_size = value_size
        self.max_val = 1 # (1, rather than 0, to force value_size > 0)
        if filename is None:
            self.new( min, max )
        else:
            self.open( filename, offset, version )

    def get_value_size ( self ):
        """Bytes used to store each value: the explicit size from the file
        header if given, else sized by the largest value seen, rounded up
        to a multiple of 4"""
        if self._value_size != None:
            return self._value_size
        else:
            return round_up_to_4( bytes_of( self.max_val ) )
    value_size = property( fget=get_value_size )

    def new( self, min, max ):
        """Create an empty index for intervals in the range min, max"""
        # Ensure the range will fit given the shifting strategy
        assert MIN <= min <= max <= MAX
        self.min = min
        self.max = max
        # Determine offsets to use
        self.offsets = offsets_for_max_size( max )
        # Determine the largest bin we will actually use
        self.bin_count = bin_for_range( max - 1, max, offsets = self.offsets ) + 1
        # Create empty bins
        self.bins = [ [] for i in range( self.bin_count ) ]

    def open( self, filename, offset, version ):
        """Read this index's header (min/max and bin table) starting at
        `offset` in `filename`; bin contents load lazily via load_bin"""
        self.filename = filename
        self.offset = offset
        # Open the file and seek to where we expect our header
        # NOTE(review): text-mode open, kept from the Python 2 original;
        # the data is binary
        f = open( filename )
        f.seek( offset )
        # Read min/max
        min, max = read_packed( f, ">2I" )
        self.new( min, max )
        # Decide how many levels of bins based on 'max'
        # NOTE(review): bin_count was already computed by self.new() from
        # the offsets for `max`; for version < 2 the offsets are replaced
        # below without recomputing bin_count -- confirm this matches the
        # on-disk bin table for old files
        if version < 2:
            # Prior to version 2 all files used the bins for 512MB
            self.offsets = offsets_for_max_size( OLD_MAX - 1 )
        else:
            self.offsets = offsets_for_max_size( max )
        # Read bin indexes
        self.bin_offsets = []
        self.bin_sizes = []
        for i in range( self.bin_count ):
            o, s = read_packed( f, ">2I" )
            self.bin_offsets.append( o )
            self.bin_sizes.append( s )
        # Initialize bins to None, indicating that they need to be loaded
        self.bins = [ None for i in range( self.bin_count ) ]

    def add( self, start, end, val ):
        """Add the interval (start,end) with associated value val to the index"""
        insort( self.bins[ bin_for_range( start, end, offsets=self.offsets ) ], ( start, end, val ) )
        assert val >= 0
        # Track the largest value so value_size can be derived when writing
        self.max_val = max(self.max_val,val)

    def find( self, start, end ):
        """Return a sorted list of (start,end,val) tuples overlapping
        (start,end), checking every bin level that could contain them"""
        rval = []
        # Clamp the query to this index's own range before binning
        start_bin = ( max( start, self.min ) ) >> BIN_FIRST_SHIFT
        end_bin = ( min( end, self.max ) - 1 ) >> BIN_FIRST_SHIFT
        for offset in self.offsets:
            for i in range( start_bin + offset, end_bin + offset + 1 ):
                if self.bins[i] is None: self.load_bin( i )
                # Iterate over bin and insert any overlapping elements into return value
                for el_start, el_end, val in self.bins[i]:
                    if el_start < end and el_end > start:
                        # insort_right keeps rval sorted as we accumulate
                        insort_right( rval, ( el_start, el_end, val ) )
            start_bin >>= BIN_NEXT_SHIFT
            end_bin >>= BIN_NEXT_SHIFT
        return rval

    def iterate( self ):
        """Yield every (start,end,val) in the index, bin by bin"""
        for i in range( self.bin_count ):
            if self.bins[i] is None: self.load_bin( i )
            for entry in self.bins[i]: yield entry

    def load_bin( self, index ):
        """Read the contents of bin `index` from the backing file into
        self.bins[index]"""
        bin = []
        # Empty bins need no file access
        if self.bin_sizes[index] == 0:
            self.bins[index] = bin
            return
        f = open( self.filename )
        f.seek( self.bin_offsets[index] )
        # One big read for happy NFS
        item_size = self.value_size + calcsize( ">2I" )
        buffer = f.read( self.bin_sizes[index] * item_size )
        for i in range( self.bin_sizes[index] ):
            # Each item: 4-byte start, 4-byte end, value_size-byte value
            start, end = unpack( ">2I", buffer[ i*item_size : i*item_size+8 ] )
            val = unpack_uints( buffer[ i*item_size+8 : (i+1)*item_size ] )
            bin.append( (start, end, val) )
        self.bins[index] = bin
        f.close()

    def write( self, f ):
        """Write this index (min/max, bin offset/size table, then bin
        contents) to file-like object `f`"""
        value_size = self.value_size
        item_size = value_size + calcsize( ">2I" )
        # Write min/max
        write_packed( f, ">2I", self.min, self.max )
        # Write table of bin sizes and offsets
        base = f.tell() + self.bin_count * calcsize( ">2I" )
        for bin in self.bins:
            write_packed( f, ">2I", base, len( bin ) )
            base += len( bin ) * item_size
        # Write contents of each bin
        for bin in self.bins:
            for start, end, val in bin:
                write_packed( f, ">2I", start, end )
                write_packed_uints( f, val, value_size )

    def bytes_required( self ):
        """Number of bytes this index will occupy when written (header +
        bin table + bin contents); used to pre-compute table offsets"""
        item_size = self.value_size + calcsize( ">2I" )
        rval = calcsize( ">2I" )
        rval += self.bin_count * calcsize( ">2I" )
        for bin in self.bins:
            rval += len( bin ) * item_size
        return rval
|---|
| 461 | |
|---|
def write_packed( f, pattern, *vals ):
    """Pack `vals` using struct format `pattern` and write the result to `f`."""
    data = pack( pattern, *vals )
    f.write( data )
|---|
| 464 | |
|---|
def read_packed( f, pattern ):
    """Read exactly calcsize(pattern) bytes from `f` and unpack them.

    A single-element result is returned unwrapped; otherwise a tuple.
    """
    values = unpack( pattern, f.read( calcsize( pattern ) ) )
    return values[0] if len( values ) == 1 else values
|---|
| 469 | |
|---|
def write_packed_uints( f, v, num_bytes ):
    """Write non-negative integer `v` to `f` as big-endian 32-bit chunks.

    `num_bytes` is expected to be a positive multiple of 4 (callers size it
    with round_up_to_4); chunks are written most-significant first.
    """
    if num_bytes < 4:
        # Defensive: sizes are rounded up to multiples of 4 elsewhere, so a
        # single 32-bit word is the minimum ever written.
        write_packed( f, ">I", v )
    else:
        parts = []
        while num_bytes > 0:
            # Mask was 0xFFFFFFFFL -- the Python 2 long literal is a syntax
            # error on Python 3; plain ints are arbitrary precision anyway.
            parts.append( v & 0xFFFFFFFF )
            v >>= 32
            num_bytes -= 4
        parts.reverse() # (write most-significant chunk first)
        write_packed( f, ">%dI" % len( parts ), *parts )
|---|
| 481 | |
|---|
def unpack_uints( parts ):
    """Decode big-endian bytes `parts` (length a multiple of 4) into one
    unsigned integer, most-significant 32-bit chunk first."""
    # Floor division: "/" is true division on Python 3 and would hand
    # struct a float repeat count.
    chunks = len( parts ) // 4
    vals = unpack( ">%dI" % chunks, parts )
    val = vals[0]
    for v in vals[1:]:
        val = (val << 32) + v
    return val
|---|
| 489 | |
|---|
def bytes_of( v ):
    """Return how many bytes are needed to represent positive integer `v`."""
    assert v > 0
    count = 0
    while v:
        count += 1
        v >>= 8
    return count
|---|
| 497 | |
|---|
def round_up_to_4( v ):
    """Round `v` up to the nearest multiple of 4 (multiples are unchanged)."""
    # (-v) % 4 is the gap to the next multiple of 4, and 0 when aligned
    return v + ( -v % 4 )
|---|
| 503 | |
|---|