""" Semi-random access to bz2 compressed data. """ import os import bisect import sys from _seekbzip2 import SeekBzip2 class SeekableBzip2File( object ): """ Filelike object supporting read-only semi-random access to bz2 compressed files for which an offset table (bz2t) has been generated by `bzip-table`. """ def __init__( self, filename, table_filename, **kwargs ): self.filename = filename self.table_filename = table_filename self.init_table() self.init_bz2() self.pos = 0 self.dirty = True def init_bz2( self ): self.seek_bz2 = SeekBzip2( self.filename ) def init_table( self ): # Position in plaintext file self.table_positions = [] # Position of corresponding block in bz2 file (bits) self.table_bz2positions = [] pos = 0 for line in open( self.table_filename ): fields = line.split() # Position of the compressed block in the bz2 file bz2_pos = int( fields[0] ) # print >> sys.stderr, fields[0], bz2_pos # Length of the block when uncompressed length = int( fields[1] ) self.table_positions.append( pos ) self.table_bz2positions.append( bz2_pos ) old_pos = pos pos = pos + length assert pos > old_pos self.size = pos #print >> sys.stderr, self.size #print >> sys.stderr, self.table_bz2positions def close( self ): self.seek_bz2.close() def fix_dirty( self ): # Our virtual position in the uncompressed data is out of sync # FIXME: If we're moving to a later position that is still in # the same block, we could just read and throw out bytes in the # compressed stream, less wasteful then backtracking chunk, offset = self.get_chunk_and_offset( self.pos ) # Get the seek position for that chunk and seek to it bz2_seek_pos = self.table_bz2positions[chunk] # print >>sys.stderr, "bz2 seek pos:", bz2_seek_pos self.seek_bz2.seek( bz2_seek_pos ) # Consume bytes to move to the correct position assert len( self.seek_bz2.read( offset ) ) == offset # Update state self.dirty = False def read( self, sizehint=-1 ): if sizehint < 0: chunks = [] while 1: self._read( 1024*1024 ) if val: chunks.append( val ) else: break return "".join( chunks ) else: return self._read( sizehint ) def _read( self, size ): if self.dirty: self.fix_dirty() val = self.seek_bz2.read( size ) if val is None: # EOF self.pos = self.size val = "" else: self.pos = self.pos + len( val ) return val def readline( self, size=-1 ): if self.dirty: self.fix_dirty() val = self.seek_bz2.readline( size ) if val is None: # EOF self.pos = self.size val = "" else: self.pos = self.pos + len( val ) return val def tell( self ): return self.pos def get_chunk_and_offset( self, position ): # Find the chunk that position is in using a binary search chunk = bisect.bisect( self.table_positions, position ) - 1 offset = position - self.table_positions[chunk] return chunk, offset def seek( self, offset, whence=0 ): # Determine absolute target position if whence == 0: target_pos = offset elif whence == 1: target_pos = self.pos + offset elif whence == 2: target_pos = self.size - offset else: raise Exception( "Invalid `whence` argument: %r", whence ) # Check if this is a noop if target_pos == self.pos: return # Verify it is valid assert 0 <= target_pos < self.size, "Attempt to seek outside file" # Move the position self.pos = target_pos # Mark as dirty, the next time a read is done we need to actually # move the position in the bzip2 file self.dirty = True # ---- File like methods ------------------------------------------------ def next(self): ln = self.readline() if ln == "": raise StopIteration() return ln def __iter__(self): return self def readlines(self,sizehint=-1): return [ln for ln in self] def xreadlines(self): return iter(self)