| 1 | """ |
|---|
| 2 | Semi-random access to bz2 compressed data. |
|---|
| 3 | """ |
|---|
| 4 | |
|---|
| 5 | import os |
|---|
| 6 | import bisect |
|---|
| 7 | import sys |
|---|
| 8 | |
|---|
| 9 | from _seekbzip2 import SeekBzip2 |
|---|
| 10 | |
|---|
class SeekableBzip2File(object):
    """
    Filelike object supporting read-only semi-random access to bz2 compressed
    files for which an offset table (bz2t) has been generated by `bzip-table`.

    The object tracks a *virtual* position (`self.pos`) in the uncompressed
    data.  Seeking only updates that position and sets `self.dirty`; the
    underlying `SeekBzip2` stream is repositioned lazily by `fix_dirty` the
    next time data is read.
    """

    def __init__(self, filename, table_filename, **kwargs):
        """
        Open `filename` (bz2 data) using the offset table in `table_filename`.

        Extra keyword arguments are accepted and ignored.
        """
        self.filename = filename
        self.table_filename = table_filename
        self.init_table()
        self.init_bz2()
        # Virtual position in the uncompressed data
        self.pos = 0
        # The underlying stream has not been positioned yet, so force a
        # seek before the first read
        self.dirty = True

    def init_bz2(self):
        """Create the underlying `SeekBzip2` reader for `self.filename`."""
        self.seek_bz2 = SeekBzip2(self.filename)

    def init_table(self):
        """
        Load the offset table and compute the total uncompressed size.

        Each line of the table holds two whitespace separated integers: the
        position of a compressed block in the bz2 file and the length of
        that block when uncompressed.  Fills `table_positions`,
        `table_bz2positions` and `size`.
        """
        # Position in plaintext file
        self.table_positions = []
        # Position of corresponding block in bz2 file (bits)
        self.table_bz2positions = []
        pos = 0
        # Use a context manager so the table file is closed deterministically
        with open(self.table_filename) as table:
            for line in table:
                fields = line.split()
                # Position of the compressed block in the bz2 file
                bz2_pos = int(fields[0])
                # Length of the block when uncompressed
                length = int(fields[1])
                self.table_positions.append(pos)
                self.table_bz2positions.append(bz2_pos)
                old_pos = pos
                pos = pos + length
                # Every block must contribute at least one byte, otherwise
                # the binary search in `get_chunk_and_offset` is ambiguous
                assert pos > old_pos
        self.size = pos

    def close(self):
        """Close the underlying bz2 reader."""
        self.seek_bz2.close()

    def fix_dirty(self):
        """
        Bring the underlying stream in sync with the virtual position.

        Seeks to the start of the block containing `self.pos` and then reads
        and discards bytes up to the position inside that block.
        """
        # Our virtual position in the uncompressed data is out of sync
        # FIXME: If we're moving to a later position that is still in
        #        the same block, we could just read and throw out bytes in the
        #        compressed stream, less wasteful than backtracking
        chunk, offset = self.get_chunk_and_offset(self.pos)
        # Get the seek position for that chunk and seek to it
        bz2_seek_pos = self.table_bz2positions[chunk]
        self.seek_bz2.seek(bz2_seek_pos)
        # Consume bytes to move to the correct position.  The read must happen
        # outside the assert: asserts are stripped under `python -O`, which
        # would otherwise skip the read and leave the stream at block start.
        skipped = self.seek_bz2.read(offset)
        assert len(skipped) == offset
        # Update state
        self.dirty = False

    def read(self, sizehint=-1):
        """
        Read uncompressed data from the current position.

        With a negative `sizehint` (the default) everything up to the end of
        the data is read and returned as one string; otherwise the result of
        a single underlying read of `sizehint` bytes is returned.  Returns
        an empty string at end of file.
        """
        if sizehint < 0:
            chunks = []
            while True:
                # Pull data in 1MB pieces until the reader reports EOF
                val = self._read(1024 * 1024)
                if not val:
                    break
                chunks.append(val)
            return "".join(chunks)
        return self._read(sizehint)

    def _read(self, size):
        """Perform one read of `size` bytes and advance the virtual position."""
        if self.dirty:
            self.fix_dirty()
        val = self.seek_bz2.read(size)
        if val is None:
            # EOF
            self.pos = self.size
            val = ""
        else:
            self.pos = self.pos + len(val)
        return val

    def readline(self, size=-1):
        """
        Read one line from the current position and advance the virtual
        position.  Returns an empty string at end of file.
        """
        if self.dirty:
            self.fix_dirty()
        val = self.seek_bz2.readline(size)
        if val is None:
            # EOF
            self.pos = self.size
            val = ""
        else:
            self.pos = self.pos + len(val)
        return val

    def tell(self):
        """Return the current virtual position in the uncompressed data."""
        return self.pos

    def get_chunk_and_offset(self, position):
        """
        Return `(chunk, offset)` for an uncompressed `position`: the index of
        the table entry whose block contains it and the distance from the
        start of that block.
        """
        # Find the chunk that position is in using a binary search
        chunk = bisect.bisect(self.table_positions, position) - 1
        offset = position - self.table_positions[chunk]
        return chunk, offset

    def seek(self, offset, whence=0):
        """
        Move the virtual position; the real seek is deferred to the next read.

        `whence` 0 is absolute, 1 is relative to the current position and 2
        is relative to the end.  NOTE: for `whence == 2` the offset is
        *subtracted* from the size (the opposite sign convention to builtin
        files); this is kept as is for backward compatibility.

        Raises `Exception` for an unknown `whence` and `AssertionError` when
        the target lies outside `[0, size)`.
        """
        # Determine absolute target position
        if whence == 0:
            target_pos = offset
        elif whence == 1:
            target_pos = self.pos + offset
        elif whence == 2:
            target_pos = self.size - offset
        else:
            raise Exception("Invalid `whence` argument: %r" % (whence,))
        # Check if this is a noop
        if target_pos == self.pos:
            return
        # Verify it is valid
        assert 0 <= target_pos < self.size, "Attempt to seek outside file"
        # Move the position
        self.pos = target_pos
        # Mark as dirty, the next time a read is done we need to actually
        # move the position in the bzip2 file
        self.dirty = True

    # ---- File like methods ------------------------------------------------

    def next(self):
        """Return the next line, raising `StopIteration` at end of file."""
        ln = self.readline()
        if ln == "":
            raise StopIteration()
        return ln

    # Python 3 spells the iterator protocol method `__next__`
    __next__ = next

    def __iter__(self):
        return self

    def readlines(self, sizehint=-1):
        """Return all remaining lines as a list (`sizehint` is ignored)."""
        return list(self)

    def xreadlines(self):
        """Return an iterator over the remaining lines."""
        return iter(self)
|---|