| 1 | """ |
|---|
| 2 | Semi-random access to LZO (lzop) compressed data. |
|---|
| 3 | """ |
|---|
| 4 | |
|---|
| 5 | import os |
|---|
| 6 | import bisect |
|---|
| 7 | import sys |
|---|
| 8 | try: |
|---|
| 9 | import pkg_resources |
|---|
| 10 | pkg_resources.require( 'python_lzo' ) |
|---|
| 11 | except: |
|---|
| 12 | pass |
|---|
| 13 | import lzo |
|---|
| 14 | import struct |
|---|
| 15 | |
|---|
| 16 | from bx_extras import lrucache |
|---|
| 17 | from cStringIO import StringIO |
|---|
| 18 | |
|---|
class SeekableLzopFile(object):
    """
    File-like object supporting read-only semi-random access to LZO
    compressed files for which a block offset table has been generated.

    The uncompressed stream is addressed as a sequence of fixed-size blocks
    (`block_size` uncompressed bytes each); the offset table records where
    each compressed block lives in the file so a single block can be
    decompressed on demand.
    """

    def __init__(self, filename, table_filename, block_cache_size=0, **kwargs):
        """
        Open `filename` for reading using the offset table `table_filename`.

        `block_cache_size` is the number of decompressed blocks to keep in an
        LRU cache; 0 (the default) disables caching. Extra keyword arguments
        are accepted and ignored.
        """
        self.filename = filename
        self.table_filename = table_filename
        self.init_table()
        # The compressed payload is binary data, so never open in text mode
        self.file = open(self.filename, "rb")
        # `dirty` means `file_pos` moved and `current_block` must be
        # repositioned before the next read
        self.dirty = True
        self.at_eof = False
        self.file_pos = 0
        self.current_block_index = -1
        self.current_block = None
        if block_cache_size > 0:
            self.cache = lrucache.LRUCache(block_cache_size)
        else:
            self.cache = None

    def init_table(self):
        """
        Parse the offset table into `block_size`, `block_info` and `nblocks`.

        Lines whose first field is "s" give the uncompressed block size;
        lines whose first field is "o" give, for one block, its offset in the
        compressed file (in bytes), its compressed size and its uncompressed
        size. Blank lines and other lines are ignored.
        """
        self.block_size = None
        self.block_info = []
        # Use a context manager so the table file handle is always released
        with open(self.table_filename) as table:
            for line in table:
                fields = line.split()
                if not fields:
                    continue
                if fields[0] == "s":
                    self.block_size = int(fields[1])
                elif fields[0] == "o":
                    offset = int(fields[1])
                    compressed_size = int(fields[2])
                    size = int(fields[3])
                    self.block_info.append((offset, compressed_size, size))
        self.nblocks = len(self.block_info)

    def close(self):
        """Close the underlying compressed file."""
        self.file.close()

    def load_block(self, index):
        """
        Return the decompressed contents of block `index`, consulting and
        populating the block cache when one is configured.
        """
        if self.cache is not None and index in self.cache:
            return self.cache[index]
        offset, csize, size = self.block_info[index]
        # Get the block of compressed data
        self.file.seek(offset)
        data = self.file.read(csize)
        # python-lzo expects a header in front of the raw block: a 0xf0 byte
        # followed by the uncompressed size as a big-endian 32-bit integer
        header = b"\xf0" + struct.pack("!I", size)
        value = lzo.decompress(header + data)
        if self.cache is not None:
            self.cache[index] = value
        return value

    def fix_dirty(self):
        """
        Reposition `current_block` so that it matches `file_pos`, loading a
        different block if necessary, and clear the dirty flag.
        """
        chunk, offset = self.get_block_and_offset(self.file_pos)
        if chunk >= self.nblocks:
            # Positioned at or past the end of the data (this also covers an
            # empty table): report EOF instead of indexing past the table
            self.at_eof = True
            self.dirty = False
            return
        if self.current_block_index != chunk:
            self.current_block = StringIO(self.load_block(chunk))
            self.current_block.read(offset)
            self.current_block_index = chunk
        else:
            self.current_block.seek(offset)
        self.dirty = False

    def get_block_and_offset(self, index):
        """
        Map an uncompressed position `index` to a tuple of
        (block number, offset within that block).
        """
        return int(index // self.block_size), int(index % self.block_size)

    def seek(self, offset, whence=0):
        """
        Move the file pointer to a particular offset.

        `whence` may be 0 (absolute) or 1 (relative to the current position).
        Seeking from the end (2) is not supported and raises an Exception, as
        does any other `whence` value.
        """
        # Determine absolute target position
        if whence == 0:
            target_pos = offset
        elif whence == 1:
            target_pos = self.file_pos + offset
        elif whence == 2:
            raise Exception("seek from end not supported")
        else:
            raise Exception("Invalid `whence` argument: %r" % (whence,))
        # Check if this is a noop
        if target_pos == self.file_pos:
            return
        # Move the position
        self.file_pos = target_pos
        # Mark as dirty, the next time a read is done we need to actually
        # move the position in the decompressed block
        self.dirty = True
        # The position changed, so a previously reached EOF no longer holds
        self.at_eof = False

    def tell(self):
        """Return the current position in the uncompressed stream."""
        return self.file_pos

    def readline(self):
        """
        Read and return one line (including its trailing newline, if any),
        continuing across block boundaries. Returns "" at end of file.
        """
        if self.dirty:
            self.fix_dirty()
        if self.at_eof:
            return ""
        rval = []
        while True:
            line = self.current_block.readline()
            self.file_pos += len(line)
            rval.append(line)
            if len(line) > 0 and line[-1] == '\n':
                break
            elif self.current_block_index == self.nblocks - 1:
                # Ran out of data in the last block
                self.at_eof = True
                break
            else:
                # Line continues in the next block
                self.current_block_index += 1
                self.current_block = StringIO(
                    self.load_block(self.current_block_index))
        return "".join(rval)

    def next(self):
        """
        Return the next line, raising StopIteration at end of file.
        """
        line = self.readline()
        if line == "":
            raise StopIteration
        return line

    def __iter__(self):
        """The object is its own line iterator."""
        return self
| 141 | |
|---|
| 142 | # --- Factor out --- |
|---|
| 143 | |
|---|
# Byte sequence "\x89LZO\0\r\n\x1a\n"; presumably the signature found at the
# start of an lzop file (its use is not visible in this part of the file).
MAGIC = "\x89\x4c\x5a\x4f\x00\x0d\x0a\x1a\x0a"

# Single-bit flag masks; presumably lzop header flags -- confirm against the
# code that consumes them. Written as plain integer literals: the `L` suffix
# is redundant (ints promote to long automatically) and is a syntax error on
# Python 3.
F_ADLER32_D = 0x00000001
F_ADLER32_C = 0x00000002
F_H_EXTRA_FIELD = 0x00000040
F_H_GMTDIFF = 0x00000080
F_CRC32_D = 0x00000100
F_CRC32_C = 0x00000200
F_MULTIPART = 0x00000400
F_H_FILTER = 0x00000800
F_H_CRC32 = 0x00001000

# Sanity check the sizes of the network-order struct formats used for parsing
assert struct.calcsize("!H") == 2
assert struct.calcsize("!I") == 4
| 158 | |
|---|
class UnpackWrapper(object):
    """
    Thin wrapper around a file-like object that adds `struct`-based decoding
    of values read from it.
    """

    def __init__(self, file):
        # Wrapped file-like object; only its `read` method is used here
        self.file = file

    def read(self, amt):
        """Delegate to the wrapped file's `read`, passing `amt` through."""
        return self.file.read(amt)

    def get(self, fmt):
        """
        Read exactly as many bytes as the struct format `fmt` requires and
        return the first unpacked value.
        """
        nbytes = struct.calcsize(fmt)
        raw = self.file.read(nbytes)
        return struct.unpack(fmt, raw)[0]