| 1 | """ | 
|---|
| 2 | Semi-random access to LZO (lzop) compressed data. | 
|---|
| 3 | """ | 
|---|
| 4 |  | 
|---|
| 5 | import os | 
|---|
| 6 | import bisect | 
|---|
| 7 | import sys | 
|---|
| 8 | try: | 
|---|
| 9 | import pkg_resources | 
|---|
| 10 | pkg_resources.require( 'python_lzo' ) | 
|---|
| 11 | except: | 
|---|
| 12 | pass | 
|---|
| 13 | import lzo | 
|---|
| 14 | import struct | 
|---|
| 15 |  | 
|---|
| 16 | from bx_extras import lrucache | 
|---|
| 17 | from cStringIO import StringIO | 
|---|
| 18 |  | 
|---|
class SeekableLzopFile( object ):
    """
    Filelike object supporting read-only semi-random access to LZO compressed
    files for which a block offset table has been generated.

    The table is a whitespace separated text file. A line of the form
    ``s <block_size>`` gives the block size used to map a position in the
    uncompressed data to a block, and each ``o <offset> <compressed_size>
    <size>`` line describes one compressed block, in file order.
    """

    def __init__( self, filename, table_filename, block_cache_size=0, **kwargs ):
        """
        Open `filename` for reading using the block table in `table_filename`.

        If `block_cache_size` is greater than zero, that many decompressed
        blocks are kept in an LRU cache. Extra keyword arguments are accepted
        and ignored.
        """
        self.filename = filename
        self.table_filename = table_filename
        self.init_table()
        # Compressed data is binary, never let the platform translate it
        self.file = open( self.filename, "rb" )
        self.dirty = True
        self.at_eof = False
        self.file_pos = 0
        self.current_block_index = -1
        self.current_block = None
        if block_cache_size > 0:
            self.cache = lrucache.LRUCache( block_cache_size )
        else:
            self.cache = None

    def init_table( self ):
        """
        Read the block table, setting `block_size`, `block_info` (a list of
        `( offset, compressed_size, size )` tuples) and `nblocks`.
        """
        self.block_size = None
        self.block_info = []
        with open( self.table_filename ) as table:
            for line in table:
                fields = line.split()
                # Tolerate blank lines rather than failing on fields[0]
                if not fields:
                    continue
                if fields[0] == "s":
                    self.block_size = int( fields[1] )
                if fields[0] == "o":
                    # Position of corresponding block in compressed file
                    # (in bytes), followed by its compressed size and size
                    offset = int( fields[1] )
                    compressed_size = int( fields[2] )
                    size = int( fields[3] )
                    self.block_info.append( ( offset, compressed_size, size ) )
        self.nblocks = len( self.block_info )

    def close( self ):
        """
        Close the underlying compressed file.
        """
        self.file.close()

    def load_block( self, index ):
        """
        Return the decompressed contents of block `index`, consulting and
        updating the block cache when one is configured.
        """
        if self.cache is not None and index in self.cache:
            return self.cache[index]
        offset, csize, size = self.block_info[ index ]
        # Get the block of compressed data
        self.file.seek( offset )
        data = self.file.read( csize )
        # python-lzo needs a header prepended: a marker byte followed by
        # `size` packed as a network order unsigned 32 bit integer
        data = b''.join( ( b'\xf0', struct.pack( "!I", size ), data ) )
        value = lzo.decompress( data )
        if self.cache is not None:
            self.cache[index] = value
        return value

    def fix_dirty( self ):
        """
        Make `current_block` reflect `file_pos` after a seek, loading a
        different block only when the position moved out of the current one.
        """
        chunk, offset = self.get_block_and_offset( self.file_pos )
        if self.current_block_index != chunk:
            self.current_block = StringIO( self.load_block( chunk ) )
            self.current_block_index = chunk
        self.current_block.seek( offset )
        self.dirty = False

    def get_block_and_offset( self, index ):
        """
        Translate position `index` in the uncompressed data into a
        `( block number, offset within that block )` pair.
        """
        return int( index // self.block_size ), int( index % self.block_size )

    def seek( self, offset, whence=0 ):
        """
        Move the file pointer to a particular offset.

        `whence` 0 is absolute and 1 is relative to the current position.
        Seeking from the end (2) is not supported and raises `Exception`, as
        does any other value.
        """
        # Determine absolute target position
        if whence == 0:
            target_pos = offset
        elif whence == 1:
            target_pos = self.file_pos + offset
        elif whence == 2:
            raise Exception( "seek from end not supported" )
        else:
            raise Exception( "Invalid `whence` argument: %r" % ( whence, ) )
        # Check if this is a noop
        if target_pos == self.file_pos:
            return
        # Move the position
        self.file_pos = target_pos
        # Mark as dirty, the next time a read is done we need to actually
        # move the position in the decompressed block
        self.dirty = True
        # Having moved, a previously reached end of file no longer applies
        self.at_eof = False

    def tell( self ):
        """
        Return the current position in the uncompressed data.
        """
        return self.file_pos

    def readline( self ):
        """
        Return the next line, including its trailing newline when present.
        Lines spanning block boundaries are stitched together. Returns ""
        once the end of the last block has been reached.
        """
        if self.dirty:
            self.fix_dirty()
        if self.at_eof:
            return ""
        pieces = []
        while True:
            line = self.current_block.readline()
            self.file_pos += len( line )
            pieces.append( line )
            if line.endswith( '\n' ):
                break
            if self.current_block_index == self.nblocks - 1:
                # Ran out of data in the final block
                self.at_eof = True
                break
            # Line continues in the following block
            self.current_block_index += 1
            self.current_block = StringIO( self.load_block( self.current_block_index ) )
        return "".join( pieces )

    def next( self ):
        """
        Return the next line, raising `StopIteration` at end of file.
        """
        line = self.readline()
        if line == "":
            raise StopIteration
        return line

    # Same method under the name used by the newer iterator protocol
    __next__ = next

    def __iter__( self ):
        return self
|---|
| 141 |  | 
|---|
| 142 | # --- Factor out --- | 
|---|
| 143 |  | 
|---|
# Nine byte signature; bytes 2-4 spell "LZO".
# NOTE(review): presumably the magic at the start of an lzop file -- confirm.
MAGIC = b"\x89\x4c\x5a\x4f\x00\x0d\x0a\x1a\x0a"

# Single bit flag masks.
# NOTE(review): names look like lzop header flags -- confirm against the
# lzop format before relying on their meaning.
# Written without the `L` suffix: plain ints promote to long automatically
# when needed, and the suffix is not accepted by newer interpreters.
F_ADLER32_D     = 0x00000001
F_ADLER32_C     = 0x00000002
F_H_EXTRA_FIELD = 0x00000040
F_H_GMTDIFF     = 0x00000080
F_CRC32_D       = 0x00000100
F_CRC32_C       = 0x00000200
F_MULTIPART     = 0x00000400
F_H_FILTER      = 0x00000800
F_H_CRC32       = 0x00001000

# Sanity check the fixed widths of the struct formats used in this module
assert struct.calcsize( "!H" ) == 2
assert struct.calcsize( "!I" ) == 4
|---|
| 158 |  | 
|---|
class UnpackWrapper( object ):
    """
    Thin wrapper around a file object that adds reading of single
    `struct` encoded values.
    """

    def __init__( self, file ):
        self.file = file

    def read( self, amt ):
        """
        Read `amt` bytes straight from the wrapped file.
        """
        return self.file.read( amt )

    def get( self, fmt ):
        """
        Read exactly as many bytes as `fmt` describes and return the first
        value unpacked from them.
        """
        width = struct.calcsize( fmt )
        raw = self.file.read( width )
        values = struct.unpack( fmt, raw )
        return values[0]
|---|