[3] | 1 | """ |
---|
| 2 | Semi-random access to lzop compressed data. |
---|
| 3 | """ |
---|
| 4 | |
---|
| 5 | import os |
---|
| 6 | import bisect |
---|
| 7 | import sys |
---|
| 8 | try: |
---|
| 9 | import pkg_resources |
---|
| 10 | pkg_resources.require( 'python_lzo' ) |
---|
| 11 | except: |
---|
| 12 | pass |
---|
| 13 | import lzo |
---|
| 14 | import struct |
---|
| 15 | |
---|
| 16 | from bx_extras import lrucache |
---|
| 17 | from cStringIO import StringIO |
---|
| 18 | |
---|
class SeekableLzopFile(object):
    """
    Filelike object supporting read-only semi-random access to lzop
    compressed files for which a block offset table has been generated.

    The table file is read as whitespace separated fields, one record per
    line: an ``s <block_size>`` line gives the block size used to map a file
    position to a block index, and every ``o <offset> <compressed_size>
    <size>`` line describes one block of the compressed file.
    """

    def __init__(self, filename, table_filename, block_cache_size=0, **kwargs):
        """
        Open `filename` for reading, using the block table `table_filename`.

        When `block_cache_size` is greater than zero an
        `lrucache.LRUCache( block_cache_size )` is used to keep decompressed
        blocks; otherwise no cache is used. Extra keyword arguments are
        accepted and ignored.
        """
        self.filename = filename
        self.table_filename = table_filename
        self.init_table()
        # The file holds compressed bytes, so it must be read in binary mode
        self.file = open(self.filename, "rb")
        # `dirty` means `file_pos` has moved and `current_block` must be
        # repositioned before the next read
        self.dirty = True
        self.at_eof = False
        self.file_pos = 0
        self.current_block_index = -1
        self.current_block = None
        if block_cache_size > 0:
            self.cache = lrucache.LRUCache(block_cache_size)
        else:
            self.cache = None

    def init_table(self):
        """
        Read `self.table_filename` and populate `block_size`, `block_info`
        (a list of `( offset, compressed_size, size )` tuples) and `nblocks`.
        """
        self.block_size = None
        self.block_info = []
        # Use a context manager so the table file is always closed
        with open(self.table_filename) as table:
            for line in table:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines
                    continue
                if fields[0] == "s":
                    self.block_size = int(fields[1])
                elif fields[0] == "o":
                    # Position of corresponding block in compressed file
                    # (in bytes), followed by the two size fields
                    offset = int(fields[1])
                    compressed_size = int(fields[2])
                    size = int(fields[3])
                    self.block_info.append((offset, compressed_size, size))
        self.nblocks = len(self.block_info)

    def close(self):
        """
        Close the underlying compressed file.
        """
        self.file.close()

    def load_block(self, index):
        """
        Return the decompressed contents of block `index`, using the block
        cache when one is configured.
        """
        if self.cache is not None and index in self.cache:
            return self.cache[index]
        offset, csize, size = self.block_info[index]
        # Get the block of compressed data
        self.file.seek(offset)
        data = self.file.read(csize)
        # python-lzo needs a header prepended: a 0xf0 byte followed by
        # `size` packed as a 4 byte big-endian unsigned integer
        data = b"".join((b"\xf0", struct.pack("!I", size), data))
        value = lzo.decompress(data)
        if self.cache is not None:
            self.cache[index] = value
        return value

    def fix_dirty(self):
        """
        Bring `current_block` in line with `file_pos` after a seek.

        A position that falls beyond the last block is treated as end of
        file instead of failing on a block table lookup.
        """
        chunk, offset = self.get_block_and_offset(self.file_pos)
        if chunk >= self.nblocks:
            self.at_eof = True
            self.dirty = False
            return
        # A successful reposition inside the data invalidates any earlier EOF
        self.at_eof = False
        if self.current_block_index != chunk:
            self.current_block = StringIO(self.load_block(chunk))
            self.current_block.read(offset)
            self.current_block_index = chunk
        else:
            self.current_block.seek(offset)
        self.dirty = False

    def get_block_and_offset(self, index):
        """
        Map the position `index` to a `( block index, offset within block )`
        pair using `block_size`.
        """
        return int(index // self.block_size), int(index % self.block_size)

    def seek(self, offset, whence=0):
        """
        Move the file pointer to a particular offset.

        `whence` 0 is absolute and 1 is relative to the current position;
        2 (from end) is not supported and raises an Exception, as does any
        other value. The move is lazy: it is only recorded here and applied
        on the next read.
        """
        # Determine absolute target position
        if whence == 0:
            target_pos = offset
        elif whence == 1:
            target_pos = self.file_pos + offset
        elif whence == 2:
            raise Exception("seek from end not supported")
        else:
            raise Exception("Invalid `whence` argument: %r" % (whence,))
        # Check if this is a noop
        if target_pos == self.file_pos:
            return
        # Move the position
        self.file_pos = target_pos
        # Mark as dirty, the next time a read is done we need to actually
        # move the position in the decompressed block data
        self.dirty = True

    def tell(self):
        """
        Return the current position.
        """
        return self.file_pos

    def readline(self):
        """
        Read and return one line, or "" once the end of the data is reached.

        A line that continues past the end of the current block is completed
        from the following block(s).
        """
        if self.dirty:
            self.fix_dirty()
        if self.at_eof:
            return ""
        rval = []
        while True:
            line = self.current_block.readline()
            self.file_pos += len(line)
            rval.append(line)
            if len(line) > 0 and line[-1] == '\n':
                break
            elif self.current_block_index == self.nblocks - 1:
                # Last block exhausted without a trailing newline
                self.at_eof = True
                break
            else:
                # Line continues in the next block
                self.current_block_index += 1
                self.current_block = StringIO(self.load_block(self.current_block_index))
        return "".join(rval)

    def next(self):
        """
        Return the next line, raising StopIteration at end of file.
        """
        line = self.readline()
        if line == "":
            raise StopIteration
        return line

    def __iter__(self):
        return self
---|
| 141 | |
---|
| 142 | # --- Factor out --- |
---|
| 143 | |
---|
# Byte signature (0x89 "LZO" NUL CR LF 0x1a LF); presumably the magic string
# found at the start of an lzop file -- confirm against the lzop format.
MAGIC = b"\x89\x4c\x5a\x4f\x00\x0d\x0a\x1a\x0a"

# Bit flags; the names suggest lzop header flags -- confirm against the lzop
# sources. Written without the Python 2 only `L` suffix, which has no effect
# on the values.
F_ADLER32_D = 0x00000001
F_ADLER32_C = 0x00000002
F_H_EXTRA_FIELD = 0x00000040
F_H_GMTDIFF = 0x00000080
F_CRC32_D = 0x00000100
F_CRC32_C = 0x00000200
F_MULTIPART = 0x00000400
F_H_FILTER = 0x00000800
F_H_CRC32 = 0x00001000

# Sanity check the field widths of the struct formats used in this module
assert struct.calcsize("!H") == 2
assert struct.calcsize("!I") == 4
---|
| 158 | |
---|
class UnpackWrapper(object):
    """
    Thin wrapper around a file-like object that adds `get`, which reads and
    unpacks a single `struct` formatted value.
    """

    def __init__(self, file):
        self.file = file

    def read(self, amt):
        """
        Read `amt` from the wrapped file and return the result unchanged.
        """
        return self.file.read(amt)

    def get(self, fmt):
        """
        Read exactly as many bytes as the struct format `fmt` requires and
        return the first unpacked value.
        """
        width = struct.calcsize(fmt)
        raw = self.file.read(width)
        values = struct.unpack(fmt, raw)
        return values[0]
---|