1 | """ |
---|
2 | Semi-random access to LZO (lzop) compressed data. |
---|
3 | """ |
---|
4 | |
---|
5 | import os |
---|
6 | import bisect |
---|
7 | import sys |
---|
8 | try: |
---|
9 | import pkg_resources |
---|
10 | pkg_resources.require( 'python_lzo' ) |
---|
11 | except: |
---|
12 | pass |
---|
13 | import lzo |
---|
14 | import struct |
---|
15 | |
---|
16 | from bx_extras import lrucache |
---|
17 | from cStringIO import StringIO |
---|
18 | |
---|
class SeekableLzopFile(object):
    """
    Filelike object supporting read-only semi-random access to LZO
    compressed files for which a block offset table has been generated.

    The table is a whitespace-delimited text file (see `init_table`):

    - ``s <block_size>``: uncompressed size used to map a position in the
      decompressed stream to a block index
    - ``o <offset> <compressed_size> <size>``: one line per block, giving its
      byte offset in the compressed file, its compressed length and its
      uncompressed length

    Positions passed to `seek` and returned by `tell` refer to the
    decompressed stream.
    """

    def __init__(self, filename, table_filename, block_cache_size=0, **kwargs):
        """
        Open `filename` for reading using the offset table `table_filename`.

        If `block_cache_size` is greater than zero, that many decompressed
        blocks are kept in an LRU cache. Extra keyword arguments are accepted
        and ignored.
        """
        self.filename = filename
        self.table_filename = table_filename
        self.init_table()
        # Compressed data is binary: text mode could mangle it on platforms
        # that translate line endings
        self.file = open(self.filename, "rb")
        # `dirty` means `current_block` is not positioned at `file_pos`
        self.dirty = True
        self.at_eof = False
        self.file_pos = 0
        self.current_block_index = -1
        self.current_block = None
        if block_cache_size > 0:
            self.cache = lrucache.LRUCache(block_cache_size)
        else:
            self.cache = None

    def init_table(self):
        """
        Parse the offset table, setting `block_size`, `block_info` (a list of
        ``(offset, compressed_size, size)`` tuples) and `nblocks`.
        """
        self.block_size = None
        self.block_info = []
        table = open(self.table_filename)
        try:
            for line in table:
                fields = line.split()
                # Tolerate blank lines instead of failing on fields[0]
                if not fields:
                    continue
                if fields[0] == "s":
                    self.block_size = int(fields[1])
                elif fields[0] == "o":
                    # Position of corresponding block in compressed file
                    # (in bytes), then compressed and uncompressed sizes
                    offset = int(fields[1])
                    compressed_size = int(fields[2])
                    size = int(fields[3])
                    self.block_info.append((offset, compressed_size, size))
        finally:
            # Do not leak the table file handle
            table.close()
        self.nblocks = len(self.block_info)

    def close(self):
        """Close the underlying compressed file."""
        self.file.close()

    def load_block(self, index):
        """
        Return the decompressed contents of block `index`, consulting and
        updating the block cache when one is configured.
        """
        if self.cache is not None and index in self.cache:
            return self.cache[index]
        offset, csize, size = self.block_info[index]
        # Get the block of compressed data
        self.file.seek(offset)
        data = self.file.read(csize)
        # Need to prepend a header for python-lzo module: a marker byte
        # followed by the uncompressed size as a big-endian 32 bit integer
        data = ''.join(('\xf0', struct.pack("!I", size), data))
        value = lzo.decompress(data)
        if self.cache is not None:
            self.cache[index] = value
        return value

    def fix_dirty(self):
        """
        Position `current_block` at `file_pos`, loading a different block
        if needed, and clear the `dirty` flag.
        """
        chunk, offset = self.get_block_and_offset(self.file_pos)
        if self.current_block_index != chunk:
            self.current_block = StringIO(self.load_block(chunk))
            # Skip ahead to the requested offset within the fresh block
            self.current_block.read(offset)
            self.current_block_index = chunk
        else:
            self.current_block.seek(offset)
        self.dirty = False

    def get_block_and_offset(self, index):
        """
        Map position `index` in the decompressed stream to a
        ``(block_index, offset_within_block)`` pair.
        """
        return int(index // self.block_size), int(index % self.block_size)

    def seek(self, offset, whence=0):
        """
        Move the file pointer to a particular offset.

        `whence` 0 is absolute and 1 is relative to the current position.
        Seeking from the end (2) is not supported and raises `Exception`, as
        does any other `whence` value. The target position is not validated
        against the size of the data.
        """
        # Determine absolute target position
        if whence == 0:
            target_pos = offset
        elif whence == 1:
            target_pos = self.file_pos + offset
        elif whence == 2:
            raise Exception("seek from end not supported")
        else:
            raise Exception("Invalid `whence` argument: %r" % (whence,))
        # Check if this is a noop
        if target_pos == self.file_pos:
            return
        # Move the position
        self.file_pos = target_pos
        # A previous read may have hit the end of the data; the new position
        # has to be readable again
        self.at_eof = False
        # Mark as dirty, the next time a read is done we need to actually
        # move the position in the decompressed data
        self.dirty = True

    def tell(self):
        """Return the current position in the decompressed stream."""
        return self.file_pos

    def readline(self):
        """
        Read and return the next line including its trailing newline,
        continuing into following blocks when a line spans a block boundary.
        Returns "" once the end of the data has been reached.
        """
        if self.dirty:
            self.fix_dirty()
        if self.at_eof:
            return ""
        rval = []
        while 1:
            line = self.current_block.readline()
            self.file_pos += len(line)
            rval.append(line)
            if line.endswith("\n"):
                break
            elif self.current_block_index == self.nblocks - 1:
                # Ran out of data in the last block
                self.at_eof = True
                break
            else:
                # Line continues in the next block
                self.current_block_index += 1
                self.current_block = StringIO(self.load_block(self.current_block_index))
        return "".join(rval)

    def next(self):
        """Return the next line, raising `StopIteration` at end of data."""
        line = self.readline()
        if line == "":
            raise StopIteration
        return line

    # Iterator protocol name used by Python 3
    __next__ = next

    def __iter__(self):
        return self
---|
141 | |
---|
142 | # --- Factor out --- |
---|
143 | |
---|
# Nine byte file signature; bytes 1-3 spell "LZO".
# NOTE(review): presumably the lzop container magic -- confirm against the
# code that reads the header.
MAGIC = "\x89\x4c\x5a\x4f\x00\x0d\x0a\x1a\x0a"

# Single-bit masks, presumably tested against the header flags word.
# The trailing `L` (long) suffix is omitted: the values are identical and the
# literals stay valid on interpreters without a separate long type.
F_ADLER32_D = 0x00000001
F_ADLER32_C = 0x00000002
F_H_EXTRA_FIELD = 0x00000040
F_H_GMTDIFF = 0x00000080
F_CRC32_D = 0x00000100
F_CRC32_C = 0x00000200
F_MULTIPART = 0x00000400
F_H_FILTER = 0x00000800
F_H_CRC32 = 0x00001000

# Sanity check the field widths of the struct formats used in this module
assert struct.calcsize("!H") == 2
assert struct.calcsize("!I") == 4
---|
158 | |
---|
class UnpackWrapper(object):
    """
    Thin wrapper around a file-like object that adds `get`, which reads
    exactly as many bytes as a `struct` format needs and decodes them.
    """

    def __init__(self, file):
        # Any object with a `read( amt )` method
        self.file = file

    def read(self, amt):
        """Read `amt` bytes straight from the wrapped file."""
        return self.file.read(amt)

    def get(self, fmt):
        """
        Read `struct.calcsize( fmt )` bytes from the wrapped file, unpack
        them using `fmt` and return the first unpacked value.
        """
        nbytes = struct.calcsize(fmt)
        raw = self.file.read(nbytes)
        return struct.unpack(fmt, raw)[0]
---|