root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/misc/seeklzop.py @ 3

リビジョン 3, 5.3 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""
2Semi-random access to lzop compressed data.
3"""
4
5import os
6import bisect
7import sys
8try:
9    import pkg_resources
10    pkg_resources.require( 'python_lzo' )
11except:
12    pass
13import lzo
14import struct
15
16from bx_extras import lrucache   
17from cStringIO import StringIO
18   
class SeekableLzopFile( object ):
    """
    Filelike object supporting read-only semi-random access to lzop
    compressed files for which a block offset table has been generated.

    The table is a whitespace-separated text file read by `init_table`:
    a line starting with ``s`` gives the uncompressed block size, and each
    line starting with ``o`` gives one block as
    ``offset compressed_size size``.
    """

    def __init__( self, filename, table_filename, block_cache_size=0, **kwargs ):
        """
        Open `filename` for reading using the offsets in `table_filename`.

        `block_cache_size`, when greater than zero, is the number of
        decompressed blocks kept in an LRU cache. Extra keyword arguments
        are accepted and ignored.
        """
        self.filename = filename
        self.table_filename = table_filename
        self.init_table()
        # Compressed data is binary; text mode would corrupt it on platforms
        # that translate line endings.
        self.file = open( self.filename, "rb" )
        # True when `file_pos` has moved and `current_block` is out of sync
        self.dirty = True
        self.at_eof = False
        # Logical position in the uncompressed stream
        self.file_pos = 0
        self.current_block_index = -1
        self.current_block = None
        if block_cache_size > 0:
            self.cache = lrucache.LRUCache( block_cache_size )
        else:
            self.cache = None

    def init_table( self ):
        """
        Parse the offset table into `block_size`, `block_info` and `nblocks`.

        `block_info` is a list of ``( offset, compressed_size, size )``
        tuples, one per ``o`` line, in file order. Blank lines are skipped.
        """
        self.block_size = None
        self.block_info = []
        # Context manager so the table handle is closed deterministically
        with open( self.table_filename ) as table:
            for line in table:
                fields = line.split()
                if not fields:
                    continue
                if fields[0] == "s":
                    self.block_size = int( fields[1] )
                if fields[0] == "o":
                    # Position of corresponding block in compressed file (in bytes)
                    offset = int( fields[1] )
                    compressed_size = int( fields[2] )
                    size = int( fields[3] )
                    self.block_info.append( ( offset, compressed_size, size ) )
        self.nblocks = len( self.block_info )

    def close( self ):
        """Close the underlying compressed file."""
        self.file.close()

    def load_block( self, index ):
        """
        Return the decompressed contents of block `index`.

        The result is served from the LRU cache when one is configured and
        already holds the block; otherwise it is read, decompressed and
        (if caching is on) stored.
        """
        if self.cache is not None and index in self.cache:
            return self.cache[index]
        offset, csize, size = self.block_info[ index ]
        # Get the block of compressed data
        self.file.seek( offset )
        data = self.file.read( csize )
        # python-lzo wants a header byte plus the big-endian uncompressed
        # size in front of the raw block, so prepend one.
        data = ''.join( ( '\xf0', struct.pack( "!I", size ), data ) )
        value = lzo.decompress( data )
        if self.cache is not None:
            self.cache[index] = value
        return value

    def fix_dirty( self ):
        """
        Bring `current_block` in line with `file_pos` after a seek.

        A position at or beyond the last block is treated as end of file
        instead of indexing past the block table.
        """
        chunk, offset = self.get_block_and_offset( self.file_pos )
        if chunk >= self.nblocks:
            self.at_eof = True
            self.dirty = False
            return
        if self.current_block_index != chunk:
            self.current_block = StringIO( self.load_block( chunk ) )
            # Skip forward to the requested offset inside the new block
            self.current_block.read( offset )
            self.current_block_index = chunk
        else:
            self.current_block.seek( offset )
        self.dirty = False

    def get_block_and_offset( self, index ):
        """
        Map an uncompressed position to ``( block number, offset in block )``.
        """
        return int( index // self.block_size ), int( index % self.block_size )

    def seek( self, offset, whence=0 ):
        """
        Move the file pointer to a particular offset.

        `whence` 0 is absolute and 1 is relative to the current position.
        Seeking from the end (2) is not supported and raises `Exception`;
        any other value raises `ValueError`.
        """
        # Determine absolute target position
        if whence == 0:
            target_pos = offset
        elif whence == 1:
            target_pos = self.file_pos + offset
        elif whence == 2:
            raise Exception( "seek from end not supported" )
        else:
            raise ValueError( "Invalid `whence` argument: %r" % ( whence, ) )
        # Check if this is a noop
        if target_pos == self.file_pos:
            return
        # Move the position
        self.file_pos = target_pos
        # Mark as dirty, the next time a read is done we need to actually
        # move the position in the compressed file
        self.dirty = True
        # A previous read may have hit the end; the new position has not.
        self.at_eof = False

    def tell( self ):
        """Return the current position in the uncompressed stream."""
        return self.file_pos

    def readline( self ):
        """
        Read and return one line, following it across block boundaries.

        Returns "" once the end of the last block has been reached.
        """
        if self.dirty:
            self.fix_dirty()
        if self.at_eof:
            return ""
        rval = []
        while True:
            line = self.current_block.readline()
            self.file_pos += len( line )
            rval.append( line )
            if len( line ) > 0 and line[-1] == '\n':
                break
            elif self.current_block_index == self.nblocks - 1:
                self.at_eof = True
                break
            else:
                # Line continues in the next block
                self.current_block_index += 1
                self.current_block = StringIO( self.load_block( self.current_block_index ) )
        return "".join( rval )

    def next( self ):
        """Return the next line, raising `StopIteration` at end of file."""
        line = self.readline()
        if line == "":
            raise StopIteration
        return line

    # Same method under the name newer iterator protocols look for
    __next__ = next

    def __iter__( self ):
        return self
141
142# --- Factor out ---       
143       
# Nine-byte signature: \x89, the letters "LZO", then \x00 \r \n \x1a \n.
# NOTE(review): presumably the magic at the start of an lzop file -- confirm
# against the lzop format description.
MAGIC="\x89\x4c\x5a\x4f\x00\x0d\x0a\x1a\x0a"

# Single-bit flag masks. NOTE(review): the names suggest lzop header flags
# (checksum kinds, optional header fields); verify against the code that
# consumes them. Written as plain integer literals: the `L` suffix is
# redundant and is rejected by interpreters without a separate long type.
F_ADLER32_D     = 0x00000001
F_ADLER32_C     = 0x00000002
F_H_EXTRA_FIELD = 0x00000040
F_H_GMTDIFF     = 0x00000080
F_CRC32_D       = 0x00000100
F_CRC32_C       = 0x00000200
F_MULTIPART     = 0x00000400
F_H_FILTER      = 0x00000800
F_H_CRC32       = 0x00001000

# Sanity-check the field widths of the struct formats used in this module
assert struct.calcsize( "!H" ) == 2
assert struct.calcsize( "!I" ) == 4
158
class UnpackWrapper( object ):
    """
    Thin wrapper around a file-like object that adds `struct`-based reads.
    """

    def __init__( self, file ):
        # Any object with a `read( n )` method
        self.file = file

    def read( self, amt ):
        """Read `amt` bytes straight from the wrapped file."""
        return self.file.read( amt )

    def get( self, fmt ):
        """
        Read exactly as many bytes as `fmt` describes and return the first
        unpacked field.
        """
        width = struct.calcsize( fmt )
        raw = self.file.read( width )
        fields = struct.unpack( fmt, raw )
        return fields[0]
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。