root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/misc/seekbzip2.py @ 3

リビジョン 3, 4.7 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""
2Semi-random access to bz2 compressed data.
3"""
4
5import os
6import bisect
7import sys
8   
9from _seekbzip2 import SeekBzip2
10   
class SeekableBzip2File( object ):
    """
    Filelike object supporting read-only semi-random access to bz2 compressed
    files for which an offset table (bz2t) has been generated by `bzip-table`.

    The object tracks a *virtual* position (`self.pos`) in the uncompressed
    data.  Seeking only updates that position and marks the object dirty;
    the underlying `SeekBzip2` stream is repositioned lazily on the next
    read (see `fix_dirty`).
    """

    def __init__( self, filename, table_filename, **kwargs ):
        """
        `filename` is the bz2 file and `table_filename` the offset table
        describing its blocks.  Extra keyword arguments are accepted and
        ignored.
        """
        self.filename = filename
        self.table_filename = table_filename
        self.init_table()
        self.init_bz2()
        # Virtual position in the uncompressed data
        self.pos = 0
        # True whenever the underlying stream is not positioned at `self.pos`
        self.dirty = True

    def init_bz2( self ):
        """Open the underlying block-seekable bz2 reader."""
        self.seek_bz2 = SeekBzip2( self.filename )

    def init_table( self ):
        """
        Load the offset table.

        Each non-blank line holds two whitespace separated integers: the
        position of a compressed block in the bz2 file and the length of
        that block once uncompressed.  Fills `table_positions`,
        `table_bz2positions` and `size` (total uncompressed length).
        """
        # Position in plaintext file
        self.table_positions = []
        # Position of corresponding block in bz2 file (bits)
        self.table_bz2positions = []
        pos = 0
        table_file = open( self.table_filename )
        try:
            for line in table_file:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines (e.g. a trailing newline)
                    continue
                # Position of the compressed block in the bz2 file
                bz2_pos = int( fields[0] )
                # Length of the block when uncompressed
                length = int( fields[1] )
                self.table_positions.append( pos )
                self.table_bz2positions.append( bz2_pos )
                old_pos = pos
                pos = pos + length
                # Every block must contribute at least one byte
                assert pos > old_pos
        finally:
            # Always release the table file handle, even on a parse error
            table_file.close()
        self.size = pos

    def close( self ):
        """Close the underlying bz2 reader."""
        self.seek_bz2.close()

    def fix_dirty( self ):
        """
        Bring the underlying stream in sync with the virtual position by
        seeking to the start of the containing block and discarding the
        bytes that precede `self.pos` within it.
        """
        # Our virtual position in the uncompressed data is out of sync
        # FIXME: If we're moving to a later position that is still in
        # the same block, we could just read and throw out bytes in the
        # compressed stream, less wasteful then backtracking
        chunk, offset = self.get_chunk_and_offset( self.pos )
        # Get the seek position for that chunk and seek to it
        bz2_seek_pos = self.table_bz2positions[chunk]
        self.seek_bz2.seek( bz2_seek_pos )
        # Consume bytes to move to the correct position.  The read must
        # happen outside the assert: under `python -O` assert statements
        # are removed, which would silently skip the read itself.
        skipped = self.seek_bz2.read( offset )
        assert len( skipped ) == offset
        # Update state
        self.dirty = False

    def read( self, sizehint=-1 ):
        """
        Read up to `sizehint` bytes from the current position; with a
        negative `sizehint` (the default) read everything up to EOF.
        """
        if sizehint < 0:
            chunks = []
            while True:
                # Pull the data in 1MB pieces until an empty read signals EOF
                val = self._read( 1024*1024 )
                if val:
                    chunks.append( val )
                else:
                    break
            return "".join( chunks )
        else:
            return self._read( sizehint )

    def _read( self, size ):
        """
        Read up to `size` bytes from the underlying stream, resyncing it
        first if needed, and advance the virtual position.  Returns an
        empty string at EOF.
        """
        if self.dirty:
            self.fix_dirty()
        val = self.seek_bz2.read( size )
        if val is None:
            # EOF
            self.pos = self.size
            val = ""
        else:
            self.pos = self.pos + len( val )
        return val

    def readline( self, size=-1 ):
        """
        Read one line from the current position (`size` is passed through
        to the underlying reader).  Returns an empty string at EOF.
        """
        if self.dirty:
            self.fix_dirty()
        val = self.seek_bz2.readline( size )
        if val is None:
            # EOF
            self.pos = self.size
            val = ""
        else:
            self.pos = self.pos + len( val )
        return val

    def tell( self ):
        """Return the current virtual position in the uncompressed data."""
        return self.pos

    def get_chunk_and_offset( self, position ):
        """
        Return `( chunk, offset )`: the index of the table entry whose
        block contains uncompressed `position`, and the distance of
        `position` from the start of that block.
        """
        # Find the chunk that position is in using a binary search
        chunk = bisect.bisect( self.table_positions, position ) - 1
        offset = position - self.table_positions[chunk]
        return chunk, offset

    def seek( self, offset, whence=0 ):
        """
        Move the virtual position.

        `whence` 0 is absolute, 1 is relative to the current position and
        2 is relative to the end.  NOTE: for `whence` 2 the target is
        `size - offset`, i.e. a *positive* offset counts back from the end,
        unlike the builtin file API; kept for backward compatibility.

        Raises `Exception` for any other `whence`; a target outside
        `[0, size)` trips an assertion.
        """
        # Determine absolute target position
        if whence == 0:
            target_pos = offset
        elif whence == 1:
            target_pos = self.pos + offset
        elif whence == 2:
            target_pos = self.size - offset
        else:
            # Interpolate the value (it used to be passed as a second
            # exception argument, leaving a literal '%r' in the message)
            raise Exception( "Invalid `whence` argument: %r" % ( whence, ) )
        # Check if this is a noop
        if target_pos == self.pos:
            return
        # Verify it is valid
        assert 0 <= target_pos < self.size, "Attempt to seek outside file"
        # Move the position
        self.pos = target_pos
        # Mark as dirty, the next time a read is done we need to actually
        # move the position in the bzip2 file
        self.dirty = True

    # ---- File like methods ------------------------------------------------

    def next(self):
        """Return the next line, raising `StopIteration` at EOF."""
        ln = self.readline()
        if ln == "":
            raise StopIteration()
        return ln

    # Python 3 spelling of the iterator protocol; harmless on Python 2
    __next__ = next

    def __iter__(self):
        return self

    def readlines(self,sizehint=-1):
        """Return all remaining lines as a list (`sizehint` is ignored)."""
        return [ln for ln in self]

    def xreadlines(self):
        """Return an iterator over the remaining lines."""
        return iter(self)
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。