root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/binned_array.py @ 3

リビジョン 3, 12.4 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""
2Numeric arrays stored as individually compressed blocks on disk, allowing
3pseudo-random access.
4
5`BinnedArray` is used to build such an array in memory and save it to disk.
6`BinnedArrayWriter` can instead be used when creating the array sequentially
7(does not require keeping all data in memory). `FileBinnedArray` provides
8read only access to an on disk binned array.
9"""
10
11from __future__ import division
12
13import math
14
15from numpy import *
16from struct import *
17from bx_extras.lrucache import LRUCache
18
19import sys
# Bins are stored big-endian on disk, so little-endian hosts must byteswap
platform_is_little_endian = ( sys.byteorder == 'little' )

# Magic number written at the very start of every binned array file
MAGIC = 0x4AB04612

# Version incremented from version 0 to version 1 by Ian Schenck, June
# 23, 2006.  Version 1 supports different typecodes, and in doing so
# breaks the original header format.  The new FileBinnedArray is
# backwards compatible with version 0.

# Version 1 -> 2 by James Taylor, allow specifying different compression
# types.

VERSION = 2

# Compression types: maps a name to a ( compress, decompress ) pair of
# callables

comp_types = dict()

comp_types['none'] = ( lambda x: x, lambda x: x )

# Both compression modules are optional; a missing module simply leaves its
# name out of `comp_types`. Only ImportError is ignored so that unrelated
# failures (or a KeyboardInterrupt) are not silently swallowed.
try:
    import zlib
    comp_types['zlib'] = ( zlib.compress, zlib.decompress )
except ImportError:
    pass

try:
    import lzo
    comp_types['lzo'] = ( lzo.compress, lzo.decompress )
except ImportError:
    pass


# Default maximum number of elements in a binned array
MAX = 512*1024*1024
54
class BinnedArray( object ):
    """
    In-memory numeric array of up to `max_size` elements, held as fixed-size
    bins that are only allocated the first time one of their elements is set.

    Elements of bins that were never written read back as `default`.
    """
    def __init__( self, bin_size=512*1024, default=NaN, max_size=MAX, typecode="f" ):
        """
        `bin_size`: number of elements per bin
        `default`: value reported for elements that were never set
        `max_size`: total number of addressable elements
        `typecode`: numpy typecode used for every bin
        """
        self.max_size = max_size
        self.bin_size = bin_size
        # True division here relies on the module-level
        # `from __future__ import division`
        self.nbins = int( math.ceil( ( max_size / self.bin_size ) ) )
        self.bins = [ None ] * self.nbins
        self.default = default
        self.typecode = typecode
    def get_bin_offset( self, index ):
        """Return ( bin number, offset within that bin ) for element `index`."""
        return index // self.bin_size, index % self.bin_size
    def init_bin( self, index ):
        """Allocate bin `index` and fill it with the default value."""
        self.bins[index] = zeros( self.bin_size, self.typecode )
        self.bins[index][:] = self.default
    def get( self, key ):
        """Return the element at position `key` (the default if its bin is unallocated)."""
        bin_no, offset = self.get_bin_offset( key )
        if self.bins[bin_no] is None:
            return self.default
        else:
            return self.bins[bin_no][offset]
    def set( self, key, value ):
        """Store `value` at position `key`, allocating the bin if needed."""
        bin_no, offset = self.get_bin_offset( key )
        if self.bins[bin_no] is None:
            self.init_bin( bin_no )
        self.bins[bin_no][offset] = value
    def get_range( self, start, end ):
        """
        Return the elements in the half-open interval [ `start`, `end` ) as a
        single array; unallocated bins contribute the default value.
        """
        size = end - start
        assert size >= 0
        rval = []
        while size > 0:
            bin_no, offset = self.get_bin_offset( start )
            # Never read past the end of the current bin
            delta = self.bin_size - offset
            if delta < size:
                count = delta
            else:
                count = size
            data = self.bins[bin_no]
            if data is None:
                # Always use the array's own typecode so the result dtype
                # does not depend on which bins happen to be allocated
                rval.append( resize( array( self.default, self.typecode ), ( count, ) ) )
            else:
                rval.append( data[offset:offset+count] )
            size -= count
            start += count
        if not rval:
            # Empty range: concatenate() rejects an empty list
            return zeros( 0, self.typecode )
        return concatenate( rval )
    def __getitem__( self, key ):
        """Support `a[i]` and stride-1 slices `a[i:j]`."""
        if isinstance( key, slice ):
            start, stop, stride = key.indices( self.max_size )
            assert stride == 1, "Slices with strides are not supported"
            return self.get_range( start, stop )
        else:
            return self.get( key )
    def __setitem__( self, key, value ):
        """Support `a[i] = value`."""
        return self.set( key, value )
    def to_file( self, f, comp_type='zlib' ):
        """
        Write the array to the seekable file `f`, compressing each allocated
        bin with the method registered under `comp_type` in `comp_types`.

        Layout: header, typecode, compression name, default value, index of
        ( position, size ) pairs, then the compressed bins. Raises KeyError
        if `comp_type` is not a known compression type.
        """
        # Get compress method
        compress, _ = comp_types[comp_type]
        # Write header
        write_packed( f, ">5I", MAGIC, VERSION, self.max_size, self.bin_size, self.nbins )
        # save type code
        f.write( pack( 'c', self.typecode ) )
        # save compression type (fixed width of 4 characters, space padded)
        f.write( comp_type[0:4].ljust( 4 ) )
        # write default value
        a = array( self.default, self.typecode )
        # Struct module can't deal with NaN and endian conversion, we'll hack
        # around that by byteswapping the array
        if platform_is_little_endian:
            a = a.byteswap()
        # NOTE(review): newer NumPy releases deprecate tostring() in favour
        # of tobytes() -- confirm the minimum supported NumPy before switching
        f.write( a.tostring() )
        # Save current position (start of bin offsets)
        index_start_pos = f.tell()
        # Skip forward to save space for index
        f.seek( calcsize( ">2I" ) * self.nbins, 1 )
        bin_pos_and_size = []
        # Write each bin; unallocated bins are recorded as ( 0, 0 )
        for data in self.bins:
            if data is None:
                bin_pos_and_size.append( ( 0, 0 ) )
            else:
                assert data.dtype.char == self.typecode
                if platform_is_little_endian:
                    s = data.byteswap().tostring()
                else:
                    s = data.tostring()
                compressed = compress( s )
                bin_pos_and_size.append( ( f.tell(), len( compressed ) ) )
                f.write( compressed )
        # Go back and fill in table
        f.seek( index_start_pos )
        for pos, size in bin_pos_and_size:
            write_packed( f, ">2I", pos, size )
151           
class FileBinnedArray( object ):
    """
    Read-only view of a binned array stored in a file, decompressing bins on
    demand and keeping recently used ones in a cache.
    """
    def __init__( self, f, cache=32 ):
        """
        Read the header and bin index from the seekable file `f`.

        `cache` is passed to `LRUCache` as its size.
        NOTE(review): an earlier comment here claimed that cache=None keeps
        everything in memory and is the default; the default is 32 and the
        behaviour of LRUCache for None is not visible from this file.
        """
        self.f = f
        M, V, max_size, bin_size, nbins = read_packed( f, ">5I" )
        assert M == MAGIC
        # assert version less than max supported
        assert V <= VERSION, "File is version %d but I don't know about anything beyond %d" % ( V, VERSION )
        self.max_size = max_size
        self.bin_size = bin_size
        self.nbins = nbins
        self.bins = LRUCache( size=cache )
        # Read typecode (version 0 files are always 'f')
        if V >= 1:
            self.typecode = unpack( 'c', f.read( 1 ) )[0]
        else:
            self.typecode = 'f'
        # Read compression type (files before version 2 are always zlib)
        if V >= 2:
            self.comp_type = f.read( 4 ).strip()
        else:
            self.comp_type = 'zlib'
        self.decompress = comp_types[self.comp_type][1]
        # Read default value; it is stored big-endian
        s = f.read( calcsize( self.typecode ) )
        # NOTE(review): newer NumPy releases deprecate binary-mode
        # fromstring() in favour of frombuffer() -- confirm before switching
        a = fromstring( s, self.typecode )
        if platform_is_little_endian:
            a = a.byteswap()
        self.default = a[0]
        # Read bin sizes and offsets
        self.bin_pos = []
        self.bin_sizes = []
        for i in range( nbins ):
            pos, size = read_packed( f, ">2I" )
            self.bin_pos.append( pos )
            self.bin_sizes.append( size )
    def get_bin_offset( self, index ):
        """Return ( bin number, offset within that bin ) for element `index`."""
        return int( index // self.bin_size ), int( index % self.bin_size )
    def load_bin( self, index ):
        """Read, decompress and cache bin `index`, which must exist in the file."""
        assert self.bin_pos[index] != 0
        self.f.seek( self.bin_pos[index] )
        raw = self.f.read( self.bin_sizes[index] )
        a = fromstring( self.decompress( raw ), self.typecode )
        if platform_is_little_endian:
            a = a.byteswap()
        assert len( a ) == self.bin_size
        self.bins[index] = a
    def _get_bin( self, index ):
        """
        Return the data of bin `index`, loading it from the file if it is not
        cached, or None when the bin was never written (index position 0).
        """
        if index in self.bins:
            return self.bins[index]
        if self.bin_pos[index]:
            self.load_bin( index )
            return self.bins[index]
        return None
    def get( self, key ):
        """Return the element at position `key` (the default if its bin is empty)."""
        bin_no, offset = self.get_bin_offset( key )
        data = self._get_bin( bin_no )
        if data is None:
            return self.default
        return data[offset]
    def get_range( self, start, end ):
        """
        Return the elements in the half-open interval [ `start`, `end` ) as a
        single array; bins absent from the file contribute the default value.
        """
        size = end - start
        assert size >= 0
        rval = []
        while size > 0:
            bin_no, offset = self.get_bin_offset( start )
            # Never read past the end of the current bin
            delta = self.bin_size - offset
            if delta < size:
                count = delta
            else:
                count = size
            # Same lookup as get(): only index the cache for bins known to
            # be in it, so bins that were never written fall through to the
            # default instead of being looked up in the cache
            data = self._get_bin( bin_no )
            if data is None:
                rval.append( resize( array( self.default, self.typecode ), ( count, ) ) )
            else:
                rval.append( data[offset:offset+count] )
            size -= count
            start += count
        if not rval:
            # Empty range: concatenate() rejects an empty list
            return zeros( 0, self.typecode )
        return concatenate( rval )
    def __getitem__( self, key ):
        """Support `a[i]` and stride-1 slices `a[i:j]`."""
        if isinstance( key, slice ):
            start, stop, stride = key.indices( self.max_size )
            assert stride == 1, "Slices with strides are not supported"
            return self.get_range( start, stop )
        else:
            return self.get( key )
242       
class BinnedArrayWriter( object ):
    """
    Writes a binned array to a file sequentially, one element at a time, so
    that only the bin currently being filled is held in memory.

    Call `write` or `skip` once per position in order, then `finish`.
    """
    def __init__( self, f, bin_size=512*1024, default=NaN, max_size=MAX, typecode="f", comp_type='zlib' ):
        """
        `f`: seekable file to write to
        `bin_size`: number of elements per bin
        `default`: value stored for skipped positions
        `max_size`: total number of addressable elements
        `typecode`: numpy typecode of the stored values
        `comp_type`: key into `comp_types` selecting the compression method
        """
        # All parameters in the constructor are immutable after creation
        self.f = f
        self.max_size = max_size
        self.bin_size = bin_size
        # True division here relies on the module-level
        # `from __future__ import division`
        self.nbins = int( math.ceil( ( max_size / self.bin_size ) ) )
        self.default = default
        self.typecode = typecode
        # Number of the bin being filled and the next position inside it
        self.bin = 0
        self.bin_pos = 0
        # One ( file position, compressed size ) entry per finished bin
        self.bin_index = []
        self.buffer = resize( array( self.default, self.typecode ), ( self.bin_size, ) )
        self.buffer_contains_values = False
        self.comp_type = comp_type
        self.compress = comp_types[comp_type][0]
        self.write_header()
        # Put the fp at the start of the data (we go back and fill in the index at the end)
        self.f.seek( self.data_offset )

    def write_header( self ):
        """
        Write the file header at offset 0 and record where the index
        ( `index_pos` ) and the bin data ( `data_offset` ) start.
        """
        self.f.seek( 0 )
        # Write header
        write_packed( self.f, ">5I", MAGIC, VERSION, self.max_size, self.bin_size, self.nbins )
        # save type code
        self.f.write( pack( 'c', self.typecode ) )
        # write comp type (fixed width of 4 characters, space padded)
        self.f.write( self.comp_type[0:4].ljust( 4 ) )
        # write default value
        a = array( self.default, self.typecode )
        # Struct module can't deal with NaN and endian conversion, we'll hack
        # around that by byteswapping the array
        if platform_is_little_endian:
            a = a.byteswap()
        # NOTE(review): newer NumPy releases deprecate tostring() in favour
        # of tobytes() -- confirm the minimum supported NumPy before switching
        self.f.write( a.tostring() )
        # Save current position (start of bin offsets)
        self.index_pos = self.f.tell()
        self.data_offset = self.index_pos + ( self.nbins * calcsize( ">2I" ) )

    def write_index( self ):
        """Write the ( position, size ) entry of every finished bin at `index_pos`."""
        self.f.seek( self.index_pos )
        for pos, size in self.bin_index:
            write_packed( self.f, ">2I", pos, size )

    def _advance( self ):
        """Move to the next position, flushing and resetting the buffer when a bin fills up."""
        self.bin_pos += 1
        if self.bin_pos == self.bin_size:
            self.flush()
            self.bin_pos = 0
            self.bin += 1
            assert self.bin <= self.nbins
            self.buffer = resize( array( self.default, self.typecode ), ( self.bin_size, ) )
            self.buffer_contains_values = False

    def skip( self ):
        """Leave the current position at the default value and move on."""
        self._advance()

    def write( self, data ):
        """Store `data` at the current position and move on."""
        self.buffer[self.bin_pos] = data
        self.buffer_contains_values = True
        self._advance()

    def flush( self ):
        """
        Record the current bin in the index, writing its compressed data to
        the file if it holds any values; bins without values are recorded
        as ( 0, 0 ) and take no space.
        """
        assert len( self.bin_index ) == self.bin
        if self.buffer_contains_values:
            pos = self.f.tell()
            if platform_is_little_endian:
                s = self.buffer.byteswap().tostring()
            else:
                s = self.buffer.tostring()
            compressed = self.compress( s )
            self.bin_index.append( ( pos, len( compressed ) ) )
            self.f.write( compressed )
        else:
            self.bin_index.append( ( 0, 0 ) )

    def finish( self ):
        """
        Flush the partially filled bin and rewrite the header and index so
        they describe only the bins actually produced.

        Raises ValueError if values were written past the last bin.
        """
        # Space for exactly `self.nbins` index entries was reserved in front
        # of the data. Once every bin has been filled there is no room for a
        # trailing entry: appending one would make the index overwrite the
        # first bytes of the bin data.
        if self.bin < self.nbins:
            self.flush()
        elif self.buffer_contains_values:
            raise ValueError( "Values were written beyond max_size (%d)" % self.max_size )
        self.nbins = len( self.bin_index )
        self.write_header()
        self.write_index()
338
def write_packed( f, pattern, *vals ):
    """Pack `vals` according to the struct format `pattern` and write them to `f`."""
    f.write( pack( pattern, *vals ) )
341   
def read_packed( f, pattern ):
    """
    Read and unpack the values described by the struct format `pattern`
    from `f`.

    Returns the bare value when the pattern describes a single item,
    otherwise a tuple of values.
    """
    rval = unpack( pattern, f.read( calcsize( pattern ) ) )
    if len( rval ) == 1:
        return rval[0]
    return rval
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。