root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/interval_index_file.py @ 3

リビジョン 3, 19.0 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""
2Classes for index files that map genomic intervals to values.
3
4:Authors: James Taylor (james@bx.psu.edu), Bob Harris (rsharris@bx.psu.edu)
5
6An interval index file maps genomic intervals to values.
7
8This implementation writes version 1 file format, and reads versions 0 and 1.
9
10Index File Format
11-----------------
12
13All fields are in big-endian format (most significant byte first).
14
15All intervals are origin-zero, inclusive start, exclusive end.
16
17The file begins with an index file header, then is immediately followed
18by an index table.  The index table points to index headers, and index
19headers point to bins.  Index headers and bins are referenced via pointers
20(file offsets), and can be placed more or less anywhere in the file.
21
22File header
23~~~~~~~~~~~
24
25============ ===========   =================================================
26offset 0x00: 2C FF 80 0A   magic number
27offset 0x04: 00 00 00 01   version (00 00 00 00 is also supported)
28offset 0x08: 00 00 00 2A   (N) number of index sets
29offset 0x0C:  ...          index table
30============ ===========   =================================================
31
32Index table
33~~~~~~~~~~~
34
35The index table is a list of N index headers, packed sequentially and
36sorted by name.  The first begins at offset 0x0C.  Each header describes
37one set of intervals.
38
39============ ===========   =================================================
40offset:      xx xx xx xx   (L) length of index src name
41offset+4:     ...          index src name (e.g. canFam1.chr1)
42offset+4+L:  xx xx xx xx   offset (in this file) to index data
43offset+8+L:  xx xx xx xx   (B) number of bytes in each value;  for version
44                           0, this field is absent, and B is assumed to be 4
45============ ===========   =================================================
46
47Index data
48~~~~~~~~~~
49
50The index data (for one index table) consists of the overall range of
51intervals followed by an array of pointers to bins.  The length of the
52array is 1+binForRange(maxEnd-1,maxEnd), where maxEnd is the maximum
53interval end.
54
55============ ===========   =================================================
56offset:      xx xx xx xx   minimum interval start
57offset+4:    xx xx xx xx   maximum interval end
58offset+8:    xx xx xx xx   offset (in this file) to bin 0
59offset+12:   xx xx xx xx   number of intervals in bin 0
60offset+16:   xx xx xx xx   offset (in this file) to bin 1
61offset+20:   xx xx xx xx   number of intervals in bin 1
62...          ...           ...
63============ ===========   =================================================
64
65Bin
66~~~
67
68A bin is an array of (start,end,val), sorted by increasing start (with
69end and val as tiebreakers).  Note that bins may be empty (the number of
70intervals indicated in the index data is zero).  Note that B is determined
71from the appropriate entry in the index table.
72
73============ ===========   =================================================
74offset:      xx xx xx xx   start for interval 1
75offset+4:    xx xx xx xx   end   for interval 1
76offset+8:     ...          (B bytes) value for interval 1
77offset+8+B:  xx xx xx xx   start for interval 2
78offset+12+B: xx xx xx xx   end   for interval 2
79offset+16+B:  ...          (B bytes) value for interval 2
80...          ...           ...
81============ ===========   =================================================
82"""
83
import os.path
from bisect import *
from struct import *
from warnings import warn

from bx.misc import filecache

try:
    from bx.misc import seekbzip2
except:
    seekbzip2 = None

try:
    from bx.misc import seeklzop
except:
    seeklzop = None
100
101__all__ = [ 'Indexes', 'Index' ]
102
103MAGIC = 0x2cff800a
104VERSION = 2
105
106# These three constants determine the structure of the default binning strategy
107BIN_LEVELS = 6        # Number of levels of bins to build
108BIN_FIRST_SHIFT = 17  # Number of bits for the bottom level bin
109BIN_NEXT_SHIFT = 3    # Number of bits for each higher level bin
110
111# Build offset and max size arrays for each bin level
112BIN_OFFSETS = [ 1, 0 ]
113BIN_OFFSETS_MAX = [ ( 1 << BIN_FIRST_SHIFT << BIN_NEXT_SHIFT ), ( 1 << BIN_FIRST_SHIFT ) ]
114for i in range( BIN_LEVELS - 2 ):
115    BIN_OFFSETS.insert( 0, ( 2 ** (3*(i+1)) ) + BIN_OFFSETS[0] )
116    BIN_OFFSETS_MAX.insert( 0, ( BIN_OFFSETS_MAX[0] << BIN_NEXT_SHIFT ) )
117# The maximum size for the top bin is actually bigger than the signed integers
118# we use to store positions in the file, so we'll change it to prevent confusion
119BIN_OFFSETS_MAX[ 0 ] = max
120
121# Constants for the minimum and maximum size of the overall interval
122MIN = 0                     
123OLD_MAX = 512*1024*1024     # Maximum size supported by versions < 2
124DEFAULT_MAX = 512*1024*1024 # Default max size to use when none is passed
125MAX = ( 2 ** 31 )           # Absolute max size (limited by file format)
126
def offsets_for_max_size( max_size ):
    """
    Return the subset of offsets needed to contain intervals over (0,max_size)

    Raises ValueError when max_size exceeds what the binning scheme can hold.
    """
    # Walk levels from smallest to largest capacity; `i` stops at the first
    # level whose capacity can hold max_size.  (The original loop variable
    # was named `max`, shadowing the builtin.)
    for i, bin_max in enumerate( reversed( BIN_OFFSETS_MAX ) ):
        if max_size < bin_max:
            break
    else:
        raise ValueError( "%d is larger than the maximum possible size (%d)" % ( max_size, BIN_OFFSETS_MAX[0] ) )
    return BIN_OFFSETS[ ( len(BIN_OFFSETS) - i - 1 ) : ]
137
def bin_for_range( start, end, offsets=None ):
    """Find the smallest bin that can contain interval (start,end)"""
    if offsets is None:
        offsets = BIN_OFFSETS
    # Convert exclusive end to an inclusive coordinate, then drop both ends
    # to the finest bin level
    start_bin, end_bin = start, end - 1
    start_bin >>= BIN_FIRST_SHIFT
    end_bin >>= BIN_FIRST_SHIFT
    for offset in offsets:
        # When both ends land in the same bin at this level, that bin is the
        # smallest one containing the interval
        if start_bin == end_bin:
            return offset + start_bin
        else:
            start_bin >>= BIN_NEXT_SHIFT
            end_bin >>= BIN_NEXT_SHIFT
    # The original raised a bare string (itself a TypeError on Python >= 2.6)
    # and never interpolated the interval into the message
    raise ValueError( "Interval (%d,%d) out of range" % ( start, end ) )
152
class AbstractMultiIndexedAccess( object ):
    """
    Allows accessing multiple indexes / files as if they were one
    """
    # Subclasses set this to the concrete single-file accessor class
    indexed_access_class = None

    def __init__( self, filenames, index_filenames=None, keep_open=False, use_cache=False, **kwargs ):
        # TODO: Handle index_filenames argument
        self.indexes = []
        for filename in filenames:
            accessor = self.new_indexed_access( filename, keep_open=keep_open, use_cache=use_cache, **kwargs )
            self.indexes.append( accessor )

    def new_indexed_access( self, data_filename, index_filename=None, keep_open=False, **kwargs ):
        """Construct one underlying accessor for `data_filename`"""
        return self.indexed_access_class( data_filename, index_filename, keep_open, **kwargs )

    def get( self, src, start, end ):
        """Return all blocks overlapping (start,end) in `src` as a list"""
        return list( self.get_as_iterator( src, start, end ) )

    def get_as_iterator( self, src, start, end ):
        """Yield just the blocks, discarding the index / offset bookkeeping"""
        for block, _index, _offset in self.get_as_iterator_with_index_and_offset( src, start, end ):
            yield block

    def get_as_iterator_with_index_and_offset( self, src, start, end ):
        """Yield (block, index, offset) triples from each underlying index in order"""
        for index in self.indexes:
            for triple in index.get_as_iterator_with_index_and_offset( src, start, end ):
                yield triple

    def close( self ):
        """Close every underlying accessor"""
        for index in self.indexes:
            index.close()
176
class AbstractIndexedAccess( object ):
    """
    Indexed access to data using overlap queries; requires an index file.

    Subclasses implement `read_at_current_offset` to parse one record from
    the data file.  Plain files are supported directly; seekable .bz2 / .lzo
    compressed data is supported when the optional seekbzip2 / seeklzop
    modules are importable and a companion seek-table ("t") file exists.
    """

    def __init__( self, data_filename, index_filename=None, keep_open=False, use_cache=False, **kwargs ):
        # Extra kwargs are passed through to read_at_current_offset
        self.data_kwargs = kwargs
        self.data_filename = data_filename
        if data_filename.endswith( ".bz2" ):
            if seekbzip2 is None:
                raise Exception( "Trying to open .bz2 file but no seekbzip2 module found")
            # Companion seek-table file, e.g. "data.bz2t"
            table_filename = data_filename + "t"
            self.table_filename = table_filename
            if not os.path.exists( table_filename ):
                raise Exception( "Cannot find bz2t file for: " + data_filename )
            self.file_type = "bz2t"
            # Strip .bz2 from the filename before adding ".index"
            data_filename_root = data_filename[:-4]
        elif data_filename.endswith( ".lzo" ):
            if seeklzop is None:
                raise Exception( "Trying to open .lzo file but no seeklzop module found")
            # Companion seek-table file, e.g. "data.lzot"
            table_filename = data_filename + "t"
            self.table_filename = table_filename
            if not os.path.exists( table_filename ):
                raise Exception( "Cannot find lzot file for: " + data_filename )
            self.file_type = "lzot"
            # Strip .lzo from the filename before adding ".index"
            data_filename_root = data_filename[:-4]
        else:
            self.file_type = "plain"
            data_filename_root = data_filename
        # Open index
        if index_filename is None:
            index_filename = data_filename_root + ".index"
        self.indexes = Indexes( filename=index_filename )
        # Use a file cache?
        self.use_cache = use_cache
        # Open now?  Otherwise each get_at_offset opens and closes the file.
        if keep_open:
            self.f = self.open_data()
        else:
            self.f = None

    def close( self ):
        """Close the persistent data file handle, if one is open"""
        if self.f:
            self.f.close()
            self.f = None

    def open_data( self ):
        """Open the data file according to the file type detected in __init__"""
        if self.file_type == "plain":
            return open( self.data_filename )
        elif self.file_type == "bz2t":
            f = seekbzip2.SeekableBzip2File( self.data_filename, self.table_filename )
            if self.use_cache:
                # Wrap in a cache of the (known) uncompressed size
                return filecache.FileCache( f, f.size )
            else:
                return f
        elif self.file_type == "lzot":
            # For lzo the caching is built into the seekable file itself
            if self.use_cache:
                block_cache_size = 20
            else:
                block_cache_size = 0
            f = seeklzop.SeekableLzopFile( self.data_filename,
                                           self.table_filename,
                                           block_cache_size = block_cache_size )
            return f

    def get( self, src, start, end ):
        """Return all values overlapping (start,end) in `src` as a list"""
        return [ val for val in self.get_as_iterator( src, start, end ) ]
    def get_as_iterator( self, src, start, end ):
        """Yield just the parsed values, discarding index / offset bookkeeping"""
        for val, index, offset in self.get_as_iterator_with_index_and_offset( src, start, end ):
            yield val
    def get_as_iterator_with_index_and_offset( self, src, start, end ):
        """Yield (parsed value, self, file offset) for each overlapping interval"""
        for val_start, val_end, val in self.indexes.find( src, start, end ):
            yield self.get_at_offset( val ), self, val

    def get_at_offset( self, offset ):
        """Seek to `offset` in the data file and parse one record there"""
        if self.f:
            self.f.seek( offset )
            return self.read_at_current_offset( self.f, **self.data_kwargs )
        else:
            # No persistent handle; open, read, and always close
            f = self.open_data()
            try:
                f.seek( offset )
                return self.read_at_current_offset( f, **self.data_kwargs )
            finally:
                f.close()

    def read_at_current_offset( self, file, **kwargs ):
        # Subclasses must parse and return one record from `file`
        raise TypeError( "Abstract Method" )
265
class Indexes:
    """A set of indexes, each identified by a unique name"""

    def __init__( self, filename=None ):
        self.indexes = dict()
        if filename is not None: self.open( filename )

    def add( self, name, start, end, val, max=DEFAULT_MAX ):
        """Add interval (start,end)->val to the index named `name`, creating it if needed"""
        if name not in self.indexes:
            self.indexes[name] = Index( max=max )
        self.indexes[name].add( start, end, val )

    def get( self, name ):
        """Return the Index for `name`, lazily loading it from file if necessary"""
        if self.indexes[name] is None:
            offset, value_size = self.offsets[name]
            self.indexes[name] = Index( filename=self.filename, offset=offset, value_size=value_size, version=self.version )
        return self.indexes[name]

    def find( self, name, start, end ):
        """Return intervals overlapping (start,end) in index `name`, or [] if unknown"""
        if name in self.indexes:
            return self.get( name ).find( start, end )
        else:
            return []

    def open( self, filename ):
        """Read the file header and index table, recording each index's file offset"""
        self.filename = filename
        self.offsets = dict()  # (will map key to (offset,value_size))
        f = open( filename )
        try:
            magic, version, length = read_packed( f, ">3I" )
            if magic != MAGIC:
                # Was `raise "..."` (a string raise), which is itself a
                # TypeError on Python >= 2.6; raise a real exception
                raise Exception( "File does not have expected header" )
            if version > VERSION:
                # The original passed version/VERSION as extra positional
                # arguments, which warn() interprets as a warning *category*
                # (a TypeError); format them into the message instead
                warn( "File claims version %d, I don't know anything about versions beyond %d. Attempting to continue" % ( version, VERSION ) )
            self.version = version
            for i in range( length ):
                key_len = read_packed( f, ">I" )
                key = f.read( key_len )
                offset = read_packed( f, ">I" )
                if version == 0:
                    # Version 0 files had no per-index value size field
                    value_size = 4
                else:
                    value_size = read_packed( f, ">I" )
                    assert value_size % 4 == 0, "unsupported value size: %s" % value_size
                self.indexes[ key ] = None
                self.offsets[ key ] = (offset,value_size)
        finally:
            # Close the header file even if parsing fails part-way
            f.close()

    def write( self, f ):
        """Write the file header, index table, and all index data to open file `f`"""
        # Table entries must be sorted by name (see module docstring)
        keys = sorted( self.indexes.keys() )
        # First determine the size of the header
        base = calcsize( ">3I" )
        for key in keys:
            key = str( key )
            base += calcsize( ">I" )
            base += len( key )
            base += calcsize( ">2I" )
        # Now actually write the header
        write_packed( f, ">3I", MAGIC, VERSION, len( self.indexes ) )
        # And write the index table
        for key in keys:
            key = str( key )
            # Write the string prefixed by its length (pascal!)
            write_packed( f, ">I", len( key ) )
            f.write( key )
            # Write offset
            write_packed( f, ">I", base )
            base += self.indexes[key].bytes_required()
            # Write value size
            write_packed( f, ">I", self.indexes[key].value_size )
        # And finally write each index in order
        for key in keys:
            self.indexes[key].write( f )
339
class Index:
    """
    A single index mapping intervals to non-negative integer values using a
    multi-level binning scheme.  Built empty in memory via `new`, or attached
    to data previously written into a file via `open` (bins then load lazily).
    """

    def __init__( self, min=MIN, max=DEFAULT_MAX, filename=None, offset=0, value_size=None, version=None ):
        # Explicit bytes-per-value if known (from the file's index table);
        # otherwise derived from the largest value added so far
        self._value_size = value_size
        self.max_val = 1   # (1, rather than 0, to force value_size > 0)
        if filename is None:
            self.new( min, max )
        else:
            self.open( filename, offset, version )

    def get_value_size ( self ):
        # Bytes used to store each value; always a multiple of 4
        if self._value_size != None:
            return self._value_size
        else:
            return round_up_to_4( bytes_of( self.max_val ) )
    value_size = property( fget=get_value_size )

    def new( self, min, max ):
        """Create an empty index for intervals in the range min, max"""
        # Ensure the range will fit given the shifting strategy
        assert MIN <= min <= max <= MAX
        self.min = min
        self.max = max
        # Determine offsets to use
        self.offsets = offsets_for_max_size( max )
        # Determine the largest bin we will actually use
        self.bin_count = bin_for_range( max - 1, max, offsets = self.offsets ) + 1
        # Create empty bins
        self.bins = [ [] for i in range( self.bin_count ) ]

    def open( self, filename, offset, version ):
        """Attach to index data stored at `offset` in `filename`; bins load lazily"""
        self.filename = filename
        self.offset = offset
        # Open the file and seek to where we expect our header
        f = open( filename )
        f.seek( offset )
        # Read min/max
        min, max = read_packed( f, ">2I" )
        self.new( min, max )
        # Decide how many levels of bins based on 'max'
        if version < 2:
            # Prior to version 2 all files used the bins for 512MB
            # NOTE(review): self.bin_count was just computed by new() from the
            # current-version offsets and is not recomputed after this
            # override — presumably both strategies agree for max < 512MB;
            # confirm against how version-<2 files were written
            self.offsets = offsets_for_max_size( OLD_MAX - 1 )
        else:
            self.offsets = offsets_for_max_size( max )
        # Read bin indexes
        self.bin_offsets = []
        self.bin_sizes = []
        for i in range( self.bin_count ):
            o, s = read_packed( f, ">2I" )
            self.bin_offsets.append( o )
            self.bin_sizes.append( s )
        # Initialize bins to None, indicating that they need to be loaded
        self.bins = [ None for i in range( self.bin_count ) ]

    def add( self, start, end, val ):
        """Add the interval (start,end) with associated value val to the index"""
        # Insert sorted into the smallest bin containing the interval
        insort( self.bins[ bin_for_range( start, end, offsets=self.offsets ) ], ( start, end, val ) )
        assert val >= 0
        # Track the largest value seen so value_size can be derived
        self.max_val = max(self.max_val,val)

    def find( self, start, end ):
        """Return a sorted list of (start,end,val) for intervals overlapping (start,end)"""
        rval = []
        # Clamp the query to this index's range and drop to the finest level
        start_bin = ( max( start, self.min ) ) >> BIN_FIRST_SHIFT
        end_bin = ( min( end, self.max ) - 1 ) >> BIN_FIRST_SHIFT
        for offset in self.offsets:
            for i in range( start_bin + offset, end_bin + offset + 1 ):
                if self.bins[i] is None: self.load_bin( i )
                # Iterate over bin and insert any overlapping elements into return value
                for el_start, el_end, val in self.bins[i]:
                    if el_start < end and el_end > start:
                        insort_right( rval, ( el_start, el_end, val ) )
            start_bin >>= BIN_NEXT_SHIFT
            end_bin >>= BIN_NEXT_SHIFT
        return rval

    def iterate( self ):
        """Yield every (start,end,val) entry, bin by bin"""
        for i in range( self.bin_count ):
            if self.bins[i] is None: self.load_bin( i )
            for entry in self.bins[i]:  yield entry

    def load_bin( self, index ):
        """Read bin number `index` from disk into self.bins"""
        bin = []
        if self.bin_sizes[index] == 0:
            self.bins[index] = bin
            return
        f = open( self.filename )
        f.seek( self.bin_offsets[index] )
        # One big read for happy NFS
        item_size = self.value_size + calcsize( ">2I" )
        buffer = f.read( self.bin_sizes[index] * item_size )
        for i in range( self.bin_sizes[index] ):
            # Each item is start (4 bytes), end (4 bytes), value (value_size bytes)
            start, end = unpack( ">2I", buffer[ i*item_size : i*item_size+8 ] )
            val = unpack_uints( buffer[ i*item_size+8 : (i+1)*item_size ] )
            bin.append( (start, end, val) )
        self.bins[index] = bin
        f.close()

    def write( self, f ):
        """Write this index's data (min/max, bin table, bin contents) to open file `f`"""
        value_size = self.value_size
        item_size = value_size + calcsize( ">2I" )
        # Write min/max
        write_packed( f, ">2I", self.min, self.max )
        # Write table of bin sizes and offsets
        base = f.tell() + self.bin_count * calcsize( ">2I" )
        for bin in self.bins:
            write_packed( f, ">2I", base, len( bin ) )
            base += len( bin ) * item_size
        # Write contents of each bin
        for bin in self.bins:
            for start, end, val in bin:
                write_packed( f, ">2I", start, end )
                write_packed_uints( f, val, value_size )

    def bytes_required( self ):
        """Return the number of bytes `write` will produce for this index"""
        item_size = self.value_size + calcsize( ">2I" )
        rval = calcsize( ">2I" )
        rval += self.bin_count * calcsize( ">2I" )
        for bin in self.bins:
            rval += len( bin ) * item_size
        return rval
461
def write_packed( f, pattern, *vals ):
    """Pack `vals` with struct `pattern` and write the result to `f`"""
    data = pack( pattern, *vals )
    f.write( data )
464
def read_packed( f, pattern ):
    """Read one struct `pattern` from `f`; a single field is returned unwrapped"""
    raw = f.read( calcsize( pattern ) )
    values = unpack( pattern, raw )
    return values[0] if len( values ) == 1 else values
469
def write_packed_uints( f, v, num_bytes ):
    """
    Write unsigned integer `v` to `f` as `num_bytes` bytes, most-significant
    32-bit chunk first.  A request for fewer than 4 bytes still writes one
    full 4-byte word (the on-disk value size is always a multiple of 4).
    """
    if num_bytes < 4:
        # Minimum encoding is a single 32-bit word
        f.write( pack( ">I", v ) )
    else:
        parts = []
        while num_bytes > 0:
            # 0xFFFFFFFF without the Python 2 `L` suffix, which is a
            # syntax error on Python 3 (value is identical on Python 2)
            parts.append( v & 0xFFFFFFFF )
            v >>= 32
            num_bytes -= 4
        parts.reverse() # (write most-significant chunk first)
        f.write( pack( ">%dI" % len( parts ), *parts ) )
481
def unpack_uints( parts ):
    """
    Decode a big-endian buffer of 32-bit unsigned chunks into one integer.
    `len(parts)` must be a multiple of 4.
    """
    # Floor division: plain `/` would yield a float on Python 3 and break
    # the "%d" struct format string (identical result on Python 2)
    chunks = len( parts ) // 4
    vals = unpack( ">%dI" % chunks, parts )
    # Fold most-significant chunk first
    val = vals[0]
    for v in vals[1:]:
        val = (val << 32) + v
    return val
489
def bytes_of( v ):
    """Return how many bytes are needed to represent positive integer `v`"""
    assert v > 0
    count = 0
    while v:
        count += 1
        v >>= 8
    return count
497
def round_up_to_4( v ):
    """Round `v` up to the nearest multiple of 4"""
    remainder = v % 4
    if remainder:
        return v + ( 4 - remainder )
    return v
503
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。