Context Navigation

binned_array.py @ 3

リビジョン 3, 12.4 KB (コミッタ: kohda, 14 年前)
Install Unix tools http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

Rev	行番号
[3]	1	"""
	2	Numeric arrays stored as individually compressed blocks on disk, allowing
	3	pseudo-random access.
	4
	5	`BinnedArray` is used to build such an array in memory and save it to disk.
	6	`BinnedArrayWriter` can instead be used when creating the array sequentially
	7	(does not require keeping all data in memory). `FileBinnedArray` provides
	8	read only access to an on disk binned array.
	9	"""
	10
	11	from __future__ import division
	12
	13	import math
	14
	15	from numpy import *
	16	from struct import *
	17	from bx_extras.lrucache import LRUCache
	18
	19	import sys
	20	platform_is_little_endian = ( sys.byteorder == 'little' )
	21
	22	MAGIC=0x4AB04612
	23
	24	# Version incremented from version 0 to version 1 by Ian Schenck, June
	25	# 23, 2006. Version 1 supports different typecodes, and in doing so
	26	# breaks the original header format. The new FileBinnedArray is
	27	# backwards compatible with version 0.
	28
	29	# Version 1 -> 2 by James Taylor, allow specifying different compression
	30	# types.
	31
	32	VERSION=2
	33
	34	# Compression types
	35
	36	comp_types = dict()
	37
	38	comp_types['none'] = ( lambda x: x, lambda x: x )
	39
	40	try:
	41	import zlib
	42	comp_types['zlib'] = ( zlib.compress, zlib.decompress )
	43	except:
	44	pass
	45
	46	try:
	47	import lzo
	48	comp_types['lzo'] = ( lzo.compress, lzo.decompress )
	49	except:
	50	pass
	51
	52
	53	MAX=51210241024
	54
	55	class BinnedArray( object ):
	56	def __init__( self, bin_size=512*1024, default=NaN, max_size=MAX, typecode="f" ):
	57	self.max_size = max_size
	58	self.bin_size = bin_size
	59	self.nbins = int( math.ceil( ( max_size / self.bin_size ) ) )
	60	self.bins = [ None ] * self.nbins
	61	self.default = default
	62	self.typecode = typecode
	63	def get_bin_offset( self, index ):
	64	return index // self.bin_size, index % self.bin_size
	65	def init_bin( self, index ):
	66	# self.bins[index] = zeros( self.bin_size ) * self.default
	67	self.bins[index] = zeros( self.bin_size, self.typecode )
	68	self.bins[index][:] = self.default
	69	def get( self, key ):
	70	bin, offset = self.get_bin_offset( key )
	71	if self.bins[bin] is None:
	72	return self.default
	73	else:
	74	return self.bins[bin][offset]
	75	def set( self, key, value ):
	76	bin, offset = self.get_bin_offset( key )
	77	if self.bins[bin] is None:
	78	self.init_bin( bin )
	79	self.bins[bin][offset] = value
	80	def get_range( self, start, end ):
	81	size = end - start
	82	assert size >= 0
	83	rval = []
	84	while size > 0:
	85	bin, offset = self.get_bin_offset( start )
	86	delta = self.bin_size - offset
	87	if self.bins[bin] is None:
	88	if delta < size:
	89	rval.append( resize( array(self.default, self.typecode), (delta,) ) )
	90	size -= delta
	91	start += delta
	92	else:
	93	rval.append( resize( array(self.default, "f"), (size,) ) )
	94	size = 0
	95	else:
	96	if delta < size:
	97	rval.append( self.bins[bin][offset:offset+delta] )
	98	size -= delta
	99	start += delta
	100	else:
	101	rval.append( self.bins[bin][offset:offset+size] )
	102	size = 0
	103	return concatenate( rval )
	104	def __getitem__( self, key ):
	105	if isinstance( key, slice ):
	106	start, stop, stride = key.indices( self.max_size )
	107	assert stride == 1, "Slices with strides are not supported"
	108	return self.get_range( start, stop )
	109	else:
	110	return self.get( key )
	111	def __setitem__( self, key, value ):
	112	return self.set( key, value )
	113	def to_file( self, f, comp_type='zlib' ):
	114	# Get compress method
	115	compress, _ = comp_types[comp_type]
	116	# Write header
	117	write_packed( f, ">5I", MAGIC, VERSION, self.max_size, self.bin_size, self.nbins )
	118	# save type code
	119	f.write( pack('c',self.typecode ) )
	120	# save compression type
	121	f.write( comp_type[0:4].ljust( 4 ) )
	122	# write default value
	123	a = array( self.default, self.typecode )
	124	# Struct module can't deal with NaN and endian conversion, we'll hack
	125	# around that by byteswapping the array
	126	if platform_is_little_endian:
	127	a = a.byteswap()
	128	f.write( a.tostring() )
	129	# Save current position (start of bin offsets)
	130	index_start_pos = f.tell()
	131	# Skip forward to save space for index
	132	f.seek( calcsize( ">2I" ) * self.nbins, 1 )
	133	bin_pos_and_size = []
	134	# Write each bin
	135	for bin in self.bins:
	136	if bin is None:
	137	bin_pos_and_size.append( ( 0, 0 ) )
	138	else:
	139	assert bin.dtype.char == self.typecode
	140	if platform_is_little_endian:
	141	s = bin.byteswap().tostring()
	142	else:
	143	s = bin.tostring()
	144	compressed = compress( s )
	145	bin_pos_and_size.append( ( f.tell(), len( compressed ) ) )
	146	f.write( compressed )
	147	# Go back and fill in table
	148	f.seek( index_start_pos )
	149	for pos, size in bin_pos_and_size:
	150	write_packed( f, ">2I", pos, size )
	151
	152	class FileBinnedArray( object ):
	153	def __init__( self, f, cache=32):
	154	# If cache=None, then everything is allowed to stay in memory,
	155	# this is the default behavior.
	156	self.f = f
	157	M, V, max_size, bin_size, nbins = read_packed( f, ">5I" )
	158	assert M == MAGIC
	159	# assert version less than max supported
	160	assert V <= VERSION, "File is version %d but I don't know about anything beyond %d" % ( V, VERSION )
	161	self.max_size = max_size
	162	self.bin_size = bin_size
	163	self.nbins = nbins
	164	self.bins = LRUCache(size=cache)
	165	# Read typecode
	166	if V >= 1:
	167	self.typecode = unpack( 'c', f.read(1) )[0]
	168	else:
	169	self.typecode = 'f'
	170	# Read compression type
	171	if V >= 2:
	172	self.comp_type = f.read( 4 ).strip()
	173	else:
	174	self.comp_type = 'zlib'
	175	self.decompress = comp_types[self.comp_type][1]
	176	# Read default value
	177	s = f.read( calcsize( self.typecode ) )
	178	a = fromstring( s, self.typecode )
	179	if platform_is_little_endian:
	180	a = a.byteswap()
	181	self.default = a[0]
	182	# Read bin sizes and offsets
	183	self.bin_pos = []
	184	self.bin_sizes = []
	185	for i in range( nbins ):
	186	pos, size = read_packed( f, ">2I" )
	187	self.bin_pos.append( pos )
	188	self.bin_sizes.append( size )
	189	def get_bin_offset( self, index ):
	190	return int( index // self.bin_size ), int( index % self.bin_size )
	191	def load_bin( self, index ):
	192	assert self.bin_pos[index] != 0
	193	self.f.seek( self.bin_pos[index] )
	194	raw = self.f.read( self.bin_sizes[index] )
	195	a = fromstring( self.decompress( raw ), self.typecode )
	196	if platform_is_little_endian:
	197	a = a.byteswap()
	198	assert len( a ) == self.bin_size
	199	self.bins[index] = a
	200	def get( self, key ):
	201	bin, offset = self.get_bin_offset( key )
	202	if bin in self.bins:
	203	return self.bins[bin][offset]
	204	elif self.bin_pos[bin]:
	205	self.load_bin( bin )
	206	return self.bins[bin][offset]
	207	else:
	208	return self.default
	209	def get_range( self, start, end ):
	210	size = end - start
	211	assert size >= 0
	212	rval = []
	213	while size > 0:
	214	bin, offset = self.get_bin_offset( start )
	215	delta = self.bin_size - offset
	216	if not bin in self.bins and self.bin_pos[bin] != 0:
	217	self.load_bin( bin )
	218	if self.bins[bin] is None:
	219	if delta < size:
	220	rval.append( resize( array(self.default, self.typecode), (delta,) ) )
	221	size -= delta
	222	start += delta
	223	else:
	224	rval.append( resize( array(self.default, self.typecode), (size,) ) )
	225	size = 0
	226	else:
	227	if delta < size:
	228	rval.append( self.bins[bin][offset:offset+delta] )
	229	size -= delta
	230	start += delta
	231	else:
	232	rval.append( self.bins[bin][offset:offset+size] )
	233	size = 0
	234	return concatenate( rval )
	235	def __getitem__( self, key ):
	236	if isinstance( key, slice ):
	237	start, stop, stride = key.indices( self.max_size )
	238	assert stride == 1, "Slices with strides are not supported"
	239	return self.get_range( start, stop )
	240	else:
	241	return self.get( key )
	242
	243	class BinnedArrayWriter( object ):
	244	def __init__( self, f, bin_size=512*1024, default=NaN, max_size=MAX, typecode="f", comp_type='zlib' ):
	245	# All parameters in the constructor are immutable after creation
	246	self.f = f
	247	self.max_size = max_size
	248	self.bin_size = bin_size
	249	self.nbins = int( math.ceil( ( max_size / self.bin_size ) ) )
	250	self.default = default
	251	self.typecode = typecode
	252	self.bin = 0
	253	self.bin_pos = 0
	254	self.bin_index = []
	255	self.buffer = resize( array(self.default, self.typecode), (self.bin_size,) )
	256	self.buffer_contains_values = False
	257	self.comp_type = comp_type
	258	self.compress = comp_types[comp_type][0]
	259	self.write_header()
	260	# Start the first bin
	261	## self.bin_index = [ (self.data_offset, 0) ]
	262	# Put the fp at the start of the data (we go back and fill in the index at the end)
	263	self.f.seek( self.data_offset )
	264
	265	def write_header( self ):
	266	self.f.seek(0)
	267	# Write header
	268	write_packed( self.f, ">5I", MAGIC, VERSION, self.max_size, self.bin_size, self.nbins )
	269	# save type code
	270	self.f.write( pack('c',self.typecode ) )
	271	# write default value
	272	a = array( self.default, self.typecode )
	273	# write comp type
	274	self.f.write( self.comp_type[0:4].ljust(4) )
	275	# write default
	276	# Struct module can't deal with NaN and endian conversion, we'll hack
	277	# around that by byteswapping the array
	278	if platform_is_little_endian:
	279	a = a.byteswap()
	280	self.f.write( a.tostring() )
	281	# Save current position (start of bin offsets)
	282	self.index_pos = self.f.tell()
	283	self.data_offset = self.index_pos + (self.nbins * calcsize( ">2I" ))
	284
	285	def write_index( self ):
	286	self.f.seek(self.index_pos)
	287	for pos, size in self.bin_index:
	288	write_packed( self.f, ">2I", pos, size )
	289
	290	def skip( self ):
	291	self.bin_pos += 1
	292	if self.bin_pos == self.bin_size:
	293	self.flush()
	294	self.bin_pos = 0
	295	self.bin += 1
	296	assert self.bin <= self.nbins
	297	self.buffer = resize( array(self.default, self.typecode), (self.bin_size,) )
	298	self.buffer_contains_values = False
	299	## self.bin_index.append( (self.f.tell(), 0) )
	300
	301	def write( self, data ):
	302	self.buffer[self.bin_pos] = data
	303	self.buffer_contains_values = True
	304	self.bin_pos += 1
	305	if self.bin_pos == self.bin_size:
	306	self.flush()
	307	self.bin_pos = 0
	308	self.bin += 1
	309	assert self.bin <= self.nbins
	310	self.buffer = resize( array(self.default, self.typecode), (self.bin_size,) )
	311	self.buffer_contains_values = False
	312	## self.bin_index.append( (self.f.tell(), 0) )
	313
	314	def flush( self ):
	315	# Flush buffer to file
	316	if self.buffer_contains_values:
	317	## pos, size = self.bin_index[self.bin]
	318	## self.f.seek( pos )
	319	pos = self.f.tell()
	320	if platform_is_little_endian:
	321	s = self.buffer.byteswap().tostring()
	322	else:
	323	s = self.buffer.tostring()
	324	compressed = self.compress( s )
	325	size = len( compressed )
	326	assert len( self.bin_index ) == self.bin
	327	self.bin_index.append( ( pos, size ) )
	328	self.f.write( compressed )
	329	else:
	330	assert len( self.bin_index ) == self.bin
	331	self.bin_index.append( ( 0, 0 ) )
	332
	333	def finish( self ):
	334	self.flush()
	335	self.nbins = self.bin + 1
	336	self.write_header()
	337	self.write_index()
	338
	339	def write_packed( f, pattern, *vals ):
	340	f.write( pack( pattern, *vals ) )
	341
	342	def read_packed( f, pattern ):
	343	rval = unpack( pattern, f.read( calcsize( pattern ) ) )
	344	if len( rval ) == 1: return rval[0]
	345	return rval

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/binned_array.py @ 3

異なるフォーマットでダウンロード: