Context Navigation

binned_array.py

リビジョン 3, 12.4 KB (コミッタ: kohda, 15 年前)
Install Unix tools http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号
1	"""
2	Numeric arrays stored as individually compressed blocks on disk, allowing
3	pseudo-random access.
4
5	`BinnedArray` is used to build such an array in memory and save it to disk.
6	`BinnedArrayWriter` can instead be used when creating the array sequentially
7	(does not require keeping all data in memory). `FileBinnedArray` provides
8	read only access to an on disk binned array.
9	"""
10
11	from __future__ import division
12
13	import math
14
15	from numpy import *
16	from struct import *
17	from bx_extras.lrucache import LRUCache
18
19	import sys
# The on-disk format is big-endian (all headers are packed with ">"), so
# array data has to be byteswapped on little-endian hosts.
platform_is_little_endian = (sys.byteorder == 'little')

# Magic number written as the first header field of every binned array file.
MAGIC = 0x4AB04612

# Version incremented from version 0 to version 1 by Ian Schenck, June
# 23, 2006. Version 1 supports different typecodes, and in doing so
# breaks the original header format. The new FileBinnedArray is
# backwards compatible with version 0.

# Version 1 -> 2 by James Taylor, allow specifying different compression
# types.

VERSION = 2

# Compression types: name -> ( compress, decompress ) pair of callables.
# Only the first four characters of a name are stored in the file header.

comp_types = dict()

comp_types['none'] = (lambda x: x, lambda x: x)

# zlib and lzo are optional; a codec is only registered when its module can
# be imported. Only ImportError is swallowed so that unrelated failures are
# not silently hidden.
try:
    import zlib
    comp_types['zlib'] = (zlib.compress, zlib.decompress)
except ImportError:
    pass

try:
    import lzo
    comp_types['lzo'] = (lzo.compress, lzo.decompress)
except ImportError:
    pass


# Default logical array length (512 Mi elements). The value is written to the
# header as an unsigned 32-bit integer, so it must stay below 2**32.
MAX = 512 * 1024 * 1024
54
class BinnedArray(object):
    """
    Numeric array of `max_size` positions held in memory as fixed-size bins.

    A bin's storage is only allocated the first time a value is stored in it;
    positions in unallocated bins read back as `default`. `to_file` writes
    the array out with each allocated bin compressed individually.
    """

    def __init__(self, bin_size=512*1024, default=NaN, max_size=MAX, typecode="f"):
        """
        bin_size -- number of elements per bin
        default  -- value reported for positions that were never set
        max_size -- logical length of the array
        typecode -- numpy type character used for bin storage
        """
        self.max_size = max_size
        self.bin_size = bin_size
        # True division (see the __future__ import), rounded up so that a
        # partial last bin still gets a slot.
        self.nbins = int(math.ceil(max_size / self.bin_size))
        self.bins = [None] * self.nbins
        self.default = default
        self.typecode = typecode

    def get_bin_offset(self, index):
        """Return ( bin number, offset inside that bin ) for `index`."""
        return index // self.bin_size, index % self.bin_size

    def init_bin(self, index):
        """Allocate bin number `index` and fill it with the default value."""
        self.bins[index] = zeros(self.bin_size, self.typecode)
        self.bins[index][:] = self.default

    def get(self, key):
        """Return the value at position `key` (the default if never set)."""
        bin_index, offset = self.get_bin_offset(key)
        data = self.bins[bin_index]
        if data is None:
            return self.default
        return data[offset]

    def set(self, key, value):
        """Store `value` at position `key`, allocating its bin if needed."""
        bin_index, offset = self.get_bin_offset(key)
        if self.bins[bin_index] is None:
            self.init_bin(bin_index)
        self.bins[bin_index][offset] = value

    def get_range(self, start, end):
        """
        Return the values in positions [start, end) as a single array.

        Unallocated bins contribute runs of the default value, always using
        this array's own typecode. An empty range yields an empty array.
        """
        size = end - start
        assert size >= 0
        chunks = []
        while size > 0:
            bin_index, offset = self.get_bin_offset(start)
            # Take whatever is left of this bin, but never more than requested
            count = min(size, self.bin_size - offset)
            data = self.bins[bin_index]
            if data is None:
                chunks.append(resize(array(self.default, self.typecode), (count,)))
            else:
                chunks.append(data[offset:offset + count])
            size -= count
            start += count
        if not chunks:
            # concatenate() refuses an empty list
            return array([], self.typecode)
        return concatenate(chunks)

    def __getitem__(self, key):
        """Support `arr[i]` and contiguous `arr[a:b]` (stride must be 1)."""
        if isinstance(key, slice):
            start, stop, stride = key.indices(self.max_size)
            assert stride == 1, "Slices with strides are not supported"
            return self.get_range(start, stop)
        else:
            return self.get(key)

    def __setitem__(self, key, value):
        """Support `arr[i] = value`."""
        return self.set(key, value)

    def to_file(self, f, comp_type='zlib'):
        """
        Write the array to `f`, which must support write, tell and seek.

        Layout: five big-endian unsigned ints (magic, version, max_size,
        bin_size, nbins), the typecode character, the compression name
        (first four characters, space padded), the default value, an index
        of ( position, compressed size ) per bin, then the compressed bins.
        Bins that were never allocated are recorded in the index as ( 0, 0 ).

        comp_type -- key into `comp_types` selecting the compressor
        """
        # Get compress method
        compress, _ = comp_types[comp_type]
        # Write header
        write_packed(f, ">5I", MAGIC, VERSION, self.max_size, self.bin_size, self.nbins)
        # save type code
        f.write(pack('c', self.typecode))
        # save compression type
        f.write(comp_type[0:4].ljust(4))
        # write default value
        a = array(self.default, self.typecode)
        # Struct module can't deal with NaN and endian conversion, we'll hack
        # around that by byteswapping the array
        if platform_is_little_endian:
            a = a.byteswap()
        f.write(a.tostring())
        # Save current position (start of bin offsets)
        index_start_pos = f.tell()
        # Skip forward to save space for index
        f.seek(calcsize(">2I") * self.nbins, 1)
        bin_pos_and_size = []
        # Write each bin
        for data in self.bins:
            if data is None:
                bin_pos_and_size.append((0, 0))
            else:
                assert data.dtype.char == self.typecode
                if platform_is_little_endian:
                    s = data.byteswap().tostring()
                else:
                    s = data.tostring()
                compressed = compress(s)
                bin_pos_and_size.append((f.tell(), len(compressed)))
                f.write(compressed)
        # Go back and fill in table
        f.seek(index_start_pos)
        for pos, size in bin_pos_and_size:
            write_packed(f, ">2I", pos, size)
151
class FileBinnedArray(object):
    """
    Read-only access to a binned array stored in the file `f`.

    The header and bin index are read on construction; individual bins are
    read and decompressed on demand and kept in an LRUCache.
    """

    def __init__(self, f, cache=32):
        """
        f     -- file object positioned at the start of the binned array;
                 must support read and seek
        cache -- passed to LRUCache as its size (default 32). The original
                 comment here stated that cache=None lets every bin stay in
                 memory; that depends on LRUCache and is not verified here.
        """
        self.f = f
        M, V, max_size, bin_size, nbins = read_packed(f, ">5I")
        assert M == MAGIC
        # assert version less than max supported
        assert V <= VERSION, "File is version %d but I don't know about anything beyond %d" % (V, VERSION)
        self.max_size = max_size
        self.bin_size = bin_size
        self.nbins = nbins
        self.bins = LRUCache(size=cache)
        # Read typecode (version 0 files have none and always hold 'f')
        if V >= 1:
            self.typecode = unpack('c', f.read(1))[0]
        else:
            self.typecode = 'f'
        # Read compression type (files before version 2 always use zlib)
        if V >= 2:
            self.comp_type = f.read(4).strip()
        else:
            self.comp_type = 'zlib'
        self.decompress = comp_types[self.comp_type][1]
        # Read default value; stored big-endian, so swap on little-endian hosts
        s = f.read(calcsize(self.typecode))
        a = fromstring(s, self.typecode)
        if platform_is_little_endian:
            a = a.byteswap()
        self.default = a[0]
        # Read bin sizes and offsets
        self.bin_pos = []
        self.bin_sizes = []
        for i in range(nbins):
            pos, size = read_packed(f, ">2I")
            self.bin_pos.append(pos)
            self.bin_sizes.append(size)

    def get_bin_offset(self, index):
        """Return ( bin number, offset inside that bin ) for `index` as ints."""
        return int(index // self.bin_size), int(index % self.bin_size)

    def load_bin(self, index):
        """Read, decompress and cache bin number `index` from the file."""
        # Position 0 in the index marks a bin that was never written
        assert self.bin_pos[index] != 0
        self.f.seek(self.bin_pos[index])
        raw = self.f.read(self.bin_sizes[index])
        a = fromstring(self.decompress(raw), self.typecode)
        if platform_is_little_endian:
            a = a.byteswap()
        assert len(a) == self.bin_size
        self.bins[index] = a

    def get(self, key):
        """Return the value at position `key` (the default if never stored)."""
        bin_index, offset = self.get_bin_offset(key)
        if bin_index in self.bins:
            return self.bins[bin_index][offset]
        elif self.bin_pos[bin_index]:
            self.load_bin(bin_index)
            return self.bins[bin_index][offset]
        else:
            return self.default

    def get_range(self, start, end):
        """
        Return the values in positions [start, end) as a single array.

        Bins that are absent from the file contribute runs of the default
        value. The cache is only indexed for bins known to be in it, the
        same way `get` does, so absent bins never trigger a cache lookup.
        An empty range yields an empty array.
        """
        size = end - start
        assert size >= 0
        chunks = []
        while size > 0:
            bin_index, offset = self.get_bin_offset(start)
            # Take whatever is left of this bin, but never more than requested
            count = min(size, self.bin_size - offset)
            if bin_index in self.bins:
                data = self.bins[bin_index]
            elif self.bin_pos[bin_index]:
                self.load_bin(bin_index)
                data = self.bins[bin_index]
            else:
                data = None
            if data is None:
                chunks.append(resize(array(self.default, self.typecode), (count,)))
            else:
                chunks.append(data[offset:offset + count])
            size -= count
            start += count
        if not chunks:
            # concatenate() refuses an empty list
            return array([], self.typecode)
        return concatenate(chunks)

    def __getitem__(self, key):
        """Support `arr[i]` and contiguous `arr[a:b]` (stride must be 1)."""
        if isinstance(key, slice):
            start, stop, stride = key.indices(self.max_size)
            assert stride == 1, "Slices with strides are not supported"
            return self.get_range(start, stop)
        else:
            return self.get(key)
242
class BinnedArrayWriter(object):
    """
    Writes a binned array to a file one position at a time.

    Only the bin currently being filled is held in memory. Call `write` to
    store a value at the next position or `skip` to leave it at the default,
    and call `finish` at the end: the bin index is only written by `finish`.
    """

    def __init__(self, f, bin_size=512*1024, default=NaN, max_size=MAX, typecode="f", comp_type='zlib'):
        """
        f         -- output file; must support write, seek and tell
        bin_size  -- number of elements per bin
        default   -- value for positions that are skipped
        max_size  -- logical length of the array
        typecode  -- numpy type character of the stored values
        comp_type -- key into `comp_types` selecting the compressor
        """
        # All parameters in the constructor are immutable after creation
        self.f = f
        self.max_size = max_size
        self.bin_size = bin_size
        self.nbins = int(math.ceil(max_size / self.bin_size))
        self.default = default
        self.typecode = typecode
        # Number of the bin being filled and the next free slot inside it
        self.bin = 0
        self.bin_pos = 0
        # One ( position, compressed size ) entry per finished bin
        self.bin_index = []
        self.buffer = resize(array(self.default, self.typecode), (self.bin_size,))
        self.buffer_contains_values = False
        self.comp_type = comp_type
        self.compress = comp_types[comp_type][0]
        self.write_header()
        # Put the fp at the start of the data (we go back and fill in the index at the end)
        self.f.seek(self.data_offset)

    def write_header(self):
        """
        Write the file header at offset 0 and record where the index and the
        data start (`index_pos`, `data_offset`).
        """
        self.f.seek(0)
        # Write header
        write_packed(self.f, ">5I", MAGIC, VERSION, self.max_size, self.bin_size, self.nbins)
        # save type code
        self.f.write(pack('c', self.typecode))
        # write comp type
        self.f.write(self.comp_type[0:4].ljust(4))
        # write default value
        a = array(self.default, self.typecode)
        # Struct module can't deal with NaN and endian conversion, we'll hack
        # around that by byteswapping the array
        if platform_is_little_endian:
            a = a.byteswap()
        self.f.write(a.tostring())
        # Save current position (start of bin offsets)
        self.index_pos = self.f.tell()
        self.data_offset = self.index_pos + (self.nbins * calcsize(">2I"))

    def write_index(self):
        """Write the collected ( position, size ) entries at `index_pos`."""
        self.f.seek(self.index_pos)
        for pos, size in self.bin_index:
            write_packed(self.f, ">2I", pos, size)

    def _advance(self):
        """Move to the next position, flushing and resetting a full bin."""
        self.bin_pos += 1
        if self.bin_pos == self.bin_size:
            self.flush()
            self.bin_pos = 0
            self.bin += 1
            assert self.bin <= self.nbins
            self.buffer = resize(array(self.default, self.typecode), (self.bin_size,))
            self.buffer_contains_values = False

    def skip(self):
        """Leave the current position at the default value and move on."""
        self._advance()

    def write(self, data):
        """Store `data` at the current position and move on."""
        self.buffer[self.bin_pos] = data
        self.buffer_contains_values = True
        self._advance()

    def flush(self):
        """
        Append the current bin to the index; its data is compressed and
        written only if at least one value was stored in it, otherwise the
        bin is recorded as ( 0, 0 ).
        """
        # Flush buffer to file
        if self.buffer_contains_values:
            pos = self.f.tell()
            if platform_is_little_endian:
                s = self.buffer.byteswap().tostring()
            else:
                s = self.buffer.tostring()
            compressed = self.compress(s)
            size = len(compressed)
            assert len(self.bin_index) == self.bin
            self.bin_index.append((pos, size))
            self.f.write(compressed)
        else:
            assert len(self.bin_index) == self.bin
            self.bin_index.append((0, 0))

    def finish(self):
        """
        Flush the bin in progress, then rewrite the header with the number
        of bins actually used and write the index.
        """
        if self.bin < self.nbins or self.buffer_contains_values:
            self.flush()
            self.nbins = self.bin + 1
        else:
            # Every bin the header reserved index space for has already been
            # flushed and the buffer is empty. Flushing again would add one
            # index entry too many, and the index would then overwrite the
            # first bytes of bin data.
            self.nbins = self.bin
        self.write_header()
        self.write_index()
338
def write_packed(f, pattern, *vals):
    """Pack `vals` according to the struct format `pattern` and write the bytes to `f`."""
    packed = pack(pattern, *vals)
    f.write(packed)
341
def read_packed(f, pattern):
    """
    Read exactly as many bytes from `f` as the struct format `pattern`
    describes and unpack them. A single-field pattern yields the bare value;
    anything else yields the tuple of values.
    """
    nbytes = calcsize(pattern)
    fields = unpack(pattern, f.read(nbytes))
    return fields[0] if len(fields) == 1 else fields

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/binned_array.py

異なるフォーマットでダウンロード: