root/galaxy-central/eggs/GeneTrack-2.0.0_beta_1_dev_48da9e998f0caf01c5be731e926f4b0481f658f0-py2.6.egg/genetrack/hdflib.py @ 3

Revision 3, 10.4 KB (committed by kohda, 14 years ago)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

1"""
2Utilities for the hierarchical data format (HDF).
3
4"""
5from tables import openFile
6from tables import IsDescription, IntCol, FloatCol
7from genetrack import logger, util, conf
8from itertools import *
9import os, bisect, gc, csv
10
11# missing file
12missing = lambda f: not os.path.isfile(f)
13
14# prints messages after processing chunk  number of lines
15CHUNK = 10**5
16
17class PositionalSchema( IsDescription ):
18    """
19    Stores a triplet of float values for each index.
20    """
21    # the position arguments must be present
22    idx = IntCol  ( pos=1 )  # index
23    fwd = FloatCol( pos=2 )  # values on the forward strand
24    rev = FloatCol( pos=3 )  # value on the reverse strand
25    val = FloatCol( pos=4 )  # weighted value on the combined strands
26
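# A sketch of how tables based on this schema are filled (mirrors what
# PositionalData.build() does below): rows are plain tuples ordered by the
# pos arguments above, e.g.
#
#   table = db.createTable("/", "chr1", PositionalSchema, "label chr1")
#   table.append([(146, 0.0, 1.0, 1.0), (254, 0.0, 3.0, 3.0)])
#   table.flush()
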
class PositionalData(object):
    """
    An HDF representation of coordinates with one or more values associated with
    each of these coordinates. The class can store such data for various labels (chromosomes).
    The default parser built into the class can process files in the following
    format::

        chrom   index   forward reverse value
        chr1    146     0.0     1.0     1.0
        chr1    254     0.0     3.0     3.0
        chr1    319     0.0     1.0     1.0
        chr1    328     0.0     1.0     1.0
        chr1    330     0.0     1.0     1.0
        chr1    339     0.0     1.0     1.0
        chr1    341     1.0     0.0     1.0
        ...

    The default representation is to store a value for the forward and reverse strands,
    and to produce a composite value (stored as the `value` column). In the most common
    case the composite value is simply the sum of the values on the forward
    and reverse strands. The input file must be sorted by both coordinates
    and chromosome (increasing order). Processing is performed in the
    following manner:

    >>> from genetrack import conf
    >>>
    >>> fname = conf.testdata('test-hdflib-input.txt')
    >>> index = PositionalData(fname=fname, workdir=conf.TEMP_DATA_DIR, update=True)

    Upon the first instantiation the index will be created if it did
    not exist or if the `update=True` parameter was set.

    The `workdir` parameter is optional and if present must point
    to the directory into which the resulting index file will be placed.
    The contents of the PositionalData object may be accessed as a list
    but note that only the accessed slice is loaded into memory (lazy access).

    >>> index.labels
    ['chr1', 'chr2', 'chr3']
    >>>
    >>> # this will return the HDF table as implemented in pytables
    >>> table = index.table('chr1')
    >>>
    >>> list(table.cols.idx[:10])
    [146, 254, 319, 328, 330, 339, 341, 342, 345, 362]
    >>>
    >>> list(table.cols.fwd[:10])
    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0]
    >>>
    >>> list(table.cols.rev[:10])
    [1.0, 3.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0]

    We may also find the indices for real coordinates. For example the genomic
    coordinates 400 and 600 map to internal data indices of 20 to 31
    (it works as a binary search that returns the left index).

    >>>
    >>> start, end = index.indices('chr1', 400, 600)
    >>> (start, end)
    (20, 31)

    We may also query for slices of data that span an interval:

    >>> results = index.query( 'chr1', 400, 600)
    >>>
    >>> # the attributes are numeric arrays, here they are cast to lists
    >>>
    >>> list(results.idx)
    [402, 403, 411, 419, 427, 432, 434, 443, 587, 593, 596]
    >>>
    >>> list(results.fwd)
    [0.0, 1.0, 0.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0]
    >>>
    >>> list(results.rev)
    [3.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0]
    >>>
    >>> list(results.val)
    [3.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0]
    >>> index.close()
    >>>

    To provide the fastest parsing the internal parser
    is not overridable. There are transformers that can
    change bed and gff files to this input format. See the
    `genetrack.scripts` module.
    """
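
    # A hypothetical sketch (file name and rows invented for illustration) of
    # writing an input file in the tab delimited format documented above; real
    # inputs are normally produced by the bed/gff transformers in the
    # genetrack.scripts module.
    #
    #   import csv
    #   writer = csv.writer(open('input.txt', 'wt'), delimiter='\t')
    #   writer.writerow(('chrom', 'index', 'forward', 'reverse', 'value'))
    #   writer.writerows([('chr1', 146, 0.0, 1.0, 1.0), ('chr1', 254, 0.0, 3.0, 3.0)])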

    def __init__(self, fname, workdir=None, update=False, nobuild=False, index=None ):
        """
        Create the PositionalData
        """
        self.fname = fname
        self.db = None

        # split the incoming name to find the real name, and base directory
        basedir, basename = os.path.split(self.fname)

        # the index may be stored in the workdir if it was specified
        basedir = workdir or basedir

        # this is the HDF index name that the file operates on
        self.index = index or conf.path_join(basedir, '%s.hdf' % basename)

        # debug messages
        logger.debug('file path %s' % self.fname)
        logger.debug('index path %s' % self.index)

        # no building permitted
        if nobuild and missing(self.index):
            raise Exception('No autobuild allowed and no index found at %s' % self.index)

        # creating indices if these are missing or an update is forced
        if update or missing(self.index):
            self.build()

        # operates on the HDF file
        self.db = openFile(self.index, mode='r')
        self.root = self.db.root

        # shows the internal labels
        logger.debug('index labels -> %s' % self.labels)

    def build(self):
        "May be overridden to use different parsers and schemas"

        logger.info( "file='%s'" % self.fname )
        logger.info( "index='%s'" % self.index)

        # check file for existence
        if missing(self.fname):
            raise IOError('missing data %s' % self.fname)

        # provides timing information
        timer = util.Timer()

        # iterate over the file
        reader = csv.reader( open(self.fname, 'rt'), delimiter='\t' )

        # unwind the reader until it hits the header
        for row in reader:
            if row[0] == 'chrom':
                break

        # helper function that flushes a table
        def flush( table, collect, name ):
            # commit the changes
            if collect:
                table.append(collect)
                table.flush()
                # nicer formatting for the row count
                size = util.commify( len(table) )
                logger.info('table=%s, contains %s rows' % (name, size) )

        # print a message after every CHUNK lines
        last_chrom = table = None
        db = openFile( self.index, mode='w', title='HDF index database')

        # continue on with reading, optimized for throughput
        # with minimal function calls
        collect = []
        for linec, row in izip(count(1), reader):

            # print progress and flush the collected rows periodically
            if (linec % CHUNK) == 0:
                logger.info("... processed %s lines" % util.commify(linec))
                flush( table=table, collect=collect, name=last_chrom )
                collect = []

            # get the values from each row
            chrom, index, fwd, rev, value = row
            index, fwd, rev, value = int(index), float(fwd), float(rev), float(value)

            # flush when switching chromosomes
            if chrom != last_chrom:
                # table==None at the beginning
                if table is not None:
                    #logger.debug("... flushing at line %s" % row)
                    flush( table=table, collect=collect, name=last_chrom )
                    collect = []

                # creates the new HDF table here
                table = db.createTable( "/", chrom, PositionalSchema, 'label %s' % chrom )
                logger.info("creating table:%s" % chrom)
                last_chrom = chrom

            collect.append( (index, fwd, rev, value) )

        # flush for last chromosome, report some timing information
        flush(table, collect, chrom)
        lineno = util.commify(linec)
        elapsed = timer.report()
        logger.info("finished inserting %s lines in %s" % (lineno, elapsed) )

        # close database
        db.close()

    @property
    def labels(self):
        "Labels in the file"
        labs = [ x.name for x in self.root._f_listNodes() ]
        util.nice_sort( labs )
        return labs

    def indices( self, label, start, end, colattr='idx'):
        """
        Returns the array indices that correspond to the start and end values of the index column

        Note that for this to work the values for the column attribute 'colattr'
        in the table must be sorted in increasing order
        """
        table  = self.table( label )
        column = getattr(table.cols, colattr)
        istart = bisect.bisect_left( column, start )
        iend   = bisect.bisect_left( column, end  )
        return istart, iend
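
    # Illustration using the data from the class docstring: on chr1 the idx column
    # is sorted in increasing order, so indices('chr1', 400, 600) performs two
    # bisect_left lookups and returns (20, 31), i.e. the slice of rows whose
    # coordinates fall in the half open interval [400, 600).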

    def query(self, label, start, end, pad=0, aslist=False ):
        """
        Returns data that spans start to end as a class
        with attributes for idx, fwd, rev and val
        """

        step  = 1
        table = self.table( label )
        istart, iend = self.indices(label=label, start=start-pad, end=end+pad)

        idx = table.cols.idx[istart:iend:step]
        fwd = table.cols.fwd[istart:iend:step]
        rev = table.cols.rev[istart:iend:step]
        val = table.cols.val[istart:iend:step]

        # sometimes we need all return values to be lists
        if aslist:
            idx, fwd, rev, val = map(list, (idx, fwd, rev, val))

        params = util.Params( idx=idx, fwd=fwd, rev=rev, val=val )
        return params
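
    # Example (mirrors the class docstring): index.query('chr1', 400, 600) returns
    # a util.Params object whose idx, fwd, rev and val attributes are parallel
    # column slices; pad widens the interval on both sides before the lookup and
    # aslist=True converts the numeric arrays into plain Python lists.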

    def chunks(self, label, size=10**6, step=1 ):
        """
        Returns the data in chunks of the given size. All columns are
        simultaneously iterated over.
        """
        table = self.table( label )
        for start in xrange(0, 10**9, size):
            end = start + size
            idx = table.cols.idx[start:end:step].tolist()
            if not idx:
                break
            fwd = table.cols.fwd[start:end:step].tolist()
            rev = table.cols.rev[start:end:step].tolist()
            val = table.cols.val[start:end:step].tolist()
            yield idx, fwd, rev, val
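
    # Example use (process() is a hypothetical callback): iterate over a label in
    # fixed size pieces without loading the whole table into memory; size counts
    # table rows, not genomic coordinates.
    #
    #   for idx, fwd, rev, val in index.chunks('chr1', size=10**6):
    #       process(idx, fwd, rev, val)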

    def table(self, label):
        return getattr( self.root, label )

    def chromosome(self, label):
        """
        Attempts to get a chromosome when specified by either
        the label or chr1, chr01, chrom01, chrI
        """
        return getattr( self.root, label )


    def close(self):
        if self.db is not None:
            self.db.close()

    def __del__(self):
        self.close()

def test( verbose=0 ):
    """
    Test runner
    """
    import doctest
    doctest.testmod(optionflags=doctest.ELLIPSIS, verbose=verbose)

if __name__ == "__main__":
    test()