Context Navigation

hdflib.py @ 3

リビジョン 3, 10.4 KB (コミッタ: kohda, 15 年前)
Install Unix tools http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

Rev	行番号
[3]	1	"""
	2	Utilities for the hierarchical data format (HDF).
	3
	4	"""
	5	from tables import openFile
	6	from tables import IsDescription, IntCol, FloatCol
	7	from genetrack import logger, util, conf
	8	from itertools import *
	9	import os, bisect, gc, csv
	10
	11	# missing file
	12	missing = lambda f: not os.path.isfile(f)
	13
	14	# prints messages after processing chunk number of lines
	15	CHUNK = 10**5
	16
class PositionalSchema( IsDescription ):
    """
    Row layout of the tables written to the index: an integer index
    followed by three float values (forward, reverse and combined).
    """
    # every column is declared with an explicit `pos` argument
    # (the original note states that these arguments must be present)
    idx = IntCol ( pos=1 ) # index
    fwd = FloatCol( pos=2 ) # values on the forward strand
    rev = FloatCol( pos=3 ) # value on the reverse strand
    val = FloatCol( pos=4 ) # weighted value on the combined strands
	26
	27	class PositionalData(object):
	28	"""
	29	An HDF representation of coordinates with one or more values associated with
	30	each of these coordinates. The class can store such data for various labels (chromosomes).
	31	The default parser built into the class can process files in the following
	32	format::
	33
	34	chrom index forward reverse value
	35	chr1 146 0.0 1.0 1.0
	36	chr1 254 0.0 3.0 3.0
	37	chr1 319 0.0 1.0 1.0
	38	chr1 328 0.0 1.0 1.0
	39	chr1 330 0.0 1.0 1.0
	40	chr1 339 0.0 1.0 1.0
	41	chr1 341 1.0 0.0 1.0
	42	...
	43
	44	The default representation is to store a value for the forward and reverse strands,
	45	and to produce a composite value (stored as `value` column). In the most common
	46	case the composite value is simply the sum of the values on the forward
	47	and reverse strands. The input file must be sorted by both coordinates
	48	and chromosome (increasing order). Processing is performed in the
	49	following manner:
	50
	51	>>> from genetrack import conf
	52	>>>
	53	>>> fname = conf.testdata('test-hdflib-input.txt')
	54	>>> index = PositionalData(fname=fname, workdir=conf.TEMP_DATA_DIR, update=True)
	55
	56	Upon the first instantiation the index will be created if it did
	57	not exist or if the `update=True` parameter was set.
	58
	59	The `workdir` parameter is optional and if present must point
	60	to the directory into which the resulting index file will be placed.
	61	The contents of the Positional data object may be accessed as a list
	62	but note that only the accessed slice is loaded into memory (lazy access).
	63
	64	>>> index.labels
	65	['chr1', 'chr2', 'chr3']
	66	>>>
	67	>>> # this will return the HDF table as implemented in pytables
	68	>>> table = index.table('chr1')
	69	>>>
	70	>>> list (table.cols.idx[:10])
	71	[146, 254, 319, 328, 330, 339, 341, 342, 345, 362]
	72	>>>
	73	>>> list( table.cols.fwd[:10])
	74	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0]
	75	>>>
	76	>>> list( table.cols.rev[:10])
	77	[1.0, 3.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0]
	78
	79	We may also find the indices for real coordinates. For example the genomic
	80	coordinates 400 and 600 map to internal data indices of 20 to 31
	81	(it works as a binary search that returns the left index)
	82
	83	>>>
	84	>>> start, end = index.indices('chr1', 400, 600)
	85	>>> (start, end)
	86	(20, 31)
	87
	88	We may also query for slices of data that span over an interval
	89
	90	>>> results = index.query( 'chr1', 400, 600)
	91	>>>
	92	>>> # the attributes are numeric arrays, here are cast to list
	93	>>>
	94	>>> list(results.idx)
	95	[402, 403, 411, 419, 427, 432, 434, 443, 587, 593, 596]
	96	>>>
	97	>>> list(results.fwd)
	98	[0.0, 1.0, 0.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0]
	99	>>>
	100	>>> list(results.rev)
	101	[3.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0]
	102	>>>
	103	>>> list(results.val)
	104	[3.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0]
	105	>>> index.close()
	106	>>>
	107
	108	In order to provide the fastest parsing the internal parser
	109	is not overridable. There are transformers that can
	110	change bed and gff files to this input format. See the
	111	`genetrack.scripts` module.
	112	"""
	113
	114	def __init__(self, fname, workdir=None, update=False, nobuild=False, index=None ):
	115	"""
	116	Create the PositionalData
	117	"""
	118	self.fname = fname
	119	self.db = None
	120
	121	# split the incoming name to find the real name, and base directory
	122	basedir, basename = os.path.split(self.fname)
	123
	124	# the index may be stored in the workdir if it was specified
	125	basedir = workdir or basedir
	126
	127	# this is the HDF index name that the file operates on
	128	self.index = index or conf.path_join(basedir, '%s.hdf' % basename)
	129
	130	# debug messages
	131	logger.debug('file path %s' % self.fname)
	132	logger.debug('index path %s' % self.index)
	133
	134	# no building permitted
	135	if nobuild and missing(self.index):
	136	raise Exception('No autobuild allowed and no index found at %s' % self.index)
	137
	138	# creating indices if these are missing or an update is forced
	139	if update or missing(self.index):
	140	self.build()
	141
	142	# operates on the HDF file
	143	self.db=openFile(self.index, mode='r')
	144	self.root = self.db.root
	145
	146	# shows the internal labels
	147	logger.debug('index labels -> %s' % self.labels)
	148
	149	def build(self):
	150	"May be overriden to use different parsers and schemas"
	151
	152	logger.info( "file='%s'" % self.fname )
	153	logger.info( "index='%s'" % self.index)
	154
	155	# check file for existance
	156	if missing(self.fname):
	157	raise IOError('missing data %s' % self.fname)
	158
	159	# provides timing information
	160	timer = util.Timer()
	161
	162	# iterate over the file
	163	reader = csv.reader( file(self.fname, 'rt'), delimiter='\t' )
	164
	165	# unwind the reader until it hits the header
	166	for row in reader:
	167	if row[0] == 'chrom':
	168	break
	169
	170	# helper function that flushes a table
	171	def flush( table, collect, name ):
	172	# commit the changes
	173	if collect:
	174	table.append(collect)
	175	table.flush()
	176	# nicer information
	177	size = util.commify( len(table) )
	178	logger.info('table=%s, contains %s rows' % (name, size) )
	179
	180	# print messages at every CHUNK line
	181	last_chrom = table = None
	182	db = openFile( self.index, mode='w', title='HDF index database')
	183
	184	# continue on with reading, optimized for throughput
	185	# with minimal function calls
	186	collect = []
	187	for linec, row in izip(count(1), reader):
	188
	189	# prints progress on processing, also flushes to periodically
	190	if (linec % CHUNK) == 0:
	191	logger.info("... processed %s lines" % util.commify(linec))
	192	flush( table=table, collect=collect, name=last_chrom )
	193	collect = []
	194
	195	# get the values from each row
	196	chrom, index, fwd, rev, value = row
	197	fwd, rev, value = float(fwd), float(rev), float(value)
	198
	199	# flush when switching chromosomes
	200	if chrom != last_chrom:
	201	# table==None at the beginning
	202	if table is not None:
	203	#logger.debug("... flushing at line %s" % row)
	204	flush( table=table, collect=collect, name=last_chrom )
	205	collect = []
	206
	207	# creates the new HDF table here
	208	table = db.createTable( "/", chrom, PositionalSchema, 'label %s' % chrom )
	209	logger.info("creating table:%s" % chrom)
	210	last_chrom = chrom
	211
	212	collect.append( (index, fwd, rev, value) )
	213
	214	# flush for last chromosome, report some timing information
	215	flush(table, collect, chrom)
	216	lineno = util.commify(linec)
	217	elapsed = timer.report()
	218	logger.info("finished inserting %s lines in %s" % (lineno, elapsed) )
	219
	220	# close database
	221	db.close()
	222
	223	@property
	224	def labels(self):
	225	"Labels in the file"
	226	labs = [ x.name for x in self.root._f_listNodes() ]
	227	util.nice_sort( labs )
	228	return labs
	229
	230	def indices( self, label, start, end, colattr='idx'):
	231	"""
	232	Returns the array indices that correspond the start, end values of index column
	233
	234	Note that for this to work the values for the column attribute 'colattr'
	235	in the table must be sorted in increasing order
	236	"""
	237	table = self.table( label )
	238	column = getattr(table.cols, colattr)
	239	istart = bisect.bisect_left( column, start )
	240	iend = bisect.bisect_left( column, end )
	241	return istart, iend
	242
	243	def query(self, label, start, end, pad=0, aslist=False ):
	244	"""
	245	Returns data that spans star to end as a class
	246	with attributes for idx, fwd, rev and val
	247	"""
	248
	249	step = 1
	250	table = self.table( label )
	251	istart, iend = self.indices(label=label, start=start-pad, end=end+pad)
	252
	253	idx = table.cols.idx[istart:iend:step]
	254	fwd = table.cols.fwd[istart:iend:step]
	255	rev = table.cols.rev[istart:iend:step]
	256	val = table.cols.val[istart:iend:step]
	257
	258	# sometimes we need all return values to belists
	259	if aslist:
	260	idx, fwd, rev, val = map(list, (idx, fwd, rev, val))
	261
	262	params = util.Params( idx=idx, fwd=fwd, rev=rev, val=val )
	263	return params
	264
	265	def chunks(self, label, size=10**6, step=1 ):
	266	"""
	267	Returns the data as chunks of size. All columns are
	268	simultaneously iterated over.
	269	"""
	270	table = self.table( label )
	271	for start in xrange(0, 10**9, size):
	272	end = start + size
	273	idx = table.cols.idx[start:end:step].tolist()
	274	if not idx:
	275	break
	276	fwd = table.cols.fwd[start:end:step].tolist()
	277	rev = table.cols.rev[start:end:step].tolist()
	278	val = table.cols.val[start:end:step].tolist()
	279	yield idx, fwd, rev, val
	280
	281	def table(self, label):
	282	return getattr( self.root, label )
	283
	284	def chromosome(self, label):
	285	"""
	286	Attempts to get a chromosome when specified by either
	287	the label or chr1, chr01, chrom01, chrI
	288	"""
	289	return getattr( self.root, label )
	290
	291
	292	def close(self):
	293	if self.db is not None:
	294	self.db.close()
	295
    def __del__(self):
        "Calls close() when the object is finalized"
        self.close()
	298
	299	def test( verbose=0 ):
	300	"""
	301	Test runner
	302	"""
	303	import doctest
	304	doctest.testmod(optionflags=doctest.ELLIPSIS)
	305
	306	if __name__ == "__main__":
	307	test()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/eggs/GeneTrack-2.0.0_beta_1_dev_48da9e998f0caf01c5be731e926f4b0481f658f0-py2.6.egg/genetrack/hdflib.py @ 3

異なるフォーマットでダウンロード: