Context Navigation

aggregate_scores_in_intervals.py @ 2

リビジョン 2, 8.5 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central
属性 svn:executable の設定値 ``*

Rev	行番号
[2]	1	#!/usr/bin/env python
	2	# Greg Von Kuster
	3	"""
	4	usage: %prog score_file interval_file chrom start stop [out_file] [options]
	5	-b, --binned: 'score_file' is actually a directory of binned array files
	6	-m, --mask=FILE: bed file containing regions not to consider valid
	7	-c, --chrom_buffer=INT: number of chromosomes (default is 3) to keep in memory when using a user supplied score file
	8	"""
	9
	10	from __future__ import division
	11	from galaxy import eggs
	12	import pkg_resources
	13	pkg_resources.require( "bx-python" )
	14	pkg_resources.require( "lrucache" )
	15	try:
	16	pkg_resources.require( "python-lzo" )
	17	except:
	18	pass
	19
	20	import psyco_full
	21	import sys
	22	import os, os.path
	23	from UserDict import DictMixin
	24	import bx.wiggle
	25	from bx.binned_array import BinnedArray, FileBinnedArray
	26	from bx.bitset import *
	27	from bx.bitset_builders import *
	28	from fpconst import isNaN
	29	from bx.cookbook import doc_optparse
	30	from galaxy.tools.exception_handling import *
	31
	32	assert sys.version_info[:2] >= ( 2, 4 )
	33
	34	import tempfile, struct
	35	class PositionalScoresOnDisk:
	36	fmt = 'f'
	37	fmt_size = struct.calcsize( fmt )
	38	default_value = float( 'nan' )
	39
	40	def __init__( self ):
	41	self.file = tempfile.TemporaryFile( 'w+b' )
	42	self.length = 0
	43	def __getitem__( self, i ):
	44	if i < 0: i = self.length + i
	45	if i < 0 or i >= self.length: return self.default_value
	46	try:
	47	self.file.seek( i * self.fmt_size )
	48	return struct.unpack( self.fmt, self.file.read( self.fmt_size ) )[0]
	49	except Exception, e:
	50	raise IndexError, e
	51	def __setitem__( self, i, value ):
	52	if i < 0: i = self.length + i
	53	if i < 0: raise IndexError, 'Negative assignment index out of range'
	54	if i >= self.length:
	55	self.file.seek( self.length * self.fmt_size )
	56	self.file.write( struct.pack( self.fmt, self.default_value ) * ( i - self.length ) )
	57	self.length = i + 1
	58	self.file.seek( i * self.fmt_size )
	59	self.file.write( struct.pack( self.fmt, value ) )
	60	def __len__( self ):
	61	return self.length
	62	def __repr__( self ):
	63	i = 0
	64	repr = "[ "
	65	for i in xrange( self.length ):
	66	repr = "%s %s," % ( repr, self[i] )
	67	return "%s ]" % ( repr )
	68
	69	class FileBinnedArrayDir( DictMixin ):
	70	"""
	71	Adapter that makes a directory of FileBinnedArray files look like
	72	a regular dict of BinnedArray objects.
	73	"""
	74	def __init__( self, dir ):
	75	self.dir = dir
	76	self.cache = dict()
	77	def __getitem__( self, key ):
	78	value = None
	79	if key in self.cache:
	80	value = self.cache[key]
	81	else:
	82	fname = os.path.join( self.dir, "%s.ba" % key )
	83	if os.path.exists( fname ):
	84	value = FileBinnedArray( open( fname ) )
	85	self.cache[key] = value
	86	if value is None:
	87	raise KeyError( "File does not exist: " + fname )
	88	return value
	89
	90	def stop_err(msg):
	91	sys.stderr.write(msg)
	92	sys.exit()
	93
	94	def load_scores_wiggle( fname, chrom_buffer_size = 3 ):
	95	"""
	96	Read a wiggle file and return a dict of BinnedArray objects keyed
	97	by chromosome.
	98	"""
	99	scores_by_chrom = dict()
	100	try:
	101	for chrom, pos, val in bx.wiggle.Reader( UCSCOutWrapper( open( fname ) ) ):
	102	if chrom not in scores_by_chrom:
	103	if chrom_buffer_size:
	104	scores_by_chrom[chrom] = BinnedArray()
	105	chrom_buffer_size -= 1
	106	else:
	107	scores_by_chrom[chrom] = PositionalScoresOnDisk()
	108	scores_by_chrom[chrom][pos] = val
	109	except UCSCLimitException:
	110	# Wiggle data was truncated, at the very least need to warn the user.
	111	print 'Encountered message from UCSC: "Reached output limit of 100000 data values", so be aware your data was truncated.'
	112	except IndexError:
	113	stop_err('Data error: one or more column data values is missing in "%s"' %fname)
	114	except ValueError:
	115	stop_err('Data error: invalid data type for one or more values in "%s".' %fname)
	116	return scores_by_chrom
	117
	118	def load_scores_ba_dir( dir ):
	119	"""
	120	Return a dict-like object (keyed by chromosome) that returns
	121	FileBinnedArray objects created from "key.ba" files in `dir`
	122	"""
	123	return FileBinnedArrayDir( dir )
	124
	125	def main():
	126
	127	# Parse command line
	128	options, args = doc_optparse.parse( __doc__ )
	129
	130	try:
	131	score_fname = args[0]
	132	interval_fname = args[1]
	133	chrom_col = args[2]
	134	start_col = args[3]
	135	stop_col = args[4]
	136	if len( args ) > 5:
	137	out_file = open( args[5], 'w' )
	138	else:
	139	out_file = sys.stdout
	140	binned = bool( options.binned )
	141	mask_fname = options.mask
	142	except:
	143	doc_optparse.exit()
	144
	145	if score_fname == 'None':
	146	stop_err( 'This tool works with data from genome builds hg16, hg17 or hg18. Click the pencil icon in your history item to set the genome build if appropriate.' )
	147
	148	try:
	149	chrom_col = int(chrom_col) - 1
	150	start_col = int(start_col) - 1
	151	stop_col = int(stop_col) - 1
	152	except:
	153	stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' )
	154
	155	if chrom_col < 0 or start_col < 0 or stop_col < 0:
	156	stop_err( 'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.' )
	157
	158	if binned:
	159	scores_by_chrom = load_scores_ba_dir( score_fname )
	160	else:
	161	try:
	162	chrom_buffer = int( options.chrom_buffer )
	163	except:
	164	chrom_buffer = 3
	165	scores_by_chrom = load_scores_wiggle( score_fname, chrom_buffer )
	166
	167	if mask_fname:
	168	masks = binned_bitsets_from_file( open( mask_fname ) )
	169	else:
	170	masks = None
	171
	172	skipped_lines = 0
	173	first_invalid_line = 0
	174	invalid_line = ''
	175
	176	for i, line in enumerate( open( interval_fname )):
	177	valid = True
	178	line = line.rstrip('\r\n')
	179	if line and not line.startswith( '#' ):
	180	fields = line.split()
	181
	182	try:
	183	chrom, start, stop = fields[chrom_col], int( fields[start_col] ), int( fields[stop_col] )
	184	except:
	185	valid = False
	186	skipped_lines += 1
	187	if not invalid_line:
	188	first_invalid_line = i + 1
	189	invalid_line = line
	190	if valid:
	191	total = 0
	192	count = 0
	193	min_score = 100000000
	194	max_score = -100000000
	195	for j in range( start, stop ):
	196	if chrom in scores_by_chrom:
	197	try:
	198	# Skip if base is masked
	199	if masks and chrom in masks:
	200	if masks[chrom][j]:
	201	continue
	202	# Get the score, only count if not 'nan'
	203	score = scores_by_chrom[chrom][j]
	204	if not isNaN( score ):
	205	total += score
	206	count += 1
	207	max_score = max( score, max_score )
	208	min_score = min( score, min_score )
	209	except:
	210	continue
	211	if count > 0:
	212	avg = total/count
	213	else:
	214	avg = "nan"
	215	min_score = "nan"
	216	max_score = "nan"
	217
	218	# Build the resulting line of data
	219	out_line = []
	220	for k in range(0, len(fields)):
	221	out_line.append(fields[k])
	222	out_line.append(avg)
	223	out_line.append(min_score)
	224	out_line.append(max_score)
	225
	226	print >> out_file, "\t".join( map( str, out_line ) )
	227	else:
	228	skipped_lines += 1
	229	if not invalid_line:
	230	first_invalid_line = i + 1
	231	invalid_line = line
	232	elif line.startswith( '#' ):
	233	# We'll save the original comments
	234	print >> out_file, line
	235
	236	out_file.close()
	237
	238	if skipped_lines > 0:
	239	print 'Data issue: skipped %d invalid lines starting at line #%d which is "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
	240	if skipped_lines == i:
	241	print 'Consider changing the metadata for the input dataset by clicking on the pencil icon in the history item.'
	242
	243	if __name__ == "__main__": main()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/stats/aggregate_scores_in_intervals.py @ 2

異なるフォーマットでダウンロード: