Context Navigation

io.py @ 3

リビジョン 3, 9.1 KB (コミッタ: kohda, 14 年前)
Install Unix tools http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

Rev	行番号
[3]	1	"""
	2	Support for reading and writing genomic intervals from delimited text files.
	3	"""
	4
	5	import sys
	6	from itertools import *
	7	from bx.tabular.io import *
	8	from bx.bitset import *
	9
	10	class MissingFieldError( ParseError ):
	11	pass
	12
	13	class FieldFormatError( ParseError ):
	14	def __init__( self, args, *kwargs):
	15	ParseError.__init__( self, args, *kwargs )
	16	self.expected = kwargs.get("expected",None)
	17	def __str__( self ):
	18	if self.expected:
	19	return ParseError.__str__( self ) + ", " + self.expected + " expected"
	20	else:
	21	return ParseError.__str__( self )
	22
	23	class StrandFormatError( ParseError ):
	24	pass
	25
	26	class GenomicInterval( TableRow ):
	27	"""
	28	A genomic interval stored in a set of fields (a row of a table)
	29	"""
	30	def __init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, default_strand, fix_strand=False ):
	31	TableRow.__init__( self, reader, fields )
	32	self.chrom_col = chrom_col
	33	self.start_col = start_col
	34	self.end_col = end_col
	35	self.strand_col = strand_col
	36	self.nfields = nfields = len( fields )
	37	# Parse chrom/source column
	38	if chrom_col >= nfields:
	39	raise MissingFieldError( "No field for chrom_col (%d)" % chrom_col )
	40	self.chrom = fields[chrom_col]
	41	# Parse start column and ensure it is an integer
	42	if start_col >= nfields:
	43	raise MissingFieldError( "No field for start_col (%d)" % start_col )
	44	try:
	45	self.start = int( fields[start_col] )
	46	except ValueError, e:
	47	raise FieldFormatError( "Could not parse start_col: " + str( e ), expected="integer" )
	48	# Parse end column and ensure it is an integer
	49	if end_col >= nfields:
	50	raise MissingFieldError( "No field for end_col (%d)" % end_col )
	51	try:
	52	self.end = int( fields[end_col] )
	53	except ValueError, e:
	54	raise FieldFormatError( "Could not parse end_col: " + str( e ), expected="integer" )
	55	# Ensure start <= end
	56	if self.end < self.start:
	57	raise ParseError( "Start is greater than End. Interval length is < 1." )
	58	# Parse strand and ensure it is valid
	59	if strand_col >= nfields or strand_col < 0:
	60	# This should probable be immutable since the fields are
	61	# not updated when it is set
	62	self.strand = default_strand
	63	else:
	64	strand = fields[strand_col]
	65	if strand not in ( "+", "-"):
	66	if fix_strand:
	67	strand = "+"
	68	else: raise StrandFormatError( "Strand must be either '+' or '-'" )
	69	self.strand = strand
	70	def __setattr__( self, name, value ):
	71	if name == "chrom":
	72	self.fields[self.chrom_col] = str( value )
	73	elif name == "start":
	74	self.fields[self.start_col] = str( value )
	75	elif name == "end":
	76	self.fields[self.end_col] = str( value )
	77	elif name == "strand":
	78	if self.strand_col < self.nfields and self.strand_col >= 0:
	79	self.fields[self.strand_col] = str( value )
	80	object.__setattr__( self, name, value )
	81	def __str__( self ):
	82	return "\t".join( self.fields )
	83	def copy( self ):
	84	return GenomicInterval(self.reader, list( self.fields ), self.chrom_col, self.start_col, self.end_col, self.strand_col, self.strand)
	85
	86	class GenomicIntervalReader( TableReader ):
	87	"""
	88	Reader for iterating a set of intervals in a tab separated file. Can
	89	also parse header and comment lines if requested.
	90
	91	>>> r = GenomicIntervalReader( [ "#chrom\\tname\\tstart\\tend\\textra",
	92	... "chr1\\tfoo\\t1\\t100\\txxx",
	93	... "chr2\\tbar\\t20\\t300\\txxx",
	94	... "#I am a comment",
	95	... "chr2\\tbar\\t20\\t300\\txxx" ], start_col=2, end_col=3 )
	96	>>> elements = list( r )
	97	>>> assert type( elements[0] ) is Header
	98	>>> str( elements[0] )
	99	'#chrom\\tname\\tstart\\tend\\textra'
	100	>>> assert type( elements[1] ) is GenomicInterval
	101	>>> print elements[1].start, elements[1].end
	102	1 100
	103	>>> str( elements[1] )
	104	'chr1\\tfoo\\t1\\t100\\txxx'
	105	>>> elements[1].start = 30
	106	>>> print elements[1].start, elements[1].end
	107	30 100
	108	>>> str( elements[1] )
	109	'chr1\\tfoo\\t30\\t100\\txxx'
	110	>>> assert type( elements[2] ) is GenomicInterval
	111	>>> assert type( elements[3] ) is Comment
	112	>>> assert type( elements[4] ) is GenomicInterval
	113	"""
	114	def __init__( self, input, chrom_col=0, start_col=1, end_col=2, strand_col=5,
	115	default_strand="+", return_header=True, return_comments=True, force_header=None, fix_strand=False, comment_lines_startswith = ["#", "track "] ):
	116	TableReader.__init__( self, input, return_header, return_comments, force_header, comment_lines_startswith )
	117	self.chrom_col = chrom_col
	118	self.start_col = start_col
	119	self.end_col = end_col
	120	self.strand_col = strand_col
	121	self.default_strand = default_strand
	122	self.fix_strand = fix_strand
	123	def parse_row( self, line ):
	124	return GenomicInterval( self, line.split( "\t" ), self.chrom_col,
	125	self.start_col, self.end_col,
	126	self.strand_col, self.default_strand, fix_strand=self.fix_strand )
	127
	128	def binned_bitsets( self , upstream_pad=0, downstream_pad=0, lens={} ):
	129	# The incoming lens dictionary is a dictionary of chromosome lengths
	130	# which are used to initialize the bitsets.
	131	last_chrom = None
	132	last_bitset = None
	133	bitsets = dict()
	134	for interval in self:
	135	if type( interval ) == GenomicInterval:
	136	chrom = interval[self.chrom_col]
	137	if chrom != last_chrom:
	138	if chrom not in bitsets:
	139	size = lens.get( chrom, MAX )
	140	try:
	141	bbs = BinnedBitSet( size )
	142	except ValueError, e:
	143	# We will only reach here when constructing this bitset from the lens dict
	144	# since the value of MAX is always safe.
	145	raise Exception( "Invalid chrom length %s in 'lens' dictionary. %s" % ( str( size ), str( e ) ) )
	146	bitsets[chrom] = bbs
	147	last_chrom = chrom
	148	last_bitset = bitsets[chrom]
	149	start = max( int( interval[self.start_col] ), 0 )
	150	end = min( int( interval[self.end_col] ), last_bitset.size)
	151	last_bitset.set_range( start, end-start )
	152	return bitsets
	153
	154	class NiceReaderWrapper( GenomicIntervalReader ):
	155	def __init__( self, reader, **kwargs ):
	156	GenomicIntervalReader.__init__( self, reader, **kwargs )
	157	self.outstream = kwargs.get("outstream", None)
	158	self.print_delegate = kwargs.get("print_delegate", None)
	159	self.input_wrapper = iter( self.input )
	160	self.input_iter = self.iterwrapper()
	161	self.skipped = 0
	162	self.skipped_lines = []
	163	def __iter__( self ):
	164	return self
	165	def next( self ):
	166	while 1:
	167	try:
	168	nextitem = GenomicIntervalReader.next( self )
	169	return nextitem
	170	except ParseError, e:
	171	if self.outstream:
	172	if self.print_delegate and hasattr(self.print_delegate,"__call__"):
	173	self.print_delegate( self.outstream, e, self )
	174	self.skipped += 1
	175	# no reason to stuff an entire bad file into memmory
	176	if self.skipped < 10:
	177	self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
	178	def iterwrapper( self ):
	179	while 1:
	180	self.current_line = self.input_wrapper.next()
	181	yield self.current_line
	182
	183	class BitsetSafeReaderWrapper( NiceReaderWrapper ):
	184	def __init__( self, reader, lens={} ):
	185	# This class handles any ValueError, IndexError and OverflowError exceptions that may be thrown when
	186	# the bitsets are being created by skipping the problem lines.
	187	# The incoming lens dictionary is a dictionary of chromosome lengths
	188	# which are used to initialize the bitsets.
	189	# It is assumed that the reader is an interval reader, i.e. it has chr_col, start_col, end_col and strand_col attributes.
	190	NiceReaderWrapper.__init__( self, reader.input, chrom_col=reader.chrom_col, start_col=reader.start_col, end_col=reader.end_col, strand_col=reader.strand_col)
	191	self.lens = lens
	192	def next( self ):
	193	while True:
	194	rval = NiceReaderWrapper.next( self )
	195	if type( rval ) == GenomicInterval and rval.end > self.lens.get( rval.chrom, MAX ): # MAX_INT is defined in bx.bitset
	196	try:
	197	# This will only work if reader is a NiceReaderWrapper
	198	self.skipped += 1
	199	# no reason to stuff an entire bad file into memmory
	200	if self.skipped < 10:
	201	self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
	202	except:
	203	pass
	204	else:
	205	return rval

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/intervals/io.py @ 3

異なるフォーマットでダウンロード: