Context Navigation

io.py

リビジョン 3, 9.1 KB (コミッタ: kohda, 15 年前)
Install Unix tools http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号
1	"""
2	Support for reading and writing genomic intervals from delimited text files.
3	"""
4
5	import sys
6	from itertools import *
7	from bx.tabular.io import *
8	from bx.bitset import *
9
class MissingFieldError(ParseError):
    """Parse error signalling that a row has no field at a required column index."""
12
class FieldFormatError(ParseError):
    """
    Parse error signalling that a field could not be parsed as required.

    All positional and keyword arguments are forwarded unchanged to
    ParseError. The optional ``expected`` keyword (for example "integer")
    is additionally stored on the instance and, when truthy, appended to
    the string form of the error as ", <expected> expected".
    """

    def __init__(self, *args, **kwargs):
        # Must be *args / **kwargs: callers pass ``expected=...`` by keyword,
        # and ``kwargs`` has to be a dict for the ``.get`` lookup below.
        ParseError.__init__(self, *args, **kwargs)
        self.expected = kwargs.get("expected", None)

    def __str__(self):
        message = ParseError.__str__(self)
        if self.expected:
            return message + ", " + self.expected + " expected"
        return message
22
class StrandFormatError(ParseError):
    """Parse error signalling that a strand field is neither '+' nor '-'."""
25
class GenomicInterval(TableRow):
    """
    A genomic interval stored in a set of fields (a row of a table).

    The parsed values are exposed as the attributes ``chrom``, ``start``,
    ``end`` and ``strand``. Assigning to any of them also rewrites the
    matching entry of ``fields`` (see ``__setattr__``), so ``str()`` of the
    interval reflects the new value.
    """

    def __init__(self, reader, fields, chrom_col, start_col, end_col, strand_col, default_strand, fix_strand=False):
        """
        Parse ``fields`` into chrom / start / end / strand.

        reader         -- passed through to TableRow.__init__
        fields         -- list of column values for this row
        chrom_col, start_col, end_col, strand_col
                       -- indexes into ``fields``
        default_strand -- strand used when ``strand_col`` is negative or
                          past the end of ``fields``
        fix_strand     -- when true, a strand value other than '+' / '-'
                          is replaced by '+' instead of raising

        Raises MissingFieldError if the chrom, start or end column is past
        the end of ``fields``; FieldFormatError if start or end is not an
        integer; ParseError if end < start; StrandFormatError if the strand
        field is invalid and ``fix_strand`` is false.
        """
        TableRow.__init__(self, reader, fields)
        self.chrom_col = chrom_col
        self.start_col = start_col
        self.end_col = end_col
        self.strand_col = strand_col
        # nfields must be set before ``strand`` is assigned: __setattr__
        # reads it to decide whether a strand field exists.
        self.nfields = nfields = len(fields)
        # Parse chrom/source column
        if chrom_col >= nfields:
            raise MissingFieldError("No field for chrom_col (%d)" % chrom_col)
        self.chrom = fields[chrom_col]
        # Parse start, then end, ensuring each is an integer
        self.start = self._parse_int_field(fields, start_col, "start_col")
        self.end = self._parse_int_field(fields, end_col, "end_col")
        # Ensure start <= end
        if self.end < self.start:
            raise ParseError("Start is greater than End. Interval length is < 1.")
        # Parse strand and ensure it is valid
        if strand_col >= nfields or strand_col < 0:
            # This should probably be immutable since the fields are
            # not updated when it is set
            self.strand = default_strand
        else:
            strand = fields[strand_col]
            if strand not in ("+", "-"):
                if not fix_strand:
                    raise StrandFormatError("Strand must be either '+' or '-'")
                strand = "+"
            self.strand = strand

    @staticmethod
    def _parse_int_field(fields, col, col_name):
        """Return ``int(fields[col])``; ``col_name`` is used in error messages."""
        if col >= len(fields):
            raise MissingFieldError("No field for %s (%d)" % (col_name, col))
        try:
            return int(fields[col])
        except ValueError as e:
            raise FieldFormatError("Could not parse %s: " % col_name + str(e), expected="integer")

    def __setattr__(self, name, value):
        """Set the attribute, mirroring chrom/start/end/strand into ``fields``."""
        if name == "chrom":
            self.fields[self.chrom_col] = str(value)
        elif name == "start":
            self.fields[self.start_col] = str(value)
        elif name == "end":
            self.fields[self.end_col] = str(value)
        elif name == "strand":
            # Only mirror the strand when the row actually has a strand field.
            if 0 <= self.strand_col < self.nfields:
                self.fields[self.strand_col] = str(value)
        object.__setattr__(self, name, value)

    def __str__(self):
        """Return the fields joined by tabs."""
        return "\t".join(self.fields)

    def copy(self):
        """Return a new GenomicInterval built from a copy of this row's fields."""
        return GenomicInterval(self.reader, list(self.fields), self.chrom_col,
                               self.start_col, self.end_col, self.strand_col, self.strand)
85
class GenomicIntervalReader(TableReader):
    """
    Reader for iterating a set of intervals in a tab separated file. Can
    also parse header and comment lines if requested.

    >>> r = GenomicIntervalReader( [ "#chrom\\tname\\tstart\\tend\\textra",
    ...                              "chr1\\tfoo\\t1\\t100\\txxx",
    ...                              "chr2\\tbar\\t20\\t300\\txxx",
    ...                              "#I am a comment",
    ...                              "chr2\\tbar\\t20\\t300\\txxx" ], start_col=2, end_col=3 )
    >>> elements = list( r )
    >>> assert type( elements[0] ) is Header
    >>> str( elements[0] )
    '#chrom\\tname\\tstart\\tend\\textra'
    >>> assert type( elements[1] ) is GenomicInterval
    >>> print( "%d %d" % ( elements[1].start, elements[1].end ) )
    1 100
    >>> str( elements[1] )
    'chr1\\tfoo\\t1\\t100\\txxx'
    >>> elements[1].start = 30
    >>> print( "%d %d" % ( elements[1].start, elements[1].end ) )
    30 100
    >>> str( elements[1] )
    'chr1\\tfoo\\t30\\t100\\txxx'
    >>> assert type( elements[2] ) is GenomicInterval
    >>> assert type( elements[3] ) is Comment
    >>> assert type( elements[4] ) is GenomicInterval
    """

    def __init__(self, input, chrom_col=0, start_col=1, end_col=2, strand_col=5,
                 default_strand="+", return_header=True, return_comments=True,
                 force_header=None, fix_strand=False, comment_lines_startswith=None):
        """
        Store the column layout and hand the remaining options to TableReader.

        ``comment_lines_startswith`` defaults to ``["#", "track "]``; a fresh
        list is built per reader so that instances never share one default.
        """
        if comment_lines_startswith is None:
            comment_lines_startswith = ["#", "track "]
        TableReader.__init__(self, input, return_header, return_comments, force_header, comment_lines_startswith)
        self.chrom_col = chrom_col
        self.start_col = start_col
        self.end_col = end_col
        self.strand_col = strand_col
        self.default_strand = default_strand
        self.fix_strand = fix_strand

    def parse_row(self, line):
        """Split ``line`` on tabs and build a GenomicInterval from the pieces."""
        return GenomicInterval(self, line.split("\t"), self.chrom_col,
                               self.start_col, self.end_col,
                               self.strand_col, self.default_strand,
                               fix_strand=self.fix_strand)

    def binned_bitsets(self, upstream_pad=0, downstream_pad=0, lens=None):
        """
        Consume the reader and return a dict mapping chrom -> BinnedBitSet
        with every interval's range set.

        The incoming ``lens`` dictionary is a dictionary of chromosome
        lengths which are used to initialize the bitsets; chroms missing
        from it get a bitset of size MAX. ``upstream_pad`` and
        ``downstream_pad`` are accepted but not used by this implementation.

        Raises Exception if BinnedBitSet rejects a size with ValueError.
        """
        if lens is None:
            lens = {}
        last_chrom = None
        last_bitset = None
        bitsets = dict()
        for interval in self:
            # Headers and comments are also yielded by the reader; skip them.
            if type(interval) == GenomicInterval:
                chrom = interval[self.chrom_col]
                if chrom != last_chrom:
                    if chrom not in bitsets:
                        size = lens.get(chrom, MAX)
                        try:
                            bbs = BinnedBitSet(size)
                        except ValueError as e:
                            # We will only reach here when constructing this bitset from the lens dict
                            # since the value of MAX is always safe.
                            raise Exception("Invalid chrom length %s in 'lens' dictionary. %s" % (str(size), str(e)))
                        bitsets[chrom] = bbs
                    last_chrom = chrom
                    last_bitset = bitsets[chrom]
                # Clamp the interval to [0, bitset size] before setting it.
                start = max(int(interval[self.start_col]), 0)
                end = min(int(interval[self.end_col]), last_bitset.size)
                last_bitset.set_range(start, end - start)
        return bitsets
153
class NiceReaderWrapper(GenomicIntervalReader):
    """
    Interval reader that skips rows raising ParseError instead of failing.

    Skipped rows are counted in ``skipped``; details of the first nine are
    kept in ``skipped_lines`` as ``(linenum, line, message)`` tuples.
    """

    def __init__(self, reader, **kwargs):
        """
        ``reader`` is the input passed on to GenomicIntervalReader.

        Keyword arguments ``outstream`` and ``print_delegate`` are consumed
        here; all others are forwarded to GenomicIntervalReader.__init__.
        When ``outstream`` is set and ``print_delegate`` is callable, the
        delegate is invoked as ``print_delegate(outstream, error, self)``
        for each skipped row.
        """
        # Remove the wrapper-only options before forwarding: the parent
        # constructor does not accept them and would raise TypeError.
        outstream = kwargs.pop("outstream", None)
        print_delegate = kwargs.pop("print_delegate", None)
        GenomicIntervalReader.__init__(self, reader, **kwargs)
        self.outstream = outstream
        self.print_delegate = print_delegate
        self.input_wrapper = iter(self.input)
        # Route line reading through iterwrapper so the raw text of the
        # current line is available when a row fails to parse.
        # NOTE(review): presumably TableReader pulls lines from
        # self.input_iter -- confirm against bx.tabular.io.
        self.input_iter = self.iterwrapper()
        self.skipped = 0
        self.skipped_lines = []

    def __iter__(self):
        return self

    def next(self):
        """Return the next element, silently skipping rows that fail to parse."""
        while True:
            try:
                return GenomicIntervalReader.next(self)
            except ParseError as e:
                delegate = self.print_delegate
                if self.outstream and delegate and hasattr(delegate, "__call__"):
                    delegate(self.outstream, e, self)
                self.skipped += 1
                # no reason to stuff an entire bad file into memory
                if self.skipped < 10:
                    self.skipped_lines.append((self.linenum, self.current_line, str(e)))

    def iterwrapper(self):
        """Yield each input line, remembering it in ``current_line`` first."""
        # A for loop ends the generator cleanly when the input is exhausted,
        # rather than letting StopIteration escape from inside the generator.
        for line in self.input_wrapper:
            self.current_line = line
            yield line
182
class BitsetSafeReaderWrapper(NiceReaderWrapper):
    """
    Reader wrapper that also skips intervals whose end lies beyond the
    length recorded for their chrom, so that bitsets can be built from the
    remaining rows.
    """

    def __init__(self, reader, lens=None):
        """
        ``reader`` is assumed to be an interval reader, i.e. it has input,
        chrom_col, start_col, end_col and strand_col attributes.

        The incoming ``lens`` dictionary is a dictionary of chromosome
        lengths which are used to initialize the bitsets. It defaults to a
        fresh empty dict per instance.
        """
        NiceReaderWrapper.__init__(self, reader.input, chrom_col=reader.chrom_col,
                                   start_col=reader.start_col, end_col=reader.end_col,
                                   strand_col=reader.strand_col)
        self.lens = {} if lens is None else lens

    def next(self):
        """Return the next element, skipping intervals that end past their chrom limit."""
        while True:
            rval = NiceReaderWrapper.next(self)
            # Headers, comments and anything else that is not an interval pass straight through.
            if type(rval) != GenomicInterval:
                return rval
            # MAX is defined in bx.bitset and is the limit for chroms missing from lens.
            limit = self.lens.get(rval.chrom, MAX)
            if not rval.end > limit:
                return rval
            self.skipped += 1
            # no reason to stuff an entire bad file into memory
            if self.skipped < 10:
                # There is no exception object on this path, so describe the
                # problem explicitly.
                message = "End position %s is beyond the length limit (%s) for '%s'" % (rval.end, limit, rval.chrom)
                self.skipped_lines.append((self.linenum, self.current_line, message))

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/intervals/io.py

異なるフォーマットでダウンロード: