Context Navigation

core.py

リビジョン 3, 16.6 KB (コミッタ: kohda, 15 年前)
Install Unix tools http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号
1	"""
2	Classes that represent alignments between multiple sequences.
3	"""
4
5	import random
6	import string
7	import weakref
8	from bx.misc.readlengths import read_lengths_file
9
10	# DNA reverse complement table
11	## DNA_COMP = " - " \
12	## " TVGH CD M KN YSA BWXR tvgh cd m kn ysa bwxr " \
13	## " " \
14	## " "
15
# Translation table that swaps each unambiguous DNA base with its complement
# (case preserved); any other character is left untouched by str.translate.
# string.maketrans only exists on Python 2 and str.maketrans only on Python 3,
# so pick whichever one this interpreter provides.
_maketrans = getattr( str, "maketrans", None ) or string.maketrans
DNA_COMP = _maketrans( "ACGTacgt", "TGCAtgca" )
17
class Alignment( object ):
    """
    A block of aligned sequences: a score, a dict of free-form attributes and
    an ordered list of components whose texts all span `text_size` columns.
    """

    def __init__( self, score=0, attributes=None, species_to_lengths=None ):
        # species_to_lengths is needed only for file formats that don't provide
        # chromosome lengths; it maps each species name to one of these:
        #   - the name of a file that contains a list of chromosome length pairs
        #   - a dict mapping chromosome names to their length
        #   - a single length value (useful when we just have one sequence and
        #     no chromosomes)
        # internally a file name is replaced by a dict, but only on an "as
        # needed" basis
        self.score = score
        self.text_size = 0
        # A fresh dict per instance: a `{}` default in the signature would be
        # shared by every alignment constructed without explicit attributes.
        if attributes is None:
            attributes = {}
        self.attributes = attributes
        if species_to_lengths is None:
            species_to_lengths = {}
        self.species_to_lengths = species_to_lengths
        self.components = []

    def add_component( self, component ):
        """
        Append `component` and make it refer back to this alignment.

        Raises Exception if the component has text whose length differs from
        the text length already established for this alignment.
        """
        # Weak reference so that components do not keep the alignment alive.
        component._alignment = weakref.ref( self )
        self.components.append( component )
        if component.text is not None:
            if self.text_size == 0:
                self.text_size = len( component.text )
            elif self.text_size != len( component.text ):
                raise Exception( "Components must have same text length" )

    def get_score( self ):
        return self.__score

    def set_score( self, score ):
        """
        Store the score; a string is converted to int, else to float, and is
        kept as the original string when it is not numeric.
        """
        if isinstance( score, str ):
            try:
                score = int( score )
            except ValueError:
                try:
                    score = float( score )
                except ValueError:
                    pass
        self.__score = score

    score = property( fget=get_score, fset=set_score )

    def __str__( self ):
        """Return an 'a' header line followed by one line per component."""
        header = "a score=" + str( self.score )
        for key in self.attributes:
            header += " %s=%s" % ( key, self.attributes[key] )
        lines = [ header ]
        lines.extend( [ str( c ) for c in self.components ] )
        return "\n".join( lines ) + "\n"

    def src_size( self, src ):
        """
        Return the length of the sequence named by `src`, looked up through
        species_to_lengths.

        Raises ValueError if no length information is available for `src`.
        """
        species, chrom = src_split( src )
        if species in self.species_to_lengths:
            chrom_to_length = self.species_to_lengths[species]
        elif chrom in self.species_to_lengths:
            # species_to_lengths is itself a chromosome -> length mapping
            chrom_to_length = self.species_to_lengths
        else:
            raise ValueError( "no src_size (no length file for %s)" % species )
        if isinstance( chrom_to_length, int ):  # (if it's a single length)
            return chrom_to_length
        if isinstance( chrom_to_length, str ):  # (if it's a file name)
            chrom_to_length = read_lengths_file( chrom_to_length )
            # cache the parsed file so it is read at most once per species
            self.species_to_lengths[species] = chrom_to_length
        if chrom not in chrom_to_length:
            raise ValueError( "no src_size (%s has no length for %s)" % ( species, chrom ) )
        return chrom_to_length[chrom]

    def get_component_by_src( self, src ):
        """Return the first component whose src equals `src`, or None."""
        for c in self.components:
            if c.src == src:
                return c
        return None

    def get_component_by_src_start( self, src ):
        """Return the first component whose src starts with `src`, or None."""
        for c in self.components:
            if c.src.startswith( src ):
                return c
        return None

    def slice( self, start, end ):
        """
        Return a new alignment holding columns start..end (end excluded) of
        every non-empty component. The attributes dict is shared with the
        original alignment, not copied.
        """
        new = Alignment( score=self.score, attributes=self.attributes )
        for component in self.components:
            # FIXME: Is this the right solution?
            if component.empty:
                continue
            new.components.append( component.slice( start, end ) )
        new.text_size = end - start
        return new

    def reverse_complement( self ):
        """
        Return a new alignment whose components are the reverse complements
        of this alignment's components.
        """
        new = Alignment( score=self.score, attributes=self.attributes )
        for component in self.components:
            new.components.append( component.reverse_complement() )
        new.text_size = self.text_size
        return new

    def slice_by_component( self, component_index, start, end ):
        """
        Return a slice of the alignment, corresponding to a coordinate
        interval in a specific component.

        component_index is one of
            an integer offset into the components list
            a string indicating the src of the desired component
            a component

        start and end are relative to the + strand, regardless of the
        component's strand.

        Raises ValueError if component_index is of another type or names a
        src that is not part of this alignment.
        """
        if isinstance( component_index, int ):
            ref = self.components[ component_index ]
        elif isinstance( component_index, str ):
            ref = self.get_component_by_src( component_index )
            if ref is None:
                raise ValueError( "no component with src %s" % component_index )
        elif isinstance( component_index, Component ):
            ref = component_index
        else:
            raise ValueError( "can't figure out what to do" )
        start_col = ref.coord_to_col( start )
        end_col = ref.coord_to_col( end )
        if ref.strand == '-':
            # on the - strand the columns come back in descending order
            start_col, end_col = end_col, start_col
        return self.slice( start_col, end_col )

    def column_iter( self ):
        """Yield, for each column, the list of characters of all components."""
        for i in range( self.text_size ):
            yield [ c.text[i] for c in self.components ]

    def limit_to_species( self, species ):
        """
        Return a new alignment containing only the components whose src
        prefix (the text before the first '.') is in `species`.

        The components themselves are not copied: they are re-attached to the
        new alignment by add_component.
        """
        new = Alignment( score=self.score, attributes=self.attributes )
        new.text_size = self.text_size
        for component in self.components:
            if component.src.split( '.' )[0] in species:
                new.add_component( component )
        return new

    def remove_all_gap_columns( self ):
        """
        Remove any columns containing only gaps from alignment components,
        text of components is modified IN PLACE.
        """
        # Components without text (text is None) take no part in the decision.
        texts = [ c.text for c in self.components ]
        with_text = [ t for t in texts if t is not None ]
        # Decide once which columns survive, then rebuild each text in a
        # single pass instead of deleting one column at a time.
        keep = []
        for i in range( self.text_size ):
            for t in with_text:
                if t[i] != '-':
                    keep.append( i )
                    break
        for component, t in zip( self.components, texts ):
            if t is None:
                continue
            component.text = ''.join( [ t[i] for i in keep ] )
            # any cached coordinate -> column index describes the old text
            component.index = None
        self.text_size = len( keep )

    def __eq__( self, other ):
        if other is None or type( other ) != type( self ):
            return False
        if self.score != other.score:
            return False
        if self.attributes != other.attributes:
            return False
        if len( self.components ) != len( other.components ):
            return False
        for c1, c2 in zip( self.components, other.components ):
            if c1 != c2:
                return False
        return True

    def __ne__( self, other ):
        return not( self.__eq__( other ) )

    def __deepcopy__( self, memo ):
        """Return a copy with its own attributes, length table and components."""
        from copy import deepcopy
        new = Alignment( score=self.score,
                         attributes=deepcopy( self.attributes, memo ),
                         species_to_lengths=deepcopy( self.species_to_lengths, memo ) )
        for component in self.components:
            new.add_component( deepcopy( component, memo ) )
        return new
203
class Component( object ):
    """
    One row of an alignment: a source sequence name, the aligned interval on
    that source and the (gapped) alignment text.
    """

    def __init__( self, src='', start=0, size=0, strand=None, src_size=None, text='' ):
        self._alignment = None
        self.src = src
        self.start = start          # Nota Bene: start,size,strand are as they
        self.size = size            # .. appear in a MAF file-- origin-zero, end
        self.strand = strand        # .. excluded, and minus strand counts from
        self._src_size = src_size   # .. end of sequence
        self.text = text
        self.quality = None
        # Optional fields to keep track of synteny status (only makes sense
        # when the alignment is part of an ordered set)
        self.synteny_left = None
        self.synteny_right = None
        self.synteny_empty = None
        # If true, this component actually represents a non-aligning region,
        # and has no text.
        self.empty = False
        # Index maps a coordinate (distance along + strand from + start) to
        # alignment column; built lazily by coord_to_col
        self.index = None

    def __str__( self ):
        """Return the 'e' or 's' line, plus an 'i' line when synteny is set."""
        if self.empty:
            rval = "e %s %d %d %s %d %s" % ( self.src, self.start,
                                             self.size, self.strand,
                                             self.src_size, self.synteny_empty )
        else:
            rval = "s %s %d %d %s %d %s" % ( self.src, self.start,
                                             self.size, self.strand,
                                             self.src_size, self.text )
            if self.synteny_left and self.synteny_right:
                rval += "\ni %s %s %d %s %d" % ( self.src,
                                                 self.synteny_left[0], self.synteny_left[1],
                                                 self.synteny_right[0], self.synteny_right[1] )
        return rval

    def get_end( self ):
        return self.start + self.size
    end = property( fget=get_end )

    def get_src_size( self ):
        """
        Return the length of the source sequence, asking the owning alignment
        (and caching the answer) when it was not given explicitly.

        Raises ValueError if the size is unknown and there is no alignment to
        ask.
        """
        if self._src_size is None:
            if self._alignment is None:
                raise ValueError( "component has no src_size" )
            # _alignment is a weak reference; it yields None once the
            # alignment has been garbage collected
            alignment = self._alignment()
            if alignment is None:
                raise ValueError( "component has no src_size (its alignment no longer exists)" )
            self._src_size = alignment.src_size( self.src )
        return self._src_size

    def set_src_size( self, src_size ):
        self._src_size = src_size
    src_size = property( fget=get_src_size, fset=set_src_size )

    def get_forward_strand_start( self ):
        if self.strand == '-':
            return self.src_size - self.end
        return self.start
    forward_strand_start = property( fget=get_forward_strand_start )

    def get_forward_strand_end( self ):
        if self.strand == '-':
            return self.src_size - self.start
        return self.end
    forward_strand_end = property( fget=get_forward_strand_end )

    def reverse_complement( self ):
        """
        Return a new component covering the same bases on the opposite
        strand, with the text complemented via DNA_COMP and reversed.
        """
        start = self.src_size - self.end
        if self.strand == "+":
            strand = "-"
        else:
            strand = "+"
        text = self.text.translate( DNA_COMP )[::-1]
        new = Component( self.src, start, self.size, strand, self._src_size, text )
        new._alignment = self._alignment
        return new

    def slice( self, start, end ):
        """
        Return a new component holding alignment columns start..end (end
        excluded), with start and size adjusted for the gaps skipped.
        """
        new = Component( src=self.src, start=self.start, strand=self.strand, src_size=self._src_size )
        new._alignment = self._alignment
        new.text = self.text[start:end]

        # Only non-gap characters advance the sequence coordinate.
        new.start += start - self.text.count( '-', 0, start )
        new.size = len( new.text ) - new.text.count( '-' )

        # FIXME: This annotation probably means nothing after slicing if
        # one of the ends changes. In general the 'i' rows of a MAF only
        # make sense in context (relative to the previous and next alignments
        # in a stream, slicing breaks that).
        new.synteny_left = self.synteny_left
        new.synteny_right = self.synteny_right

        return new

    def slice_by_coord( self, start, end ):
        """
        Return the slice of the component corresponding to a coordinate
        interval.

        start and end are relative to the + strand, regardless of the
        component's strand.
        """
        start_col = self.coord_to_col( start )
        end_col = self.coord_to_col( end )
        if self.strand == '-':
            start_col, end_col = end_col, start_col
        return self.slice( start_col, end_col )

    def coord_to_col( self, pos ):
        """
        Return the alignment column index corresponding to coordinate pos.

        pos is relative to the + strand, regardless of the component's strand.

        Raises ValueError if pos lies outside the component's interval and
        IndexError if the text holds fewer bases than the interval claims.
        """
        start, end = self.get_forward_strand_start(), self.get_forward_strand_end()
        if pos < start or pos > end:
            raise ValueError( "Range error: %d not in %d-%d" % ( pos, start, end ) )
        if not self.index:
            text = self.text
            if self.strand == '-':
                # nota bene: for - strand self.index[x] maps to one column
                # higher than is actually associated with the position; thus
                # when slice_by_component() and slice_by_coord() flip the ends,
                # the resulting slice is correct
                index = [ x + 1 for x in range( len( text ) - 1, -1, -1 ) if text[x] != '-' ]
                index.append( 0 )
            else:
                index = [ x for x in range( len( text ) ) if text[x] != '-' ]
                index.append( len( text ) )
            self.index = index
        try:
            return self.index[ pos - start ]
        except IndexError:
            raise IndexError( "Error in index." )

    def __eq__( self, other ):
        if other is None or type( other ) != type( self ):
            return False
        return ( self.src == other.src
                 and self.start == other.start
                 and self.size == other.size
                 and self.strand == other.strand
                 and self._src_size == other._src_size
                 and self.text == other.text
                 and self.synteny_left == other.synteny_left
                 and self.synteny_right == other.synteny_right
                 and self.synteny_empty == other.synteny_empty
                 and self.empty == other.empty )

    def __ne__( self, other ):
        return not( self.__eq__( other ) )

    def __deepcopy__( self, memo ):
        """
        Return a copy that still refers to the same alignment but owns its
        coordinate index.
        """
        new = Component( src=self.src, start=self.start, size=self.size, strand=self.strand,
                         src_size=self._src_size, text=self.text )
        new._alignment = self._alignment
        new.quality = self.quality
        new.synteny_left = self.synteny_left
        new.synteny_right = self.synteny_right
        new.synteny_empty = self.synteny_empty
        new.empty = self.empty
        # copy the list so the two components never share a mutable index
        if self.index is None:
            new.index = None
        else:
            new.index = list( self.index )
        return new
370
def get_reader( format, infile, species_to_lengths=None ):
    """
    Return an alignment reader for `infile` in the given format
    ("maf", "axt" or "lav").

    Only the module for the requested format is imported.
    Raises ValueError for any other format name.
    """
    if format == "maf":
        import bx.align.maf
        return bx.align.maf.Reader( infile, species_to_lengths )
    elif format == "axt":
        import bx.align.axt
        return bx.align.axt.Reader( infile, species_to_lengths )
    elif format == "lav":
        import bx.align.lav
        # the LAV reader is not given species_to_lengths
        return bx.align.lav.Reader( infile )
    else:
        raise ValueError( "Unknown alignment format %s" % format )
377
def get_writer( format, outfile, attributes=None ):
    """
    Return an alignment writer for `outfile` in the given format
    ("maf", "axt" or "lav").

    `attributes` is handed to the writer; when omitted a new empty dict is
    used for each call. Only the module for the requested format is imported.
    Raises ValueError for any other format name.
    """
    if attributes is None:
        # a `{}` default in the signature would be shared by every writer
        attributes = {}
    if format == "maf":
        import bx.align.maf
        return bx.align.maf.Writer( outfile, attributes )
    elif format == "axt":
        import bx.align.axt
        return bx.align.axt.Writer( outfile, attributes )
    elif format == "lav":
        import bx.align.lav
        return bx.align.lav.Writer( outfile, attributes )
    else:
        raise ValueError( "Unknown alignment format %s" % format )
384
def get_indexed( format, filename, index_filename=None, keep_open=False, species_to_lengths=None ):
    """
    Return an indexed accessor for the alignment file `filename` in the given
    format ("maf" or "axt").

    Only the module for the requested format is imported.
    Raises NotImplementedError for "lav" and ValueError for any other format
    name.
    """
    if format == "maf":
        import bx.align.maf
        return bx.align.maf.Indexed( filename, index_filename, keep_open, species_to_lengths )
    elif format == "axt":
        import bx.align.axt
        return bx.align.axt.Indexed( filename, index_filename, keep_open, species_to_lengths )
    elif format == "lav":
        raise NotImplementedError( "LAV support for Indexed has not been implemented" )
    else:
        raise ValueError( "Unknown alignment format %s" % format )
391
def shuffle_columns( a ):
    """
    Randomize the columns of an alignment.

    The same random permutation is applied to the text of every component,
    IN PLACE, so that columns stay intact.
    """
    # random.shuffle needs a mutable sequence; range() is not one on Python 3
    mask = list( range( a.text_size ) )
    random.shuffle( mask )
    for c in a.components:
        c.text = ''.join( [ c.text[i] for i in mask ] )
        # any cached coordinate -> column index describes the old column order
        c.index = None
398
def src_split( src ):
    """
    Split src into (species, chrom) at its last '.'; species is None when
    src contains no '.'.
    """
    species, dot, chrom = src.rpartition( "." )
    if not dot:
        return None, src
    return species, chrom
403
def src_merge( species,chrom,contig=None ):
    """
    Build a src string "species.chrom" (just chrom when species is None),
    followed by "[contig]" when a contig is given.
    """
    parts = []
    if species is not None:
        parts.append( species + "." )
    parts.append( chrom )
    if contig is not None:
        parts.append( "[%s]" % contig )
    return "".join( parts )
409
410	# ---- Read C extension if available ---------------------------------------
411
try:
    from _core import coord_to_col
except ImportError:
    # Pure Python fallback, used only when the C extension cannot be imported;
    # any other kind of failure is allowed to surface.
    def coord_to_col( start, text, pos ):
        """
        Walk `text` from its first column, advancing the coordinate `start`
        by one for every non-gap character, and return the number of columns
        consumed once the coordinate reaches `pos`.

        Raises IndexError if `text` ends before `pos` is reached.
        """
        col = 0
        while start < pos:
            if text[col] != '-':
                start += 1
            col += 1
        return col

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/align/core.py

異なるフォーマットでダウンロード: