Context Navigation

core.py @ 3

リビジョン 3, 16.6 KB (コミッタ: kohda, 14 年前)
Install Unix tools http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

Rev	行番号
[3]	1	"""
	2	Classes that represent alignments between multiple sequences.
	3	"""
	4
	5	import random
	6	import string
	7	import weakref
	8	from bx.misc.readlengths import read_lengths_file
	9
	10	# DNA reverse complement table
	11	## DNA_COMP = " - " \
	12	## " TVGH CD M KN YSA BWXR tvgh cd m kn ysa bwxr " \
	13	## " " \
	14	## " "
	15
	16	DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" )
	17
	18	class Alignment( object ):
	19
	20	def __init__( self, score=0, attributes={}, species_to_lengths=None ):
	21	# species_to_lengths is needed only for file formats that don't provide
	22	# chromosome lengths; it maps each species name to one of these:
	23	# - the name of a file that contains a list of chromosome length pairs
	24	# - a dict mapping chromosome names to their length
	25	# - a single length value (useful when we just have one sequence and no chromosomes)
	26	# internally a file name is replaced by a dict, but only on an "as
	27	# needed" basis
	28	self.score = score
	29	self.text_size = 0
	30	self.attributes = attributes
	31	if species_to_lengths == None: self.species_to_lengths = {}
	32	else: self.species_to_lengths = species_to_lengths
	33	self.components = []
	34
	35	def add_component( self, component ):
	36	component._alignment = weakref.ref( self )
	37	self.components.append( component )
	38	if component.text is not None:
	39	if self.text_size == 0:
	40	self.text_size = len( component.text )
	41	elif self.text_size != len( component.text ):
	42	raise Exception( "Components must have same text length" )
	43
	44	def get_score( self ):
	45	return self.__score
	46	def set_score( self,score ):
	47	if type( score ) == str:
	48	try:
	49	score = int(score)
	50	except:
	51	try:
	52	score = float(score)
	53	except:
	54	pass
	55	self.__score = score
	56	score = property( fget=get_score,fset=set_score )
	57
	58	def __str__( self ):
	59	s = "a score=" + str( self.score )
	60	for key in self.attributes:
	61	s += " %s=%s" % ( key, self.attributes[key] )
	62	s += "\n"
	63	# Components
	64	for c in self.components:
	65	s += str( c )
	66	s += "\n"
	67	return s
	68
	69	def src_size( self, src ):
	70	species,chrom = src_split( src )
	71	if species in self.species_to_lengths:
	72	chrom_to_length = self.species_to_lengths[species]
	73	elif chrom in self.species_to_lengths:
	74	chrom_to_length = self.species_to_lengths
	75	else:
	76	raise "no src_size (no length file for %s)" % species
	77	if type( chrom_to_length ) == int: # (if it's a single length)
	78	return chrom_to_length
	79	if type( chrom_to_length ) == type( "" ): # (if it's a file name)
	80	chrom_to_length = read_lengths_file( chrom_to_length )
	81	self.species_to_lengths[species] = chrom_to_length
	82	if chrom not in chrom_to_length: raise "no src_size (%s has no length for %s)" % ( species, chrom )
	83	return chrom_to_length[chrom]
	84
	85	def get_component_by_src( self, src ):
	86	for c in self.components:
	87	if c.src == src: return c
	88	return None
	89
	90	def get_component_by_src_start( self, src ):
	91	for c in self.components:
	92	if c.src.startswith( src ): return c
	93	return None
	94
	95	def slice( self, start, end ):
	96	new = Alignment( score=self.score, attributes=self.attributes )
	97	for component in self.components:
	98	# FIXME: Is this the right solution?
	99	if component.empty:
	100	continue
	101	new.components.append( component.slice( start, end ) )
	102	new.text_size = end - start
	103	return new
	104
	105	def reverse_complement( self ):
	106	new = Alignment( score=self.score, attributes=self.attributes )
	107	for component in self.components:
	108	new.components.append( component.reverse_complement() )
	109	new.text_size = self.text_size
	110	return new
	111
	112	def slice_by_component( self, component_index, start, end ):
	113	"""
	114	Return a slice of the alignment, corresponding to an coordinate interval in a specific component.
	115
	116	component_index is one of
	117	an integer offset into the components list
	118	a string indicating the src of the desired component
	119	a component
	120
	121	start and end are relative to the + strand, regardless of the component's strand.
	122
	123	"""
	124	if type( component_index ) == type( 0 ):
	125	ref = self.components[ component_index ]
	126	elif type( component_index ) == type( "" ):
	127	ref = self.get_component_by_src( component_index )
	128	elif type( component_index ) == Component:
	129	ref = component_index
	130	else:
	131	raise ValueError( "can't figure out what to do" )
	132	start_col = ref.coord_to_col( start )
	133	end_col = ref.coord_to_col( end )
	134	if (ref.strand == '-'):
	135	(start_col,end_col) = (end_col,start_col)
	136	return self.slice( start_col, end_col )
	137
	138	def column_iter( self ):
	139	for i in range( self.text_size ):
	140	yield [ c.text[i] for c in self.components ]
	141
	142	def limit_to_species( self, species ):
	143	new = Alignment( score=self.score, attributes=self.attributes )
	144	new.text_size = self.text_size
	145	for component in self.components:
	146	if component.src.split('.')[0] in species:
	147	new.add_component( component )
	148	return new
	149
	150	def remove_all_gap_columns( self ):
	151	"""
	152	Remove any columns containing only gaps from alignment components,
	153	text of components is modified IN PLACE.
	154	"""
	155	seqs = []
	156	for c in self.components:
	157	try:
	158	seqs.append( list( c.text ) )
	159	except TypeError:
	160	seqs.append( None )
	161	i = 0
	162	text_size = self.text_size
	163	while i < text_size:
	164	all_gap = True
	165	for seq in seqs:
	166	if seq is None: continue
	167	if seq[i] != '-': all_gap = False
	168	if all_gap:
	169	for seq in seqs:
	170	if seq is None: continue
	171	del seq[i]
	172	text_size -= 1
	173	else:
	174	i += 1
	175	for i in range( len( self.components ) ):
	176	if seqs[i] is None: continue
	177	self.components[i].text = ''.join( seqs[i] )
	178	self.text_size = text_size
	179
	180	def __eq__( self, other ):
	181	if other is None or type( other ) != type( self ):
	182	return False
	183	if self.score != other.score:
	184	return False
	185	if self.attributes != other.attributes:
	186	return False
	187	if len( self.components ) != len( other.components ):
	188	return False
	189	for c1, c2 in zip( self.components, other.components ):
	190	if c1 != c2:
	191	return False
	192	return True
	193
	194	def __ne__( self, other ):
	195	return not( self.__eq__( other ) )
	196
	197	def __deepcopy__( self, memo ):
	198	from copy import deepcopy
	199	new = Alignment( score=self.score, attributes=deepcopy( self.attributes ), species_to_lengths=deepcopy( self.species_to_lengths ) )
	200	for component in self.components:
	201	new.add_component( deepcopy( component ) )
	202	return new
	203
	204	class Component( object ):
	205
	206	def __init__( self, src='', start=0, size=0, strand=None, src_size=None, text='' ):
	207	self._alignment = None
	208	self.src = src
	209	self.start = start # Nota Bene: start,size,strand are as they
	210	self.size = size # .. appear in a MAF file-- origin-zero, end
	211	self.strand = strand # .. excluded, and minus strand counts from
	212	self._src_size = src_size # .. end of sequence
	213	self.text = text
	214	self.quality = None
	215	# Optional fields to keep track of synteny status (only makes sense
	216	# when the alignment is part of an ordered set)
	217	self.synteny_left = None
	218	self.synteny_right = None
	219	self.synteny_empty = None
	220	# If true, this component actually represents a non-aligning region,
	221	# and has no text.
	222	self.empty = False
	223	# Index maps a coordinate (distance along + strand from + start) to alignment column
	224	self.index = None
	225
	226	def __str__( self ):
	227	if self.empty:
	228	rval = "e %s %d %d %s %d %s" % ( self.src, self.start,
	229	self.size, self.strand,
	230	self.src_size, self.synteny_empty )
	231	else:
	232	rval = "s %s %d %d %s %d %s" % ( self.src, self.start,
	233	self.size, self.strand,
	234	self.src_size, self.text )
	235	if self.synteny_left and self.synteny_right:
	236	rval += "\ni %s %s %d %s %d" % ( self.src,
	237	self.synteny_left[0], self.synteny_left[1],
	238	self.synteny_right[0], self.synteny_right[1] )
	239	return rval
	240
	241	def get_end( self ):
	242	return self.start + self.size
	243	end = property( fget=get_end )
	244
	245	def get_src_size( self ):
	246	if self._src_size == None:
	247	if self._alignment == None: raise "component has no src_size"
	248	self._src_size = self._alignment().src_size( self.src )
	249	return self._src_size
	250	def set_src_size( self,src_size ):
	251	self._src_size = src_size
	252	src_size = property( fget=get_src_size, fset=set_src_size )
	253
	254	def get_forward_strand_start( self ):
	255	if self.strand == '-': return self.src_size - self.end
	256	else: return self.start
	257	forward_strand_start = property( fget=get_forward_strand_start )
	258
	259	def get_forward_strand_end( self ):
	260	if self.strand == '-': return self.src_size - self.start
	261	else: return self.end
	262	forward_strand_end = property( fget=get_forward_strand_end)
	263
	264	def reverse_complement( self ):
	265	start = self.src_size - self.end
	266	if self.strand == "+": strand = "-"
	267	else: strand = "+"
	268	comp = [ch for ch in self.text.translate(DNA_COMP)]
	269	comp.reverse()
	270	text = "".join(comp)
	271	new = Component( self.src, start, self.size, strand, self._src_size, text )
	272	new._alignment = self._alignment
	273	return new
	274
	275	def slice( self, start, end ):
	276	new = Component( src=self.src, start=self.start, strand=self.strand, src_size=self._src_size )
	277	new._alignment = self._alignment
	278	new.text = self.text[start:end]
	279
	280	#for i in range( 0, start ):
	281	# if self.text[i] != '-': new.start += 1
	282	#for c in new.text:
	283	# if c != '-': new.size += 1
	284	new.start += start - self.text.count( '-', 0, start )
	285	new.size = len( new.text ) - new.text.count( '-' )
	286
	287	# FIXME: This annotation probably means nothing after slicing if
	288	# one of the ends changes. In general the 'i' rows of a MAF only
	289	# make sense in context (relative to the previous and next alignments
	290	# in a stream, slicing breaks that).
	291	new.synteny_left = self.synteny_left
	292	new.synteny_right = self.synteny_right
	293
	294	return new
	295
	296	def slice_by_coord( self, start, end ):
	297	"""
	298	Return the slice of the component corresponding to a coordinate interval.
	299
	300	start and end are relative to the + strand, regardless of the component's strand.
	301
	302	"""
	303	start_col = self.coord_to_col( start )
	304	end_col = self.coord_to_col( end )
	305	if (self.strand == '-'):
	306	(start_col,end_col) = (end_col,start_col)
	307	return self.slice( start_col, end_col )
	308
	309	def coord_to_col( self, pos ):
	310	"""
	311	Return the alignment column index corresponding to coordinate pos.
	312
	313	pos is relative to the + strand, regardless of the component's strand.
	314
	315	"""
	316	start,end = self.get_forward_strand_start(),self.get_forward_strand_end()
	317	if pos < start or pos > end:
	318	raise "Range error: %d not in %d-%d" % ( pos, start, end )
	319	if not self.index:
	320	self.index = list()
	321	if (self.strand == '-'):
	322	# nota bene: for - strand self.index[x] maps to one column
	323	# higher than is actually associated with the position; thus
	324	# when slice_by_component() and slice_by_coord() flip the ends,
	325	# the resulting slice is correct
	326	for x in range( len(self.text)-1,-1,-1 ):
	327	if not self.text[x] == '-':
	328	self.index.append( x + 1 )
	329	self.index.append( 0 )
	330	else:
	331	for x in range( len(self.text) ):
	332	if not self.text[x] == '-':
	333	self.index.append(x)
	334	self.index.append( len(self.text) )
	335	x = None
	336	try:
	337	x = self.index[ pos - start ]
	338	except:
	339	raise "Error in index."
	340	return x
	341
	342
	343	def __eq__( self, other ):
	344	if other is None or type( other ) != type( self ):
	345	return False
	346	return ( self.src == other.src
	347	and self.start == other.start
	348	and self.size == other.size
	349	and self.strand == other.strand
	350	and self._src_size == other._src_size
	351	and self.text == other.text
	352	and self.synteny_left == other.synteny_left
	353	and self.synteny_right == other.synteny_right
	354	and self.synteny_empty == other.synteny_empty
	355	and self.empty == other.empty )
	356
	357	def __ne__( self, other ):
	358	return not( self.__eq__( other ) )
	359
	360	def __deepcopy__( self, memo ):
	361	new = Component( src=self.src, start=self.start, size=self.size, strand=self.strand, src_size=self._src_size, text=self.text )
	362	new._alignment = self._alignment
	363	new.quality = self.quality
	364	new.synteny_left = self.synteny_left
	365	new.synteny_right = self.synteny_right
	366	new.synteny_empty = self.synteny_empty
	367	new.empty = self.empty
	368	new.index = self.index
	369	return new
	370
	371	def get_reader( format, infile, species_to_lengths=None ):
	372	import bx.align.maf, bx.align.axt, bx.align.lav
	373	if format == "maf": return bx.align.maf.Reader( infile, species_to_lengths )
	374	elif format == "axt": return bx.align.axt.Reader( infile, species_to_lengths )
	375	elif format == "lav": return bx.align.lav.Reader( infile )
	376	else: raise "Unknown alignment format %s" % format
	377
	378	def get_writer( format, outfile, attributes={} ):
	379	import bx.align.maf, bx.align.axt, bx.align.lav
	380	if format == "maf": return bx.align.maf.Writer( outfile, attributes )
	381	elif format == "axt": return bx.align.axt.Writer( outfile, attributes )
	382	elif format == "lav": return bx.align.lav.Writer( outfile, attributes )
	383	else: raise "Unknown alignment format %s" % format
	384
	385	def get_indexed( format, filename, index_filename=None, keep_open=False, species_to_lengths=None ):
	386	import bx.align.maf, bx.align.axt, bx.align.lav
	387	if format == "maf": return bx.align.maf.Indexed( filename, index_filename, keep_open, species_to_lengths )
	388	elif format == "axt": return bx.align.axt.Indexed( filename, index_filename, keep_open, species_to_lengths )
	389	elif format == "lav": raise "LAV support for Indexed has not been implemented"
	390	else: raise "Unknown alignment format %s" % format
	391
	392	def shuffle_columns( a ):
	393	"""Randomize the columns of an alignment"""
	394	mask = range( a.text_size )
	395	random.shuffle( mask )
	396	for c in a.components:
	397	c.text = ''.join( [ c.text[i] for i in mask ] )
	398
	399	def src_split( src ): # splits src into species,chrom
	400	dot = src.rfind( "." )
	401	if dot == -1: return None,src
	402	else: return src[:dot],src[dot+1:]
	403
	404	def src_merge( species,chrom,contig=None ): # creates src (inverse of src_split)
	405	if species == None: src = chrom
	406	else: src = species + "." + chrom
	407	if contig != None: src += "[%s]" % contig
	408	return src
	409
	410	# ---- Read C extension if available ---------------------------------------
	411
	412	try:
	413	from _core import coord_to_col
	414	except:
	415	def coord_to_col( start, text, pos ):
	416	col = 0
	417	while start < pos:
	418	if text[col] != '-':
	419	start += 1
	420	col += 1
	421	return col

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/align/core.py @ 3

異なるフォーマットでダウンロード: