[2] | 1 | #!/usr/bin/env python
|
---|
| 2 | """
|
---|
| 3 | Provides wrappers and utilities for working with MAF files and alignments.
|
---|
| 4 | """
|
---|
| 5 | #Dan Blankenberg
|
---|
| 6 | import pkg_resources; pkg_resources.require( "bx-python" )
|
---|
| 7 | import bx.align.maf
|
---|
| 8 | import bx.intervals
|
---|
| 9 | import bx.interval_index_file
|
---|
| 10 | import sys, os, string, tempfile |
---|
| 11 | import logging |
---|
| 12 | from copy import deepcopy
|
---|
| 13 |
|
---|
#This module targets Python 2 (2.4+): it relies on py2-only constructs
#below (print >>, string.maketrans, sys.maxint, except-comma syntax).
assert sys.version_info[:2] >= ( 2, 4 )

#module-level logger, used for MAF index build diagnostics
log = logging.getLogger(__name__)


#characters that represent a gap position in an alignment text
GAP_CHARS = [ '-' ]
#separator between species and chromosome in a MAF src field, e.g. "hg18.chr1"
SRC_SPLIT_CHAR = '.'
---|
| 21 | |
---|
def src_split( src ):
    """Split a MAF src field into ( species, chromosome ).

    When no separator is present, the single token serves as both the
    species and the chromosome name.
    """
    parts = src.split( SRC_SPLIT_CHAR, 1 )
    spec = parts[0]
    if len( parts ) == 2:
        chrom = parts[1]
    else:
        chrom = spec
    return spec, chrom
|
---|
| 30 | |
---|
def src_merge( spec, chrom, contig = None ):
    """Merge species/chrom (and optional contig) back into a MAF src string.

    If either spec or chrom is None, the non-None value (or None) is used
    for both before delegating to bx.
    """
    if spec is None or chrom is None:
        spec = chrom = spec or chrom
    return bx.align.maf.src_merge( spec, chrom, contig )
---|
| 35 | |
---|
def get_species_in_block( block ):
    """Return the unique species names in a block, in order of first appearance."""
    seen = []
    for component in block.components:
        spec = src_split( component.src )[0]
        if spec not in seen:
            seen.append( spec )
    return seen
---|
| 43 | |
---|
def tool_fail( msg = "Unknown Error" ):
    """Write a fatal error message to stderr and terminate via sys.exit()."""
    sys.stderr.write( "Fatal Error: %s\n" % msg )
    sys.exit()
---|
| 47 | |
---|
| 48 | #an object corresponding to a reference layered alignment
|
---|
class RegionAlignment( object ):
    """Layered alignment over a fixed-size reference region.

    Each species' sequence is held in its own temporary file, initialized
    to all gaps ('-') and overwritten in place as alignment data is layered
    on via set_range()/set_position().
    """

    DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" )
    MAX_SEQUENCE_SIZE = sys.maxint #Maximum length of sequence allowed

    def __init__( self, size, species = None ):
        #None (not []) as default avoids the shared-mutable-default pitfall
        if species is None:
            species = []
        assert size <= self.MAX_SEQUENCE_SIZE, "Maximum length allowed for an individual sequence has been exceeded (%i > %i)." % ( size, self.MAX_SEQUENCE_SIZE )
        self.size = size
        self.sequences = {}
        if not isinstance( species, list ):
            species = [species]
        for spec in species:
            self.add_species( spec )

    #add a species to the alignment
    def add_species( self, species ):
        #make temporary sequence file, pre-filled with all-gap text
        self.sequences[species] = tempfile.TemporaryFile()
        self.sequences[species].write( "-" * self.size )

    #returns the names for species found in alignment, skipping names as requested
    def get_species_names( self, skip = None ):
        if skip is None:
            skip = []
        elif not isinstance( skip, list ):
            skip = [skip]
        names = self.sequences.keys()
        for name in skip:
            try:
                names.remove( name )
            except ValueError: #name not present; nothing to skip
                pass
        return names

    #returns the sequence for a species
    def get_sequence( self, species ):
        self.sequences[species].seek( 0 )
        return self.sequences[species].read()

    #returns the reverse complement of the sequence for a species
    def get_sequence_reverse_complement( self, species ):
        complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )]
        complement.reverse()
        return "".join( complement )

    #sets a position for a species
    def set_position( self, index, species, base ):
        if len( base ) != 1: raise Exception( "A genomic position can only have a length of 1." )
        return self.set_range( index, species, base )

    #sets a range for a species; unknown species are added on first write
    def set_range( self, index, species, bases ):
        if index >= self.size or index < 0: raise Exception( "Your index (%i) is out of range (0 - %i)." % ( index, self.size - 1 ) )
        if len( bases ) == 0: raise Exception( "A set of genomic positions can only have a positive length." )
        if species not in self.sequences.keys(): self.add_species( species )
        self.sequences[species].seek( index )
        self.sequences[species].write( bases )

    #Flush temp file of specified species, or all species
    def flush( self, species = None ):
        if species is None:
            species = self.sequences.keys()
        elif not isinstance( species, list ):
            species = [species]
        for spec in species:
            self.sequences[spec].flush()
|
---|
| 109 |
|
---|
class GenomicRegionAlignment( RegionAlignment ):
    """RegionAlignment anchored to absolute genomic coordinates [start, end)."""

    def __init__( self, start, end, species = None ):
        #None (not []) as default avoids the shared-mutable-default pitfall
        if species is None:
            species = []
        RegionAlignment.__init__( self, end - start, species )
        self.start = start
        self.end = end
|
---|
| 116 |
|
---|
class SplicedAlignment( object ):
    """Alignment over a spliced (multi-exon) region.

    Holds one GenomicRegionAlignment per exon; the sequence accessors
    concatenate exon sequences, padding missing species with gaps.
    """

    DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" )

    def __init__( self, exon_starts, exon_ends, species = None ):
        #None (not []) as default avoids the shared-mutable-default pitfall
        if species is None:
            species = []
        if not isinstance( exon_starts, list ):
            exon_starts = [exon_starts]
        if not isinstance( exon_ends, list ):
            exon_ends = [exon_ends]
        assert len( exon_starts ) == len( exon_ends ), "The number of starts does not match the number of sizes."
        self.exons = []
        for i in range( len( exon_starts ) ):
            self.exons.append( GenomicRegionAlignment( exon_starts[i], exon_ends[i], species ) )

    #returns the names for species found in alignment, skipping names as requested
    def get_species_names( self, skip = None ):
        if skip is None:
            skip = []
        elif not isinstance( skip, list ):
            skip = [skip]
        names = []
        for exon in self.exons:
            for name in exon.get_species_names( skip = skip ):
                if name not in names:
                    names.append( name )
        return names

    #returns the sequence for a species, concatenated across all exons
    def get_sequence( self, species ):
        sequence = tempfile.TemporaryFile()
        for exon in self.exons:
            if species in exon.get_species_names():
                sequence.write( exon.get_sequence( species ) )
            else:
                #species absent from this exon: pad with gaps
                sequence.write( "-" * exon.size )
        sequence.seek( 0 )
        return sequence.read()

    #returns the reverse complement of the sequence for a species
    def get_sequence_reverse_complement( self, species ):
        complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )]
        complement.reverse()
        return "".join( complement )

    #Start and end of coding region
    @property
    def start( self ):
        return self.exons[0].start
    @property
    def end( self ):
        return self.exons[-1].end
|
---|
| 165 |
|
---|
| 166 | #Open a MAF index using a UID
|
---|
| 167 | def maf_index_by_uid( maf_uid, index_location_file ):
|
---|
| 168 | for line in open( index_location_file ):
|
---|
| 169 | try:
|
---|
| 170 | #read each line, if not enough fields, go to next line
|
---|
| 171 | if line[0:1] == "#" : continue
|
---|
| 172 | fields = line.split('\t')
|
---|
| 173 | if maf_uid == fields[1]:
|
---|
| 174 | try:
|
---|
| 175 | maf_files = fields[4].replace( "\n", "" ).replace( "\r", "" ).split( "," )
|
---|
| 176 | return bx.align.maf.MultiIndexed( maf_files, keep_open = True, parse_e_rows = False )
|
---|
| 177 | except Exception, e:
|
---|
| 178 | raise Exception( 'MAF UID (%s) found, but configuration appears to be malformed: %s' % ( maf_uid, e ) )
|
---|
| 179 | except:
|
---|
| 180 | pass
|
---|
| 181 | return None
|
---|
| 182 | |
---|
| 183 | #return ( index, temp_index_filename ) for user maf, if available, or build one and return it, return None when no tempfile is created |
---|
def open_or_build_maf_index( maf_file, index_filename, species = None ):
    """Open an existing MAF index as ( index, None ), or build a temporary one.

    Falls back to build_maf_index() when the pre-built index cannot be
    opened; that path returns ( index, temp_index_filename ).
    """
    try:
        index = bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False )
    except:
        #no usable pre-built index; build a temporary one instead
        return build_maf_index( maf_file, species = species )
    return ( index, None )
---|
| 189 | |
---|
| 190 | #*** ANYCHANGE TO THIS METHOD HERE OR IN galaxy.datatypes.sequences MUST BE PROPAGATED *** |
---|
| 191 | def build_maf_index_species_chromosomes( filename, index_species = None ): |
---|
| 192 | species = [] |
---|
| 193 | species_chromosomes = {} |
---|
| 194 | indexes = bx.interval_index_file.Indexes()
|
---|
| 195 | blocks = 0 |
---|
| 196 | try: |
---|
| 197 | maf_reader = bx.align.maf.Reader( open( filename ) ) |
---|
| 198 | while True: |
---|
| 199 | pos = maf_reader.file.tell() |
---|
| 200 | block = maf_reader.next() |
---|
| 201 | if block is None:
|
---|
| 202 | break
|
---|
| 203 | blocks += 1 |
---|
| 204 | for c in block.components: |
---|
| 205 | spec = c.src |
---|
| 206 | chrom = None |
---|
| 207 | if "." in spec: |
---|
| 208 | spec, chrom = spec.split( ".", 1 ) |
---|
| 209 | if spec not in species: |
---|
| 210 | species.append( spec ) |
---|
| 211 | species_chromosomes[spec] = [] |
---|
| 212 | if chrom and chrom not in species_chromosomes[spec]: |
---|
| 213 | species_chromosomes[spec].append( chrom ) |
---|
| 214 | if index_species is None or spec in index_species: |
---|
| 215 | forward_strand_start = c.forward_strand_start |
---|
| 216 | forward_strand_end = c.forward_strand_end |
---|
| 217 | try: |
---|
| 218 | forward_strand_start = int( forward_strand_start ) |
---|
| 219 | forward_strand_end = int( forward_strand_end ) |
---|
| 220 | except ValueError: |
---|
| 221 | continue #start and end are not integers, can't add component to index, goto next component |
---|
| 222 | #this likely only occurs when parse_e_rows is True? |
---|
| 223 | #could a species exist as only e rows? should the |
---|
| 224 | if forward_strand_end > forward_strand_start: |
---|
| 225 | #require positive length; i.e. certain lines have start = end = 0 and cannot be indexed |
---|
| 226 | indexes.add( c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size ) |
---|
| 227 | except Exception, e: |
---|
| 228 | #most likely a bad MAF |
---|
| 229 | log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) ) |
---|
| 230 | return ( None, [], {} ) |
---|
| 231 | return ( indexes, species, species_chromosomes, blocks ) |
---|
| 232 |
|
---|
| 233 | #builds and returns ( index, index_filename ) for specified maf_file
|
---|
def build_maf_index( maf_file, species = None ):
    """Build a temporary index for maf_file.

    Returns ( bx Indexed object, temp_index_filename ); the caller is
    responsible for removing the temp file.  Returns ( None, None ) when
    scanning the MAF fails.
    """
    #index the first element only: the failure path of
    #build_maf_index_species_chromosomes historically returned a shorter
    #tuple, so avoid a fixed-arity unpack here
    scan_result = build_maf_index_species_chromosomes( maf_file, species )
    indexes = scan_result[0]
    if indexes is None:
        return ( None, None )
    fd, index_filename = tempfile.mkstemp()
    out = os.fdopen( fd, 'w' )
    try:
        indexes.write( out )
    finally:
        out.close() #close the descriptor even if writing the index fails
    maf_index = bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False )
    return ( maf_index, index_filename )
|
---|
| 243 | |
---|
def component_overlaps_region( c, region ):
    """True when component c covers at least one position of region.

    Uses the component's forward-strand coordinates; a None component
    never overlaps.
    """
    if c is None:
        return False
    comp_start = c.get_forward_strand_start()
    comp_end = c.get_forward_strand_end()
    #half-open intervals: disjoint when one ends before the other begins
    return not ( region.start >= comp_end or region.end <= comp_start )
---|
| 250 | |
---|
def chop_block_by_region( block, src, region, species = None, mincols = 0 ):
    """Slice a block down to the alignment columns overlapping region on src.

    Returns the chopped block (optionally limited to the given species list,
    with all-gap columns removed), or None when no slice longer than mincols
    overlaps the region.
    """
    # This chopping method was designed to maintain consistency with how start/end padding gaps have been working in Galaxy thus far:
    # behavior as seen when forcing blocks to be '+' relative to src sequence (ref) and using block.slice_by_component( ref, slice_start, slice_end )
    # whether-or-not this is the 'correct' behavior is questionable, but this will at least maintain consistency
    # comments welcome
    slice_start = block.text_size #max for the min()
    slice_end = 0 #min for the max()
    old_score = block.score #save old score for later use
    # We no longer assume only one occurance of src per block, so we need to check them all
    for c in iter_components_by_src( block, src ):
        if component_overlaps_region( c, region ):
            if c.text is not None:
                rev_strand = False
                if c.strand == "-":
                    #We want our coord_to_col coordinates to be returned from positive stranded component
                    rev_strand = True
                    c = c.reverse_complement()
                start = max( region.start, c.start )
                end = min( region.end, c.end )
                start = c.coord_to_col( start )
                end = c.coord_to_col( end )
                if rev_strand:
                    #need to orient slice coordinates to the original block direction
                    slice_len = end - start
                    end = len( c.text ) - start
                    start = end - slice_len
                #grow the slice window to cover every overlapping src component
                slice_start = min( start, slice_start )
                slice_end = max( end, slice_end )

    if slice_start < slice_end:
        block = block.slice( slice_start, slice_end )
        if block.text_size > mincols:
            # restore old score, may not be accurate, but it is better than 0 for everything?
            block.score = old_score
            if species is not None:
                block = block.limit_to_species( species )
            block.remove_all_gap_columns()
            return block
    return None
---|
| 290 |
|
---|
def orient_block_by_region( block, src, region, force_strand = None ):
    """Orient block so that src matches region.strand (or force_strand).

    Collects the strands of all components for src that overlap region; if
    none of them already matches the desired strand, the whole block is
    reverse-complemented.
    """
    #loop through components matching src,
    #make sure each of these components overlap region
    #cache strand for each of overlaping regions
    #if force_strand / region.strand not in strand cache, reverse complement
    ### we could have 2 sequences with same src, overlapping region, on different strands, this would cause no reverse_complementing
    strands = [ c.strand for c in iter_components_by_src( block, src ) if component_overlaps_region( c, region ) ]
    #parenthesized so an empty strand cache never triggers a flip: the
    #original 'strands and A or B' parsed as '(strands and A) or B', which
    #reversed blocks with no overlapping components whenever force_strand
    #was set and absent from the (empty) cache
    if strands and ( ( force_strand is None and region.strand not in strands ) or ( force_strand is not None and force_strand not in strands ) ):
        block = block.reverse_complement()
    return block
---|
| 301 | |
---|
def get_oriented_chopped_blocks_for_region( index, src, region, species = None, mincols = 0, force_strand = None ):
    """Yield oriented, chopped blocks for region, discarding index/offset info."""
    for result in get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols, force_strand ):
        yield result[0]
---|
def get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0, force_strand = None ):
    """Yield ( oriented_block, index, offset ) for blocks overlapping region."""
    for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ):
        oriented = orient_block_by_region( block, src, region, force_strand )
        yield oriented, idx, offset
---|
| 308 | |
---|
| 309 | #split a block with multiple occurances of src into one block per src |
---|
def iter_blocks_split_by_src( block, src ):
    """Yield one new block per component whose src matches exactly.

    Each yielded block keeps every non-src component plus exactly one src
    component; components are deep-copied so the originals keep their
    reference to the source alignment.
    """
    for chosen in iter_components_by_src( block, src ):
        split_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) )
        split_block.text_size = block.text_size
        for component in block.components:
            #keep the chosen src component and everything that is not src
            if component == chosen or component.src != src:
                split_block.add_component( deepcopy( component ) ) #components have reference to alignment, dont want to loose reference to original alignment block in original components
        yield split_block
---|
| 318 | |
---|
| 319 | #split a block into multiple blocks with all combinations of a species appearing only once per block |
---|
def iter_blocks_split_by_species( block, species = None ):
    """Yield blocks covering every combination of one-component-per-species.

    When species is given, only those species are considered (matched by
    src prefix); otherwise every species present in the block is used.
    The original component order of the input block is restored in each
    yielded block.
    """
    def __split_components_by_species( components_by_species, new_block ):
        #recursive generator: peel off one species' component list and
        #branch once per component in it
        if components_by_species:
            #more species with components to add to this block
            components_by_species = deepcopy( components_by_species )
            spec_comps = components_by_species.pop( 0 )
            for c in spec_comps:
                newer_block = deepcopy( new_block )
                newer_block.add_component( deepcopy( c ) )
                for value in __split_components_by_species( components_by_species, newer_block ):
                    yield value
        else:
            #no more components to add, yield this block
            yield new_block

    #divide components by species
    spec_dict = {}
    if not species:
        #no species list supplied: discover species from the block itself
        species = []
        for c in block.components:
            spec, chrom = src_split( c.src )
            if spec not in spec_dict:
                spec_dict[ spec ] = []
                species.append( spec )
            spec_dict[ spec ].append( c )
    else:
        #restrict to the requested species, matching src by prefix
        for spec in species:
            spec_dict[ spec ] = []
            for c in iter_components_by_src_start( block, spec ):
                spec_dict[ spec ].append( c )

    empty_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) ) #should we copy attributes?
    empty_block.text_size = block.text_size
    #call recursive function to split into each combo of spec/blocks
    for value in __split_components_by_species( spec_dict.values(), empty_block ):
        sort_block_components_by_block( value, block ) #restore original component order
        yield value
---|
| 357 | |
---|
| 358 | |
---|
| 359 | #generator yielding only chopped and valid blocks for a specified region
|
---|
def get_chopped_blocks_for_region( index, src, region, species = None, mincols = 0 ):
    """Yield chopped blocks for region, discarding index/offset information."""
    for result in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ):
        yield result[0]
|
---|
def get_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0 ):
    """Yield ( chopped_block, index, offset ) for region; empty chops are skipped."""
    for raw_block, idx, offset in index.get_as_iterator_with_index_and_offset( src, region.start, region.end ):
        chopped = chop_block_by_region( raw_block, src, region, species, mincols )
        if chopped is not None:
            yield chopped, idx, offset
|
---|
| 368 |
|
---|
| 369 | #returns a filled region alignment for specified regions
|
---|
def get_region_alignment( index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
    """Create a RegionAlignment for [start, end) and fill it from the index."""
    if species is None:
        #no explicit species list: track only the primary species
        alignment = RegionAlignment( end - start, primary_species )
    else:
        alignment = RegionAlignment( end - start, species )
    return fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand, species, mincols, overwrite_with_gaps )
|
---|
| 374 |
|
---|
| 375 | #reduces a block to only positions exisiting in the src provided
|
---|
def reduce_block_by_primary_genome( block, species, chromosome, region_start ):
    """Return ( start_offset, {species: text} ) reduced to the primary genome.

    All alignment columns that are gaps in the primary species are removed
    from every species' text; start_offset is the primary component's start
    relative to region_start.
    """
    src = "%s.%s" % ( species, chromosome )
    ref = block.get_component_by_src( src )
    start_offset = ref.start - region_start
    #collect each component's text as a mutable list, keyed by species name
    species_texts = {}
    for component in block.components:
        species_texts[ component.src.split( '.' )[0] ] = list( component.text )
    #remove locations which are gaps in the primary species, starting from the downstream end
    primary_text = species_texts[ species ]
    for i in range( len( primary_text ) - 1, -1, -1 ):
        if primary_text[i] == '-':
            for text in species_texts.values():
                text.pop( i )
    for spec in species_texts:
        species_texts[spec] = ''.join( species_texts[spec] )
    return ( start_offset, species_texts )
|
---|
| 393 |
|
---|
| 394 | #fills a region alignment
|
---|
def fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
    """Layer all indexed blocks overlapping [start, end) into alignment.

    Blocks are applied in increasing score order, so higher-scoring blocks
    overwrite lower-scoring ones.  When overwrite_with_gaps is False, gap
    characters from a block never overwrite previously written bases.
    Returns the (mutated) alignment.
    """
    region = bx.intervals.Interval( start, end )
    region.chrom = chrom
    region.strand = strand
    primary_src = "%s.%s" % ( primary_species, chrom )

    #Order blocks overlaping this position by score, lowest first
    blocks = []
    for block, idx, offset in index.get_as_iterator_with_index_and_offset( primary_src, start, end ):
        score = float( block.score )
        #insertion sort on score; only ( score, idx, offset ) is kept so the
        #block can be re-read from the index when it is actually applied
        for i in range( 0, len( blocks ) ):
            if score < blocks[i][0]:
                blocks.insert( i, ( score, idx, offset ) )
                break
        else:
            blocks.append( ( score, idx, offset ) )

    #gap_chars_tuple = tuple( GAP_CHARS )
    gap_chars_str = ''.join( GAP_CHARS )
    #Loop through ordered blocks and layer by increasing score
    for block_dict in blocks:
        for block in iter_blocks_split_by_species( block_dict[1].get_at_offset( block_dict[2] ) ): #need to handle each occurance of sequence in block seperately
            if component_overlaps_region( block.get_component_by_src( primary_src ), region ):
                block = chop_block_by_region( block, primary_src, region, species, mincols ) #chop block
                block = orient_block_by_region( block, primary_src, region ) #orient block
                start_offset, species_texts = reduce_block_by_primary_genome( block, primary_species, chrom, start )
                for spec, text in species_texts.items():
                    #we should trim gaps from both sides, since these are not positions in this species genome (sequence)
                    text = text.rstrip( gap_chars_str )
                    gap_offset = 0
                    #count and strip leading gaps so the write starts at the
                    #first real base of this species
                    while True in [ text.startswith( gap_char ) for gap_char in GAP_CHARS ]: #python2.4 doesn't accept a tuple for .startswith()
                        #while text.startswith( gap_chars_tuple ):
                        gap_offset += 1
                        text = text[1:]
                        if not text:
                            break
                    if text:
                        if overwrite_with_gaps:
                            alignment.set_range( start_offset + gap_offset, spec, text )
                        else:
                            #write base-by-base, skipping gaps so existing
                            #bases are preserved
                            for i, char in enumerate( text ):
                                if char not in GAP_CHARS:
                                    alignment.set_position( start_offset + gap_offset + i, spec, char )
    return alignment
|
---|
| 438 |
|
---|
| 439 | #returns a filled spliced region alignment for specified region with start and end lists
|
---|
def get_spliced_region_alignment( index, primary_species, chrom, starts, ends, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
    """Create a SplicedAlignment over the given exons and fill each exon from the index."""
    #create spliced alignment object
    if species is None:
        alignment = SplicedAlignment( starts, ends, [primary_species] )
    else:
        alignment = SplicedAlignment( starts, ends, species )
    for exon in alignment.exons:
        fill_region_alignment( exon, index, primary_species, chrom, exon.start, exon.end, strand, species, mincols, overwrite_with_gaps )
    return alignment
|
---|
| 447 |
|
---|
| 448 | #loop through string array, only return non-commented lines
|
---|
def line_enumerator( lines, comment_start = '#' ):
    """Yield ( count, line ) for each non-comment line.

    count is 1-based and advances only on yielded (non-comment) lines.
    """
    count = 0
    for line in lines:
        if line.startswith( comment_start ):
            continue
        count += 1
        yield ( count, line )
|
---|
| 455 |
|
---|
| 456 | #read a GeneBed file, return list of starts, ends, raw fields
|
---|
def get_starts_ends_fields_from_gene_bed( line ):
    """Parse one 12-column BED line into ( coding_exon_starts, coding_exon_ends, fields ).

    Exons are clipped to the coding region (thickStart/thickEnd); exons
    falling entirely outside it are dropped.  Raises Exception on lines
    with fewer than 12 columns.
    """
    fields = line.split()
    #Requires atleast 12 BED columns
    if len(fields) < 12:
        raise Exception( "Not a proper 12 column BED line (%s)." % line )
    #parse the fixed columns (chrom/tx_end/name retained for validation parity)
    chrom = fields[0]
    tx_start = int( fields[1] )
    tx_end = int( fields[2] )
    name = fields[3]
    strand = fields[5]
    if strand != '-':
        strand = '+' #Default strand is +
    cds_start = int( fields[6] )
    cds_end = int( fields[7] )

    #absolute exon starts: blockStarts (field 11) are relative to txStart
    exon_starts = [ int( x ) + tx_start for x in fields[11].rstrip( ',\n' ).split( ',' ) ]
    #absolute exon ends: start + blockSize (field 10)
    exon_sizes = [ int( x ) for x in fields[10].rstrip( ',' ).split( ',' ) ]
    exon_ends = [ s + size for s, size in zip( exon_starts, exon_sizes ) ]

    #clip each exon to the coding region, keeping only non-empty results
    starts = []
    ends = []
    for exon_start, exon_end in zip( exon_starts, exon_ends ):
        clipped_start = max( exon_start, cds_start )
        clipped_end = min( exon_end, cds_end )
        if clipped_start < clipped_end:
            starts.append( clipped_start )
            ends.append( clipped_end )
    return ( starts, ends, fields )
|
---|
| 488 |
|
---|
def iter_components_by_src( block, src ):
    """Yield every component of block whose src matches exactly."""
    for component in block.components:
        if component.src != src:
            continue
        yield component
|
---|
| 493 |
|
---|
def get_components_by_src( block, src ):
    """Return the list of components of block whose src matches exactly."""
    return list( iter_components_by_src( block, src ) )
|
---|
| 496 |
|
---|
def iter_components_by_src_start( block, src ):
    """Yield every component of block whose src begins with the given prefix."""
    for component in block.components:
        if not component.src.startswith( src ):
            continue
        yield component
|
---|
| 501 |
|
---|
def get_components_by_src_start( block, src ):
    """Return the list of components of block whose src begins with the given prefix."""
    return list( iter_components_by_src_start( block, src ) )
|
---|
| 504 | |
---|
def sort_block_components_by_block( block1, block2 ):
    """Reorder block1's components in place to match their order in block2.

    block1's components must be a subset of block2's.  Returns None, as
    list.sort does.
    """
    #sorting by each component's index in block2 reproduces block2's order
    #(equivalent to the original cmp-based index-difference comparison)
    return block1.components.sort( key = block2.components.index )
---|
| 510 |
|
---|
def get_species_in_maf( maf_filename ):
    """Return the unique species names found across all blocks of a MAF file."""
    species = []
    maf_file = open( maf_filename )
    try:
        for block in bx.align.maf.Reader( maf_file ):
            for spec in get_species_in_block( block ):
                if spec not in species:
                    species.append( spec )
    finally:
        maf_file.close() #the original leaked this handle
    return species
|
---|
| 518 |
|
---|
def parse_species_option( species ):
    """Parse a comma-separated species option string.

    Returns the list of species names, or None when the option is empty,
    None, or contains the literal token 'None'.
    """
    if not species:
        return None #provided species was '' or None
    names = species.split( ',' )
    if 'None' in names:
        return None #species selection explicitly disabled
    return names
|
---|
| 525 |
|
---|
def remove_temp_index_file( index_filename ):
    """Best-effort removal of a temporary index file; all errors are ignored."""
    try:
        os.unlink( index_filename )
    except:
        #file may already be gone (or the name may be invalid); ignore either way
        pass
|
---|
| 529 |
|
---|
| 530 | #Below are methods to deal with FASTA files
|
---|
| 531 |
|
---|
def get_fasta_header( component, attributes = None, suffix = None ):
    """Build a FASTA header line for an alignment component.

    Format: >src(strand):start-end|key=value|...|suffix
    When suffix is not given, the component's species name is used.
    """
    if attributes is None:
        attributes = {} #avoid a shared mutable default dict
    header = ">%s(%s):%i-%i|" % ( component.src, component.strand, component.get_forward_strand_start(), component.get_forward_strand_end() )
    for key, value in attributes.items():
        header = "%s%s=%s|" % ( header, key, value )
    if suffix:
        header = "%s%s" % ( header, suffix )
    else:
        header = "%s%s" % ( header, src_split( component.src )[ 0 ] )
    return header
|
---|
| 541 |
|
---|
def get_attributes_from_fasta_header( header ):
    """Parse a header produced by get_fasta_header back into a dict.

    Recognized keys: 'species', 'chrom', 'strand', 'start', 'end' (from the
    leading region token), any key=value pairs from the middle fields, and
    '__suffix__' for the trailing field.  Returns {} for an empty header.
    """
    if not header: return {}
    attributes = {}
    header = header.lstrip( '>' )
    header = header.strip()
    fields = header.split( '|' )
    try:
        #leading field looks like "species.chrom(strand):start-end"
        region = fields[0]
        region = region.split( '(', 1 )
        temp = region[0].split( '.', 1 )
        attributes['species'] = temp[0]
        if len( temp ) == 2:
            attributes['chrom'] = temp[1]
        else:
            #no '.' separator: the single token is both species and chrom
            attributes['chrom'] = temp[0]
        region = region[1].split( ')', 1 )
        attributes['strand'] = region[0]
        region = region[1].lstrip( ':' ).split( '-' )
        attributes['start'] = int( region[0] )
        attributes['end'] = int( region[1] )
    except:
        #fields 0 is not a region coordinate
        pass
    if len( fields ) > 2:
        #middle fields are key=value attribute pairs; the last field is the suffix
        for i in xrange( 1, len( fields ) - 1 ):
            prop = fields[i].split( '=', 1 )
            if len( prop ) == 2:
                attributes[ prop[0] ] = prop[1]
    if len( fields ) > 1:
        attributes['__suffix__'] = fields[-1]
    return attributes
|
---|
| 573 |
|
---|
def iter_fasta_alignment( filename ):
    """Yield lists of fastaComponent objects, one list per alignment.

    Alignments in the FASTA file are separated by blank lines; each '>'
    header starts a new component whose species is parsed from the header.
    """
    class fastaComponent:
        #minimal component: a species name plus accumulated sequence text
        def __init__( self, species, text = "" ):
            self.species = species
            self.text = text
        def extend( self, text ):
            self.text = self.text + text.replace( '\n', '' ).replace( '\r', '' ).strip()
    #yields a list of fastaComponents for a FASTA file
    f = open( filename, 'rb' )
    components = []
    while True:
        line = f.readline()
        if not line:
            #EOF: emit any trailing alignment and release the handle
            #(the original never closed the file)
            if components:
                yield components
            f.close()
            return
        line = line.strip()
        if not line:
            #blank line terminates the current alignment
            if components:
                yield components
            components = []
        elif line.startswith( '>' ):
            attributes = get_attributes_from_fasta_header( line )
            components.append( fastaComponent( attributes['species'] ) )
        elif components:
            components[-1].extend( line )
|
---|
| 601 |
|
---|