[2] | 1 | """ |
---|
| 2 | Sequence classes |
---|
| 3 | """ |
---|
| 4 | |
---|
| 5 | import data |
---|
| 6 | import logging |
---|
| 7 | import re |
---|
| 8 | import string |
---|
| 9 | from cgi import escape |
---|
| 10 | from galaxy.datatypes.metadata import MetadataElement |
---|
| 11 | from galaxy.datatypes import metadata |
---|
| 12 | import galaxy.model |
---|
| 13 | from galaxy import util |
---|
| 14 | from sniff import * |
---|
| 15 | |
---|
| 16 | log = logging.getLogger(__name__) |
---|
| 17 | |
---|
class Sequence( data.Text ):
    """Class describing a sequence"""

    # Add metadata elements
    MetadataElement( name="sequences", default=0, desc="Number of sequences", readonly=True, visible=False, optional=True, no_value=0 )

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of sequences and the number of data lines in dataset.

        Comment lines ( starting with '#' ) are excluded from both counts;
        every other line, including blank lines, counts as a data line, and
        lines starting with '>' additionally count as sequence headers.
        """
        data_lines = 0
        sequences = 0
        # open() rather than the deprecated file() builtin; close explicitly
        # instead of leaving the handle to the garbage collector.
        in_file = open( dataset.file_name )
        try:
            for line in in_file:
                line = line.strip()
                if line and line.startswith( '#' ):
                    # We don't count comment lines for sequence data types
                    continue
                if line.startswith( '>' ):
                    sequences += 1
                # every non-comment line ( blank or not ) is a data line
                data_lines += 1
        finally:
            in_file.close()
        dataset.metadata.data_lines = data_lines
        dataset.metadata.sequences = sequences
    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set the dataset peek and a blurb of either the sequence count
        ( when metadata is available ) or the file size.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if dataset.metadata.sequences:
                dataset.blurb = "%s sequences" % util.commaify( str( dataset.metadata.sequences ) )
            else:
                dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
---|
| 52 | |
---|
class Alignment( data.Text ):
    """Class describing an alignment"""

    """Add metadata elements"""
    # Read-only, multi-valued list of species present in the alignment;
    # populated by subclasses ( e.g. Maf.set_meta sets dataset.metadata.species ).
    MetadataElement( name="species", desc="Species", default=[], param=metadata.SelectParameter, multiple=True, readonly=True, no_value=None )
---|
| 58 | |
---|
class Fasta( Sequence ):
    """Class representing a FASTA sequence"""
    file_ext = "fasta"

    def sniff( self, filename ):
        """
        Determines whether the file is in fasta format

        A sequence in FASTA format consists of a single-line description, followed by lines of sequence data.
        The first character of the description line is a greater-than (">") symbol in the first column.
        All lines should be shorter than 80 characters

        For complete details see http://www.ncbi.nlm.nih.gov/blast/fasta.shtml

        Rules for sniffing as True:
            We don't care about line length (other than empty lines).
            The first non-empty line must start with '>' and the Very Next line.strip() must have sequence data and not be a header.
            'sequence data' here is loosely defined as non-empty lines which do not start with '>'
            This will cause Color Space FASTA (csfasta) to be detected as True (they are, after all, still FASTA files - they have a header line followed by sequence data)
                Previously this method did some checking to determine if the sequence data had integers (presumably to differentiate between fasta and csfasta)
                This should be done through sniff order, where csfasta (currently has a null sniff function) is detected for first (stricter definition) followed sometime after by fasta
            We will only check that the first purported sequence is correctly formatted.

        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Fasta().sniff( fname )
        False
        >>> fname = get_test_fname( 'sequence.fasta' )
        >>> Fasta().sniff( fname )
        True
        """
        try:
            fh = open( filename )
            try:
                while True:
                    line = fh.readline()
                    if not line:
                        break #EOF
                    line = line.strip()
                    if line: #first non-empty line
                        if line.startswith( '>' ):
                            #The next line.strip() must not be '', nor startwith '>'
                            line = fh.readline().strip()
                            if line == '' or line.startswith( '>' ):
                                break
                            return True
                        else:
                            break #we found a non-empty line, but it's not a fasta header
            finally:
                # always release the handle; the original leaked it on the
                # early 'return True' path
                fh.close()
        except:
            # unreadable file: treat as "not fasta" rather than erroring
            pass
        return False
---|
| 110 | |
---|
class csFasta( Sequence ):
    """ Class representing the SOLID Color-Space sequence ( csfasta ) """
    file_ext = "csfasta"

    def sniff( self, filename ):
        """
        Color-space sequence:
            >2_15_85_F3
            T213021013012303002332212012112221222112212222

        >>> fname = get_test_fname( 'sequence.fasta' )
        >>> csFasta().sniff( fname )
        False
        >>> fname = get_test_fname( 'sequence.csfasta' )
        >>> csFasta().sniff( fname )
        True
        """
        try:
            fh = open( filename )
            try:
                while True:
                    line = fh.readline()
                    if not line:
                        break #EOF
                    line = line.strip()
                    if line and not line.startswith( '#' ): #first non-empty non-comment line
                        if line.startswith( '>' ):
                            line = fh.readline().strip()
                            if line == '' or line.startswith( '>' ):
                                break
                            elif line[0] not in string.ascii_uppercase:
                                # color-space reads start with an uppercase base
                                return False
                            elif len( line ) > 1 and not re.search( r'^[\d.]+$', line[1:] ):
                                # remainder of the read must be digits or '.'
                                return False
                            return True
                        else:
                            break #we found a non-empty line, but it's not a header
            finally:
                # always release the handle; the original leaked it on the
                # early 'return' paths
                fh.close()
        except:
            pass
        return False

    def set_meta( self, dataset, **kwd ):
        """
        Set sequence metadata via Sequence.set_meta, unless the file exceeds
        the optional-metadata size limit, in which case the counts are unset.
        """
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.data_lines = None
            dataset.metadata.sequences = None
            return
        return Sequence.set_meta( self, dataset, **kwd )
---|
| 158 | |
---|
class Fastq ( Sequence ):
    """Class representing a generic FASTQ sequence"""
    file_ext = "fastq"

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of sequences and the number of data lines
        in dataset.

        Sequences are counted as 4-line blocks whose first line starts
        with '@' ( a quality line may also start with '@', hence the
        block-length counter ).
        """
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            # file too large for optional metadata: leave counts unset
            dataset.metadata.data_lines = None
            dataset.metadata.sequences = None
            return
        data_lines = 0
        sequences = 0
        seq_counter = 0 # blocks should be 4 lines long
        # open() rather than the deprecated file() builtin; close explicitly.
        in_file = open( dataset.file_name )
        try:
            for line in in_file:
                line = line.strip()
                if line and line.startswith( '#' ) and not sequences:
                    # We don't count comment lines for sequence data types
                    continue
                if line and line.startswith( '@' ):
                    if seq_counter >= 4:
                        # count previous block
                        # blocks should be 4 lines long
                        sequences += 1
                        seq_counter = 1
                    else:
                        # in case quality line starts with @
                        seq_counter += 1
                    data_lines += 1
                else:
                    data_lines += 1
                    seq_counter += 1
        finally:
            in_file.close()
        if seq_counter >= 4:
            # count final block
            sequences += 1
        dataset.metadata.data_lines = data_lines
        dataset.metadata.sequences = sequences
    def sniff ( self, filename ):
        """
        Determines whether the file is in generic fastq format
        For details, see http://maq.sourceforge.net/fastq.shtml

        Note: There are three kinds of FASTQ files, known as "Sanger" (sometimes called "Standard"), Solexa, and Illumina
              These differ in the representation of the quality scores

        >>> fname = get_test_fname( '1.fastqsanger' )
        >>> Fastq().sniff( fname )
        True
        >>> fname = get_test_fname( '2.fastqsanger' )
        >>> Fastq().sniff( fname )
        True
        """
        headers = get_headers( filename, None )
        # Anchored pattern: the original "^[NGTAC]*" always succeeded with
        # re.match() because '*' permits a zero-length match, so the base
        # check was a no-op. IGNORECASE accepts lower-case bases as well.
        bases_regexp = re.compile( r"^[NGTAC]*$", re.IGNORECASE )
        # check that first block looks like a fastq block
        try:
            if len( headers ) >= 4 and headers[0][0] and headers[0][0][0] == "@" and headers[2][0] and headers[2][0][0] == "+" and headers[1][0]:
                # Check the sequence line, make sure it contains only G/C/A/T/N
                if not bases_regexp.match( headers[1][0] ):
                    return False
                return True
            return False
        except:
            return False
---|
| 225 | |
---|
class FastqSanger( Fastq ):
    """Class representing a FASTQ sequence ( the Sanger variant )"""
    # Differs from the generic Fastq type only by file extension; sniffing
    # and metadata handling are inherited unchanged from Fastq.
    file_ext = "fastqsanger"
---|
| 229 | |
---|
class FastqSolexa( Fastq ):
    """Class representing a FASTQ sequence ( the Solexa variant )"""
    # Differs from the generic Fastq type only by file extension; sniffing
    # and metadata handling are inherited unchanged from Fastq.
    file_ext = "fastqsolexa"
---|
| 233 | |
---|
class FastqIllumina( Fastq ):
    """Class representing a FASTQ sequence ( the Illumina 1.3+ variant )"""
    # Differs from the generic Fastq type only by file extension; sniffing
    # and metadata handling are inherited unchanged from Fastq.
    file_ext = "fastqillumina"
---|
| 237 | |
---|
class FastqCSSanger( Fastq ):
    """Class representing a Color Space FASTQ sequence ( e.g a SOLiD variant )"""
    # Differs from the generic Fastq type only by file extension; sniffing
    # and metadata handling are inherited unchanged from Fastq.
    file_ext = "fastqcssanger"
---|
| 241 | |
---|
| 242 | try: |
---|
| 243 | from galaxy import eggs |
---|
| 244 | import pkg_resources; pkg_resources.require( "bx-python" ) |
---|
| 245 | import bx.align.maf |
---|
| 246 | except: |
---|
| 247 | pass |
---|
| 248 | |
---|
| 249 | #trying to import maf_utilities here throws an ImportError due to a circular import between jobs and tools: |
---|
| 250 | #from galaxy.tools.util.maf_utilities import build_maf_index_species_chromosomes |
---|
| 251 | #Traceback (most recent call last): |
---|
| 252 | # File "./scripts/paster.py", line 27, in <module> |
---|
| 253 | # command.run() |
---|
| 254 | # File "build/bdist.solaris-2.11-i86pc/egg/paste/script/command.py", line 78, in run |
---|
| 255 | # File "build/bdist.solaris-2.11-i86pc/egg/paste/script/command.py", line 117, in invoke |
---|
| 256 | # File "build/bdist.solaris-2.11-i86pc/egg/paste/script/command.py", line 212, in run |
---|
| 257 | # File "build/bdist.solaris-2.11-i86pc/egg/paste/script/serve.py", line 227, in command |
---|
| 258 | # File "build/bdist.solaris-2.11-i86pc/egg/paste/script/serve.py", line 250, in loadapp |
---|
| 259 | # File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 193, in loadapp |
---|
| 260 | # File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 213, in loadobj |
---|
| 261 | # File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 237, in loadcontext |
---|
| 262 | # File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 267, in _loadconfig |
---|
| 263 | # File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 397, in get_context |
---|
| 264 | # File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 439, in _context_from_explicit |
---|
| 265 | # File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 18, in import_string |
---|
| 266 | # File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/pkg_resources.py", line 1912, in load |
---|
| 267 | # entry = __import__(self.module_name, globals(),globals(), ['__name__']) |
---|
| 268 | # File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/web/buildapp.py", line 18, in <module> |
---|
| 269 | # from galaxy import config, jobs, util, tools |
---|
| 270 | # File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/jobs/__init__.py", line 3, in <module> |
---|
| 271 | # from galaxy import util, model |
---|
| 272 | # File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/model/__init__.py", line 13, in <module> |
---|
| 273 | # import galaxy.datatypes.registry |
---|
| 274 | # File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/datatypes/registry.py", line 6, in <module> |
---|
| 275 | # import data, tabular, interval, images, sequence, qualityscore, genetics, xml, coverage, tracks, chrominfo |
---|
| 276 | # File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/datatypes/sequence.py", line 344, in <module> |
---|
| 277 | # from galaxy.tools.util.maf_utilities import build_maf_index_species_chromosomes |
---|
| 278 | # File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/tools/__init__.py", line 15, in <module> |
---|
| 279 | # from galaxy import util, jobs, model |
---|
| 280 | #ImportError: cannot import name jobs |
---|
| 281 | #so we'll copy and paste for now...terribly icky |
---|
#*** ANY CHANGE TO THIS METHOD HERE OR IN maf_utilities MUST BE PROPAGATED ***
---|
| 283 | def COPIED_build_maf_index_species_chromosomes( filename, index_species = None ): |
---|
| 284 | species = [] |
---|
| 285 | species_chromosomes = {} |
---|
| 286 | indexes = bx.interval_index_file.Indexes() |
---|
| 287 | blocks = 0 |
---|
| 288 | try: |
---|
| 289 | maf_reader = bx.align.maf.Reader( open( filename ) ) |
---|
| 290 | while True: |
---|
| 291 | pos = maf_reader.file.tell() |
---|
| 292 | block = maf_reader.next() |
---|
| 293 | if block is None: |
---|
| 294 | break |
---|
| 295 | blocks += 1 |
---|
| 296 | for c in block.components: |
---|
| 297 | spec = c.src |
---|
| 298 | chrom = None |
---|
| 299 | if "." in spec: |
---|
| 300 | spec, chrom = spec.split( ".", 1 ) |
---|
| 301 | if spec not in species: |
---|
| 302 | species.append( spec ) |
---|
| 303 | species_chromosomes[spec] = [] |
---|
| 304 | if chrom and chrom not in species_chromosomes[spec]: |
---|
| 305 | species_chromosomes[spec].append( chrom ) |
---|
| 306 | if index_species is None or spec in index_species: |
---|
| 307 | forward_strand_start = c.forward_strand_start |
---|
| 308 | forward_strand_end = c.forward_strand_end |
---|
| 309 | try: |
---|
| 310 | forward_strand_start = int( forward_strand_start ) |
---|
| 311 | forward_strand_end = int( forward_strand_end ) |
---|
| 312 | except ValueError: |
---|
| 313 | continue #start and end are not integers, can't add component to index, goto next component |
---|
| 314 | #this likely only occurs when parse_e_rows is True? |
---|
| 315 | #could a species exist as only e rows? should the |
---|
| 316 | if forward_strand_end > forward_strand_start: |
---|
| 317 | #require positive length; i.e. certain lines have start = end = 0 and cannot be indexed |
---|
| 318 | indexes.add( c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size ) |
---|
| 319 | except Exception, e: |
---|
| 320 | #most likely a bad MAF |
---|
| 321 | log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) ) |
---|
| 322 | return ( None, [], {}, 0 ) |
---|
| 323 | return ( indexes, species, species_chromosomes, blocks ) |
---|
| 324 | |
---|
class Maf( Alignment ):
    """Class describing a Maf alignment"""
    file_ext = "maf"

    #Readonly and optional, users can't unset it, but if it is not set, we are generally ok; if required use a metadata validator in the tool definition
    MetadataElement( name="blocks", default=0, desc="Number of blocks", readonly=True, optional=True, visible=False, no_value=0 )
    # Derived metadata files, both created by set_meta() below: one holds the
    # per-species chromosome lists, the other the interval index of blocks.
    MetadataElement( name="species_chromosomes", desc="Species Chromosomes", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )
    MetadataElement( name="maf_index", desc="MAF Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )

    def init_meta( self, dataset, copy_from=None ):
        # No MAF-specific initialization; delegate to Alignment.
        Alignment.init_meta( self, dataset, copy_from=copy_from )
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """
        Parses and sets species, chromosomes, index from MAF file.
        """
        #these metadata values are not accessible by users, always overwrite
        indexes, species, species_chromosomes, blocks = COPIED_build_maf_index_species_chromosomes( dataset.file_name )
        if indexes is None:
            return #this is not a MAF file
        dataset.metadata.species = species
        dataset.metadata.blocks = blocks

        #write species chromosomes to a file, one species per line:
        #"<species>\t<chrom1>\t<chrom2>..."
        chrom_file = dataset.metadata.species_chromosomes
        if not chrom_file:
            chrom_file = dataset.metadata.spec['species_chromosomes'].param.new_file( dataset = dataset )
        chrom_out = open( chrom_file.file_name, 'wb' )
        for spec, chroms in species_chromosomes.items():
            chrom_out.write( "%s\t%s\n" % ( spec, "\t".join( chroms ) ) )
        chrom_out.close()
        dataset.metadata.species_chromosomes = chrom_file

        index_file = dataset.metadata.maf_index
        if not index_file:
            index_file = dataset.metadata.spec['maf_index'].param.new_file( dataset = dataset )
        # NOTE(review): the handle passed to indexes.write() is never closed
        # explicitly; it is left for the garbage collector to reclaim.
        indexes.write( open( index_file.file_name, 'wb' ) )
        dataset.metadata.maf_index = index_file
    def set_peek( self, dataset, is_multi_byte=False ):
        # Set the peek text and a "<N> blocks" blurb from stored metadata.
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if dataset.metadata.blocks:
                dataset.blurb = "%s blocks" % util.commaify( str( dataset.metadata.blocks ) )
            else:
                # Number of blocks is not known ( this should not happen ), and auto-detect is
                # needed to set metadata
                dataset.blurb = "? blocks"
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return self.make_html_table( dataset )
    def make_html_table( self, dataset, skipchars=[] ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # header row listing the species from metadata
            out.append('<tr><th>Species: ')
            for species in dataset.metadata.species:
                out.append( '%s ' % species )
            out.append( '</th></tr>' )
            if not dataset.peek:
                dataset.set_peek()
            # NOTE(review): local name 'data' shadows the module-level
            # 'data' import for the remainder of this method.
            data = dataset.peek
            lines =  data.splitlines()
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                # one row per non-empty peek line, HTML-escaped
                out.append( '<tr><td>%s</td></tr>' % escape( line ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception, exc:
            out = "Can't create peek %s" % exc
        return out
    def sniff( self, filename ):
        """
        Determines whether the file is in maf format

        The .maf format is line-oriented. Each multiple alignment ends with a blank line.
        Each sequence in an alignment is on a single line, which can get quite long, but
        there is no length limit. Words in a line are delimited by any white space.
        Lines starting with # are considered to be comments. Lines starting with ## can
        be ignored by most programs, but contain meta-data of one form or another.

        The first line of a .maf file begins with ##maf. This word is followed by white-space-separated
        variable=value pairs. There should be no white space surrounding the "=".

        For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format5

        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Maf().sniff( fname )
        True
        >>> fname = get_test_fname( 'sequence.fasta' )
        >>> Maf().sniff( fname )
        False
        """
        headers = get_headers( filename, None )
        try:
            if len(headers) > 1 and headers[0][0] and headers[0][0] == "##maf":
                return True
            else:
                return False
        except:
            return False
---|
| 430 | |
---|
class MafCustomTrack( data.Text ):
    """MAF data preceded by a browser track line, used for display as a custom track."""
    file_ext = "mafcustomtrack"

    # Viewport coordinates ( as strings ) describing where a browser should
    # initially center; computed best-effort by set_meta() below.
    MetadataElement( name="vp_chromosome", default='chr1', desc="Viewport Chromosome", readonly=True, optional=True, visible=False, no_value='' )
    MetadataElement( name="vp_start", default='1', desc="Viewport Start", readonly=True, optional=True, visible=False, no_value='' )
    MetadataElement( name="vp_end", default='100', desc="Viewport End", readonly=True, optional=True, visible=False, no_value='' )

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """
        Parses and sets viewport metadata from MAF file.

        Examines up to max_block_check blocks, tracking the min start and
        max end of components matching the dataset's dbkey on the first
        reference chromosome seen.
        """
        max_block_check = 10
        chrom = None
        forward_strand_start = float( 'inf' )
        forward_strand_end = 0
        try:
            maf_file = open( dataset.file_name )
            try:
                maf_file.readline() #move past track line
                for i, block in enumerate( bx.align.maf.Reader( maf_file ) ):
                    ref_comp = block.get_component_by_src_start( dataset.metadata.dbkey )
                    if ref_comp:
                        ref_chrom = bx.align.maf.src_split( ref_comp.src )[-1]
                        if chrom is None:
                            chrom = ref_chrom
                        if chrom == ref_chrom:
                            forward_strand_start = min( forward_strand_start, ref_comp.forward_strand_start )
                            forward_strand_end = max( forward_strand_end, ref_comp.forward_strand_end )
                    if i > max_block_check:
                        break
            finally:
                # always release the handle; the original never closed it
                maf_file.close()

            if forward_strand_end > forward_strand_start:
                dataset.metadata.vp_chromosome = chrom
                dataset.metadata.vp_start = forward_strand_start
                dataset.metadata.vp_end = forward_strand_end
        except:
            # best-effort: viewport metadata is optional, so parsing
            # failures are deliberately ignored
            pass
---|
| 467 | |
---|
class Axt( data.Text ):
    """Class describing an axt alignment"""

    # gvk- 11/19/09 - This is really an alignment, but we no longer have tools that use this data type, and it is
    # here simply for backward compatibility ( although it is still in the datatypes registry ).  Subclassing
    # from data.Text eliminates managing metadata elements inherited from the Alignment class.

    file_ext = "axt"

    def sniff( self, filename ):
        """
        Determines whether the file is in axt format

        axt alignment files are produced from Blastz, an alignment tool available from Webb Miller's lab
        at Penn State University.

        Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines.
        Blocks are separated from one another by blank lines.

        The summary line contains chromosomal position and size information about the alignment. It
        consists of 9 required fields.

        The sequence lines contain the sequence of the primary assembly (line 2) and aligning assembly
        (line 3) with inserts.  Repeats are indicated by lower-case letters.

        For complete details see http://genome.ucsc.edu/goldenPath/help/axt.html

        >>> fname = get_test_fname( 'alignment.axt' )
        >>> Axt().sniff( fname )
        True
        >>> fname = get_test_fname( 'alignment.lav' )
        >>> Axt().sniff( fname )
        False
        """
        headers = get_headers( filename, None )
        if len( headers ) < 4:
            return False
        for hdr in headers:
            if len( hdr ) > 0 and hdr[0].startswith( "##matrix=axt" ):
                return True
            if len( hdr ) > 0 and not hdr[0].startswith( "#" ):
                # first non-comment line decides: must be a 9-field summary
                if len( hdr ) != 9:
                    return False
                # fields 1,3,4,6,7,9 must be integers; validate explicitly
                # rather than via map(), which is lazy under Python 3
                try:
                    for field in ( hdr[0], hdr[2], hdr[3], hdr[5], hdr[6], hdr[8] ):
                        int( field )
                except ValueError:
                    return False
                if hdr[7] not in data.valid_strand:
                    return False
                else:
                    return True
        # only comment lines seen: return an explicit boolean rather than
        # falling off the end ( the original implicitly returned None )
        return False
---|
| 519 | |
---|
class Lav( data.Text ):
    """Class describing a LAV alignment"""

    file_ext = "lav"

    # gvk- 11/19/09 - This is really an alignment, but we no longer have tools that use this data type, and it is
    # here simply for backward compatibility ( although it is still in the datatypes registry ).  Subclassing
    # from data.Text eliminates managing metadata elements inherited from the Alignment class.

    def sniff( self, filename ):
        """
        Determines whether the file is in lav format

        LAV is an alignment format developed by Webb Miller's group. It is the primary output format for BLASTZ.
        The first line of a .lav file begins with #:lav.

        For complete details see http://www.bioperl.org/wiki/LAV_alignment_format

        >>> fname = get_test_fname( 'alignment.lav' )
        >>> Lav().sniff( fname )
        True
        >>> fname = get_test_fname( 'alignment.axt' )
        >>> Lav().sniff( fname )
        False
        """
        headers = get_headers( filename, None )
        try:
            # a LAV file has at least two header lines and opens with "#:lav"
            first_word = headers[0][0]
            return bool( len( headers ) > 1 and first_word and first_word.startswith( '#:lav' ) )
        except:
            # empty/unreadable header structure: not LAV
            return False
---|