"""
Sequence classes
"""

import data
import logging
import re
import string
from cgi import escape
from galaxy.datatypes.metadata import MetadataElement
from galaxy.datatypes import metadata
import galaxy.model
from galaxy import util
from sniff import *

log = logging.getLogger(__name__)

class Sequence( data.Text ):
    """Class describing a sequence"""

    # Add metadata elements
    MetadataElement( name="sequences", default=0, desc="Number of sequences", readonly=True, visible=False, optional=True, no_value=0 )

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of sequences and the number of data lines in dataset.
        """
        data_lines = 0
        sequences = 0
        for line in file( dataset.file_name ):
            line = line.strip()
            if line and line.startswith( '#' ):
                # We don't count comment lines for sequence data types
                continue
            if line and line.startswith( '>' ):
                sequences += 1
                data_lines += 1
            else:
                data_lines += 1
        dataset.metadata.data_lines = data_lines
        dataset.metadata.sequences = sequences
    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if dataset.metadata.sequences:
                dataset.blurb = "%s sequences" % util.commaify( str( dataset.metadata.sequences ) )
            else:
                dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

class Alignment( data.Text ):
    """Class describing an alignment"""

    # Add metadata elements
    MetadataElement( name="species", desc="Species", default=[], param=metadata.SelectParameter, multiple=True, readonly=True, no_value=None )

class Fasta( Sequence ):
    """Class representing a FASTA sequence"""
    file_ext = "fasta"

    def sniff( self, filename ):
        """
        Determines whether the file is in FASTA format

        A sequence in FASTA format consists of a single-line description, followed by lines of sequence data.
        The first character of the description line is a greater-than (">") symbol in the first column.
        All lines should be shorter than 80 characters.

        For complete details see http://www.ncbi.nlm.nih.gov/blast/fasta.shtml

        Rules for sniffing as True:
            We don't care about line length (other than empty lines).
            The first non-empty line must start with '>', and the very next line, stripped, must contain sequence data and not be another header.
            'Sequence data' here is loosely defined as a non-empty line that does not start with '>'.
            This will cause Color Space FASTA (csfasta) to be detected as True (they are, after all, still FASTA files - they have a header line followed by sequence data).
            Previously this method checked whether the sequence data contained integers (presumably to differentiate between fasta and csfasta).
            That is better handled through sniff order, where csfasta (whose sniff enforces a stricter definition) is checked first, followed later by fasta.
            We only check that the first purported sequence is correctly formatted.

        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Fasta().sniff( fname )
        False
        >>> fname = get_test_fname( 'sequence.fasta' )
        >>> Fasta().sniff( fname )
        True
        """

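        # Illustration (an assumed example, not taken from the original tests): these
        # rules accept a file whose first non-empty line is a '>' header immediately
        # followed by sequence data, e.g.
        #   >sequence_1 optional description
        #   ATGCATGC
        # and reject one whose header is followed by a blank line or another header.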
        try:
            fh = open( filename )
            while True:
                line = fh.readline()
                if not line:
                    break # EOF
                line = line.strip()
                if line: # first non-empty line
                    if line.startswith( '>' ):
                        # The next line, stripped, must not be empty and must not start with '>'
                        line = fh.readline().strip()
                        if line == '' or line.startswith( '>' ):
                            break
                        return True
                    else:
                        break # we found a non-empty line, but it's not a FASTA header
            fh.close()
        except:
            pass
        return False

class csFasta( Sequence ):
    """ Class representing the SOLiD Color-Space sequence ( csfasta ) """
    file_ext = "csfasta"

    def sniff( self, filename ):
        """
        Color-space sequence:
            >2_15_85_F3
            T213021013012303002332212012112221222112212222

        >>> fname = get_test_fname( 'sequence.fasta' )
        >>> csFasta().sniff( fname )
        False
        >>> fname = get_test_fname( 'sequence.csfasta' )
        >>> csFasta().sniff( fname )
        True
        """
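        # Explanatory note (not part of the original comments): the check below accepts
        # a '>' header followed by a color-space read.  The read starts with an
        # uppercase letter (typically the final primer base, e.g. 'T') and every
        # remaining character must be a digit or '.', matching reads such as the
        # example in the docstring above.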
        try:
            fh = open( filename )
            while True:
                line = fh.readline()
                if not line:
                    break # EOF
                line = line.strip()
                if line and not line.startswith( '#' ): # first non-empty non-comment line
                    if line.startswith( '>' ):
                        line = fh.readline().strip()
                        if line == '' or line.startswith( '>' ):
                            break
                        elif line[0] not in string.ascii_uppercase:
                            return False
                        elif len( line ) > 1 and not re.search( r'^[\d.]+$', line[1:] ):
                            return False
                        return True
                    else:
                        break # we found a non-empty line, but it's not a header
            fh.close()
        except:
            pass
        return False

    def set_meta( self, dataset, **kwd ):
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.data_lines = None
            dataset.metadata.sequences = None
            return
        return Sequence.set_meta( self, dataset, **kwd )

class Fastq( Sequence ):
    """Class representing a generic FASTQ sequence"""
    file_ext = "fastq"

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of sequences and the number of data lines
        in dataset.
        """
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.data_lines = None
            dataset.metadata.sequences = None
            return
        data_lines = 0
        sequences = 0
        seq_counter = 0 # blocks should be 4 lines long
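        # A well-formed FASTQ record spans four lines, e.g. (illustrative values):
        #   @read_id
        #   GATTTGGGGTTCAAAGCAGT
        #   +
        #   !''*((((***+))%%%++)
        # Because a quality line may itself begin with '@', a new sequence is only
        # counted once at least four lines have accumulated since the previous header.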
        for line in file( dataset.file_name ):
            line = line.strip()
            if line and line.startswith( '#' ) and not sequences:
                # We don't count comment lines for sequence data types
                continue
            if line and line.startswith( '@' ):
                if seq_counter >= 4:
                    # count previous block
                    # blocks should be 4 lines long
                    sequences += 1
                    seq_counter = 1
                else:
                    # in case quality line starts with @
                    seq_counter += 1
                data_lines += 1
            else:
                data_lines += 1
                seq_counter += 1
        if seq_counter >= 4:
            # count final block
            sequences += 1
        dataset.metadata.data_lines = data_lines
        dataset.metadata.sequences = sequences
    def sniff( self, filename ):
        """
        Determines whether the file is in generic fastq format.
        For details, see http://maq.sourceforge.net/fastq.shtml

        Note: There are three kinds of FASTQ files, known as "Sanger" (sometimes called "Standard"), Solexa, and Illumina.
        These differ in the representation of the quality scores.

        >>> fname = get_test_fname( '1.fastqsanger' )
        >>> Fastq().sniff( fname )
        True
        >>> fname = get_test_fname( '2.fastqsanger' )
        >>> Fastq().sniff( fname )
        True
        """
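        # Note on the helper below: get_headers() comes in via the wildcard import of
        # the sniff module and is assumed to return the leading lines of the file,
        # each split into whitespace-delimited fields, so headers[0][0] is the first
        # token of line one.  The test requires an '@' header on line 1, sequence
        # data on line 2 and a '+' separator on line 3.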
        headers = get_headers( filename, None )
        bases_regexp = re.compile( "^[NGTAC]*" )
        # check that first block looks like a fastq block
        try:
            if len( headers ) >= 4 and headers[0][0] and headers[0][0][0] == "@" and headers[2][0] and headers[2][0][0] == "+" and headers[1][0]:
                # Check the sequence line, make sure it contains only G/C/A/T/N
                if not bases_regexp.match( headers[1][0] ):
                    return False
                return True
            return False
        except:
            return False

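# Background note (not required by the code above): the FASTQ variants below differ
# only in how quality scores are encoded -- Sanger uses Phred scores with an ASCII
# offset of 33, Solexa uses Solexa scores with an offset of 64, and Illumina 1.3+
# uses Phred scores with an offset of 64.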
class FastqSanger( Fastq ):
    """Class representing a FASTQ sequence ( the Sanger variant )"""
    file_ext = "fastqsanger"

class FastqSolexa( Fastq ):
    """Class representing a FASTQ sequence ( the Solexa variant )"""
    file_ext = "fastqsolexa"

class FastqIllumina( Fastq ):
    """Class representing a FASTQ sequence ( the Illumina 1.3+ variant )"""
    file_ext = "fastqillumina"

class FastqCSSanger( Fastq ):
    """Class representing a Color Space FASTQ sequence ( e.g. a SOLiD variant )"""
    file_ext = "fastqcssanger"

try:
    from galaxy import eggs
    import pkg_resources; pkg_resources.require( "bx-python" )
    import bx.align.maf
    import bx.interval_index_file
except:
    pass

#trying to import maf_utilities here throws an ImportError due to a circular import between jobs and tools:
#from galaxy.tools.util.maf_utilities import build_maf_index_species_chromosomes
#Traceback (most recent call last):
#  File "./scripts/paster.py", line 27, in <module>
#    command.run()
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/script/command.py", line 78, in run
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/script/command.py", line 117, in invoke
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/script/command.py", line 212, in run
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/script/serve.py", line 227, in command
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/script/serve.py", line 250, in loadapp
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 193, in loadapp
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 213, in loadobj
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 237, in loadcontext
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 267, in _loadconfig
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 397, in get_context
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 439, in _context_from_explicit
#  File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 18, in import_string
#  File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/pkg_resources.py", line 1912, in load
#    entry = __import__(self.module_name, globals(),globals(), ['__name__'])
#  File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/web/buildapp.py", line 18, in <module>
#    from galaxy import config, jobs, util, tools
#  File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/jobs/__init__.py", line 3, in <module>
#    from galaxy import util, model
#  File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/model/__init__.py", line 13, in <module>
#    import galaxy.datatypes.registry
#  File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/datatypes/registry.py", line 6, in <module>
#    import data, tabular, interval, images, sequence, qualityscore, genetics, xml, coverage, tracks, chrominfo
#  File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/datatypes/sequence.py", line 344, in <module>
#    from galaxy.tools.util.maf_utilities import build_maf_index_species_chromosomes
#  File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/tools/__init__.py", line 15, in <module>
#    from galaxy import util, jobs, model
#ImportError: cannot import name jobs
#so we'll copy and paste for now...terribly icky
#*** ANY CHANGE TO THIS METHOD HERE OR IN maf_utilities MUST BE PROPAGATED ***
def COPIED_build_maf_index_species_chromosomes( filename, index_species = None ):
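    # Returns a tuple ( indexes, species, species_chromosomes, blocks ); if the file
    # cannot be read as MAF, the fallback ( None, [], {}, 0 ) is returned instead.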
    species = []
    species_chromosomes = {}
    indexes = bx.interval_index_file.Indexes()
    blocks = 0
    try:
        maf_reader = bx.align.maf.Reader( open( filename ) )
        while True:
            pos = maf_reader.file.tell()
            block = maf_reader.next()
            if block is None:
                break
            blocks += 1
            for c in block.components:
                spec = c.src
                chrom = None
                if "." in spec:
                    spec, chrom = spec.split( ".", 1 )
                if spec not in species:
                    species.append( spec )
                    species_chromosomes[spec] = []
                if chrom and chrom not in species_chromosomes[spec]:
                    species_chromosomes[spec].append( chrom )
                if index_species is None or spec in index_species:
                    forward_strand_start = c.forward_strand_start
                    forward_strand_end = c.forward_strand_end
                    try:
                        forward_strand_start = int( forward_strand_start )
                        forward_strand_end = int( forward_strand_end )
                    except ValueError:
                        continue # start and end are not integers, can't add component to index, go to next component
                        # this likely only occurs when parse_e_rows is True?
                        # could a species exist as only e rows? should the
                    if forward_strand_end > forward_strand_start:
                        # require positive length; i.e. certain lines have start = end = 0 and cannot be indexed
                        indexes.add( c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size )
    except Exception, e:
        # most likely a bad MAF
        log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) )
        return ( None, [], {}, 0 )
    return ( indexes, species, species_chromosomes, blocks )

class Maf( Alignment ):
    """Class describing a Maf alignment"""
    file_ext = "maf"

    # Readonly and optional, users can't unset it, but if it is not set, we are generally ok; if required use a metadata validator in the tool definition
    MetadataElement( name="blocks", default=0, desc="Number of blocks", readonly=True, optional=True, visible=False, no_value=0 )
    MetadataElement( name="species_chromosomes", desc="Species Chromosomes", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )
    MetadataElement( name="maf_index", desc="MAF Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )

    def init_meta( self, dataset, copy_from=None ):
        Alignment.init_meta( self, dataset, copy_from=copy_from )
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """
        Parses and sets species, chromosomes, index from MAF file.
        """
        # these metadata values are not accessible by users, always overwrite
        indexes, species, species_chromosomes, blocks = COPIED_build_maf_index_species_chromosomes( dataset.file_name )
        if indexes is None:
            return # this is not a MAF file
        dataset.metadata.species = species
        dataset.metadata.blocks = blocks

        # write species chromosomes to a file
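        # The file is laid out one species per line: the species name followed by a
        # tab-separated list of its chromosomes, e.g. "hg18<TAB>chr1<TAB>chr2"
        # (the species name here is purely illustrative).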
        chrom_file = dataset.metadata.species_chromosomes
        if not chrom_file:
            chrom_file = dataset.metadata.spec['species_chromosomes'].param.new_file( dataset = dataset )
        chrom_out = open( chrom_file.file_name, 'wb' )
        for spec, chroms in species_chromosomes.items():
            chrom_out.write( "%s\t%s\n" % ( spec, "\t".join( chroms ) ) )
        chrom_out.close()
        dataset.metadata.species_chromosomes = chrom_file

        index_file = dataset.metadata.maf_index
        if not index_file:
            index_file = dataset.metadata.spec['maf_index'].param.new_file( dataset = dataset )
        indexes.write( open( index_file.file_name, 'wb' ) )
        dataset.metadata.maf_index = index_file
    def set_peek( self, dataset, is_multi_byte=False ):
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if dataset.metadata.blocks:
                dataset.blurb = "%s blocks" % util.commaify( str( dataset.metadata.blocks ) )
            else:
                # Number of blocks is not known ( this should not happen ), and auto-detect is
                # needed to set metadata
                dataset.blurb = "? blocks"
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
    def display_peek( self, dataset ):
        """Returns formatted HTML of peek"""
        return self.make_html_table( dataset )
    def make_html_table( self, dataset, skipchars=[] ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            out.append('<tr><th>Species: ')
            for species in dataset.metadata.species:
                out.append( '%s ' % species )
            out.append( '</th></tr>' )
            if not dataset.peek:
                dataset.set_peek()
            data = dataset.peek
            lines = data.splitlines()
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                out.append( '<tr><td>%s</td></tr>' % escape( line ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception, exc:
            out = "Can't create peek %s" % exc
        return out
    def sniff( self, filename ):
        """
        Determines whether the file is in MAF format

        The .maf format is line-oriented. Each multiple alignment ends with a blank line.
        Each sequence in an alignment is on a single line, which can get quite long, but
        there is no length limit. Words in a line are delimited by any white space.
        Lines starting with # are considered to be comments. Lines starting with ## can
        be ignored by most programs, but contain meta-data of one form or another.

        The first line of a .maf file begins with ##maf. This word is followed by white-space-separated
        variable=value pairs. There should be no white space surrounding the "=".

        For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format5

        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Maf().sniff( fname )
        True
        >>> fname = get_test_fname( 'sequence.fasta' )
        >>> Maf().sniff( fname )
        False
        """
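        # Only the first line is examined; a conforming file starts with something like
        #   ##maf version=1 scoring=tba.v8
        # (the variable=value pairs after the '##maf' token are not inspected here).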
        headers = get_headers( filename, None )
        try:
            if len(headers) > 1 and headers[0][0] and headers[0][0] == "##maf":
                return True
            else:
                return False
        except:
            return False

class MafCustomTrack( data.Text ):
    file_ext = "mafcustomtrack"

    MetadataElement( name="vp_chromosome", default='chr1', desc="Viewport Chromosome", readonly=True, optional=True, visible=False, no_value='' )
    MetadataElement( name="vp_start", default='1', desc="Viewport Start", readonly=True, optional=True, visible=False, no_value='' )
    MetadataElement( name="vp_end", default='100', desc="Viewport End", readonly=True, optional=True, visible=False, no_value='' )

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """
        Parses and sets viewport metadata from MAF file.
        """
        max_block_check = 10
        chrom = None
        forward_strand_start = float( 'inf' )
        forward_strand_end = 0
        try:
            maf_file = open( dataset.file_name )
            maf_file.readline() # move past track line
            for i, block in enumerate( bx.align.maf.Reader( maf_file ) ):
                ref_comp = block.get_component_by_src_start( dataset.metadata.dbkey )
                if ref_comp:
                    ref_chrom = bx.align.maf.src_split( ref_comp.src )[-1]
                    if chrom is None:
                        chrom = ref_chrom
                    if chrom == ref_chrom:
                        forward_strand_start = min( forward_strand_start, ref_comp.forward_strand_start )
                        forward_strand_end = max( forward_strand_end, ref_comp.forward_strand_end )
                if i > max_block_check:
                    break

            if forward_strand_end > forward_strand_start:
                dataset.metadata.vp_chromosome = chrom
                dataset.metadata.vp_start = forward_strand_start
                dataset.metadata.vp_end = forward_strand_end
        except:
            pass

class Axt( data.Text ):
    """Class describing an axt alignment"""

    # gvk- 11/19/09 - This is really an alignment, but we no longer have tools that use this data type, and it is
    # here simply for backward compatibility ( although it is still in the datatypes registry ).  Subclassing
    # from data.Text eliminates managing metadata elements inherited from the Alignment class.

    file_ext = "axt"

    def sniff( self, filename ):
        """
        Determines whether the file is in axt format

        axt alignment files are produced by Blastz, an alignment tool available from Webb Miller's lab
        at Penn State University.

        Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines.
        Blocks are separated from one another by blank lines.

        The summary line contains chromosomal position and size information about the alignment. It
        consists of 9 required fields.

        The sequence lines contain the sequence of the primary assembly (line 2) and aligning assembly
        (line 3) with inserts. Repeats are indicated by lower-case letters.

        For complete details see http://genome.ucsc.edu/goldenPath/help/axt.html

        >>> fname = get_test_fname( 'alignment.axt' )
        >>> Axt().sniff( fname )
        True
        >>> fname = get_test_fname( 'alignment.lav' )
        >>> Axt().sniff( fname )
        False
        """
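        # A summary line has nine whitespace-separated fields, e.g.
        #   0 chr19 3001012 3001075 chr11 70568380 70568443 - 3500
        # (alignment number, primary chrom/start/end, aligning chrom/start/end, strand,
        # blastz score).  The loop below validates the integer fields and the strand of
        # the first such line it finds.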
        headers = get_headers( filename, None )
        if len(headers) < 4:
            return False
        for hdr in headers:
            if len(hdr) > 0 and hdr[0].startswith("##matrix=axt"):
                return True
            if len(hdr) > 0 and not hdr[0].startswith("#"):
                if len(hdr) != 9:
                    return False
                try:
                    map( int, [hdr[0], hdr[2], hdr[3], hdr[5], hdr[6], hdr[8]] )
                except:
                    return False
                if hdr[7] not in data.valid_strand:
                    return False
                else:
                    return True

class Lav( data.Text ):
    """Class describing a LAV alignment"""

    file_ext = "lav"

    # gvk- 11/19/09 - This is really an alignment, but we no longer have tools that use this data type, and it is
    # here simply for backward compatibility ( although it is still in the datatypes registry ).  Subclassing
    # from data.Text eliminates managing metadata elements inherited from the Alignment class.

    def sniff( self, filename ):
        """
        Determines whether the file is in lav format

        LAV is an alignment format developed by Webb Miller's group. It is the primary output format for BLASTZ.
        The first line of a .lav file begins with #:lav.

        For complete details see http://www.bioperl.org/wiki/LAV_alignment_format

        >>> fname = get_test_fname( 'alignment.lav' )
        >>> Lav().sniff( fname )
        True
        >>> fname = get_test_fname( 'alignment.axt' )
        >>> Lav().sniff( fname )
        False
        """
        headers = get_headers( filename, None )
        try:
            if len(headers) > 1 and headers[0][0] and headers[0][0].startswith('#:lav'):
                return True
            else:
                return False
        except:
            return False
|---|