1 | """ |
---|
2 | Tabular datatype |
---|
3 | |
---|
4 | """ |
---|
5 | import pkg_resources |
---|
6 | pkg_resources.require( "bx-python" ) |
---|
7 | |
---|
8 | import logging |
---|
9 | import data |
---|
10 | from galaxy import util |
---|
11 | from cgi import escape |
---|
12 | from galaxy.datatypes import metadata |
---|
13 | from galaxy.datatypes.metadata import MetadataElement |
---|
14 | import galaxy_utils.sequence.vcf |
---|
15 | from sniff import * |
---|
16 | |
---|
17 | log = logging.getLogger(__name__) |
---|
18 | |
---|
class Tabular( data.Text ):
    """Tab delimited data"""

    # Metadata elements describing the tabular structure
    MetadataElement( name="comment_lines", default=0, desc="Number of comment lines", readonly=False, optional=True, no_value=0 )
    MetadataElement( name="columns", default=0, desc="Number of columns", readonly=True, visible=False, no_value=0 )
    MetadataElement( name="column_types", default=[], desc="Column types", param=metadata.ColumnTypesParameter, readonly=True, visible=False, no_value=[] )

    def init_meta( self, dataset, copy_from=None ):
        data.Text.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd ):
        """
        Tries to determine the number of columns as well as those columns
        that contain numerical values in the dataset. A skip parameter is
        used because various tabular data types reuse this function, and
        their data type classes are responsible to determine how many invalid
        comment lines should be skipped. Using None for skip will cause skip
        to be zero, but the first line will be processed as a header. A
        max_data_lines parameter is used because various tabular data types
        reuse this function, and their data type classes are responsible to
        determine how many data lines should be processed to ensure that the
        non-optional metadata parameters are properly set; if used, optional
        metadata parameters will be set to None, unless the entire file has
        already been read. Using None (default) for max_data_lines will
        process all data lines.

        Items of interest:
        1. We treat 'overwrite' as always True (we always want to set tabular metadata when called).
        2. If a tabular file has no data, it will have one column of type 'str'.
        3. We used to check only the first 100 lines when setting metadata and this class's
           set_peek() method read the entire file to determine the number of lines in the file.
           Since metadata can now be processed on cluster nodes, we've merged the line count portion
           of the set_peek() processing here, and we now check the entire contents of the file.
        """
        # Store original skip value to check with later
        requested_skip = skip
        if skip is None:
            skip = 0
        column_type_set_order = [ 'int', 'float', 'list', 'str' ]  # Order to set column types in
        default_column_type = column_type_set_order[-1]  # Default column type is lowest in list
        column_type_compare_order = list( column_type_set_order )  # Order to compare column types
        column_type_compare_order.reverse()
        def type_overrules_type( column_type1, column_type2 ):
            # True when column_type1 should replace column_type2 as the guess for a column
            if column_type1 is None or column_type1 == column_type2:
                return False
            if column_type2 is None:
                return True
            for column_type in column_type_compare_order:
                if column_type1 == column_type:
                    return True
                if column_type2 == column_type:
                    return False
            # Neither column type was found in our ordered list; this cannot happen.
            # (Was previously `raise "..."` — string exceptions are invalid.)
            raise ValueError( "Tried to compare unknown column types: %s, %s" % ( column_type1, column_type2 ) )
        def is_int( column_text ):
            try:
                int( column_text )
                return True
            except ValueError:
                return False
        def is_float( column_text ):
            try:
                float( column_text )
                return True
            except ValueError:
                if column_text.strip().lower() == 'na':
                    return True  # na is special cased to be a float
                return False
        def is_list( column_text ):
            return "," in column_text
        def is_str( column_text ):
            # Anything, except an empty string, is True
            if column_text == "":
                return False
            return True
        is_column_type = {}  # Dict mapping column type string to its checking function
        for column_type in column_type_set_order:
            is_column_type[column_type] = locals()[ "is_%s" % ( column_type ) ]
        def guess_column_type( column_text ):
            # Return the first (most specific) type whose check passes, or None for empty text
            for column_type in column_type_set_order:
                if is_column_type[column_type]( column_text ):
                    return column_type
            return None
        data_lines = 0
        comment_lines = 0
        column_types = []
        first_line_column_types = [ default_column_type ]  # default value is one column of type str
        if dataset.has_data():
            # NOTE: if skip > num_check_lines, we won't detect any metadata, and will use default
            dataset_fh = open( dataset.file_name )
            try:
                i = 0
                while True:
                    line = dataset_fh.readline()
                    if not line:
                        break
                    line = line.rstrip( '\r\n' )
                    if i < skip or not line or line.startswith( '#' ):
                        # We'll call blank lines comments
                        comment_lines += 1
                    else:
                        data_lines += 1
                        fields = line.split( '\t' )
                        for field_count, field in enumerate( fields ):
                            if field_count >= len( column_types ):  # found a previously unknown column, we append None
                                column_types.append( None )
                            column_type = guess_column_type( field )
                            if type_overrules_type( column_type, column_types[field_count] ):
                                column_types[field_count] = column_type
                    if i == 0 and requested_skip is None:
                        # This is our first line, people seem to like to upload files that have a header line, but do not
                        # start with '#' (i.e. all column types would then most likely be detected as str). We will assume
                        # that the first line is always a header (this was previous behavior - it was always skipped). When
                        # the requested skip is None, we only use the data from the first line if we have no other data for
                        # a column. This is far from perfect, as
                        # 1,2,3 1.1 2.2 qwerty
                        # 0 0 1,2,3
                        # will be detected as
                        # "column_types": ["int", "int", "float", "list"]
                        # instead of
                        # "column_types": ["list", "float", "float", "str"] *** would seem to be the 'Truth' by manual
                        # observation that the first line should be included as data. The old method would have detected as
                        # "column_types": ["int", "int", "str", "list"]
                        first_line_column_types = column_types
                        column_types = [ None for col in first_line_column_types ]
                    if max_data_lines is not None and data_lines >= max_data_lines:
                        if dataset_fh.tell() != dataset.get_size():
                            data_lines = None  # Clear optional data_lines metadata value
                            comment_lines = None  # Clear optional comment_lines metadata value; additional comment lines could appear below this point
                        break
                    i += 1
            finally:
                # Always release the file handle, even if an unexpected error occurs above
                dataset_fh.close()
        # We error on the larger number of columns:
        # first we pad our column_types by using data from first line
        if len( first_line_column_types ) > len( column_types ):
            for column_type in first_line_column_types[len( column_types ):]:
                column_types.append( column_type )
        # Now we fill any unknown (None) column_types with data from first line
        for i in range( len( column_types ) ):
            if column_types[i] is None:
                if len( first_line_column_types ) <= i or first_line_column_types[i] is None:
                    column_types[i] = default_column_type
                else:
                    column_types[i] = first_line_column_types[i]
        # Set the discovered metadata values for the dataset
        dataset.metadata.data_lines = data_lines
        dataset.metadata.comment_lines = comment_lines
        dataset.metadata.column_types = column_types
        dataset.metadata.columns = len( column_types )

    def make_html_table( self, dataset, skipchars=None ):
        """Create HTML table, used for displaying peek"""
        if skipchars is None:  # avoid mutable default argument
            skipchars = []
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            out.append( '<tr>' )
            # Generate column header (1-based column numbers)
            for i in range( 1, dataset.metadata.columns + 1 ):
                out.append( '<th>%s</th>' % str( i ) )
            out.append( '</tr>' )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % str( exc )
        return out

    def make_html_peek_rows( self, dataset, skipchars=None ):
        """Render the peek's data lines as HTML table rows; lines starting with a
        skipchar, or whose column count does not match the metadata, are shown
        as full-width comment rows."""
        if skipchars is None:  # avoid mutable default argument
            skipchars = []
        out = [""]
        comments = []
        if not dataset.peek:
            dataset.set_peek()
        data = dataset.peek
        lines = data.splitlines()
        for line in lines:
            line = line.rstrip( '\r\n' )
            if not line:
                continue
            comment = False
            for skipchar in skipchars:
                if line.startswith( skipchar ):
                    comments.append( line )
                    comment = True
                    break
            if comment:
                continue
            elems = line.split( '\t' )
            if len( elems ) != dataset.metadata.columns:
                # We may have an invalid comment line or invalid data
                comments.append( line )
                comment = True
                continue
            while len( comments ) > 0:  # Keep comments
                out.append( '<tr><td colspan="100%">' )
                out.append( '%s</td></tr>' % escape( comments.pop(0) ) )
            out.append( '<tr>' )
            for elem in elems:  # valid data
                elem = escape( elem )
                out.append( '<td>%s</td>' % elem )
            out.append( '</tr>' )
        # Peek may consist only of comments
        while len( comments ) > 0:
            out.append( '<tr><td colspan="100%">' )
            out.append( '%s</td></tr>' % escape( comments.pop(0) ) )
        return "".join( out )

    def set_peek( self, dataset, line_count=None, is_multi_byte=False ):
        """Set the peek/blurb text, appending the comment-line count when known."""
        data.Text.set_peek( self, dataset, line_count=line_count, is_multi_byte=is_multi_byte )
        if dataset.metadata.comment_lines:
            dataset.blurb = "%s, %s comments" % ( dataset.blurb, util.commaify( str( dataset.metadata.comment_lines ) ) )

    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return self.make_html_table( dataset )

    def displayable( self, dataset ):
        """True when the dataset is OK, has data, and has tabular metadata set."""
        try:
            return dataset.has_data() \
                and dataset.state == dataset.states.OK \
                and dataset.metadata.columns > 0 \
                and dataset.metadata.data_lines > 0
        except Exception:
            # Metadata may be unset/invalid; treat as not displayable
            return False

    def as_gbrowse_display_file( self, dataset, **kwd ):
        # Caller is responsible for closing the returned file handle
        return open( dataset.file_name )

    def as_ucsc_display_file( self, dataset, **kwd ):
        # Caller is responsible for closing the returned file handle
        return open( dataset.file_name )
---|
245 | |
---|
class Taxonomy( Tabular ):
    """Tab delimited data with a fixed set of taxonomic rank columns."""

    def __init__(self, **kwd):
        """Initialize taxonomy datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Name', 'TaxId', 'Root', 'Superkingdom', 'Kingdom', 'Subkingdom',
                             'Superphylum', 'Phylum', 'Subphylum', 'Superclass', 'Class', 'Subclass',
                             'Superorder', 'Order', 'Suborder', 'Superfamily', 'Family', 'Subfamily',
                             'Tribe', 'Subtribe', 'Genus', 'Subgenus', 'Species', 'Subspecies'
                             ]

    def make_html_table( self, dataset, skipchars=None ):
        """Create HTML table, used for displaying peek"""
        if skipchars is None:  # avoid mutable default argument
            skipchars = []
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # Generate column header using the named taxonomy columns
            out.append( '<tr>' )
            for i, name in enumerate( self.column_names ):
                out.append( '<th>%s.%s</th>' % ( str( i + 1 ), name ) )
            # This data type requires at least 24 columns in the data;
            # any extra columns get a plain numeric header
            if dataset.metadata.columns - len( self.column_names ) > 0:
                for i in range( len( self.column_names ), dataset.metadata.columns ):
                    out.append( '<th>%s</th>' % str( i + 1 ) )
            out.append( '</tr>' )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % exc
        return out
---|
275 | |
---|
class Sam( Tabular ):
    """Tab delimited data in SAM format"""
    file_ext = 'sam'

    def __init__(self, **kwd):
        """Initialize sam datatype"""
        # NOTE: docstring previously said "taxonomy" — copy/paste error fixed
        Tabular.__init__( self, **kwd )
        self.column_names = ['QNAME', 'FLAG', 'RNAME', 'POS', 'MAPQ', 'CIGAR',
                             'MRNM', 'MPOS', 'ISIZE', 'SEQ', 'QUAL', 'OPT'
                             ]

    def make_html_table( self, dataset, skipchars=None ):
        """Create HTML table, used for displaying peek"""
        if skipchars is None:  # avoid mutable default argument
            skipchars = []
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # Generate column header using the named SAM columns
            out.append( '<tr>' )
            for i, name in enumerate( self.column_names ):
                out.append( '<th>%s.%s</th>' % ( str( i + 1 ), name ) )
            # This data type requires at least 11 columns in the data;
            # any extra columns get a plain numeric header
            if dataset.metadata.columns - len( self.column_names ) > 0:
                for i in range( len( self.column_names ), dataset.metadata.columns ):
                    out.append( '<th>%s</th>' % str( i + 1 ) )
            out.append( '</tr>' )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % exc
        return out

    def sniff( self, filename ):
        """
        Determines whether the file is in SAM format

        A file in SAM format consists of lines of tab-separated data.
        The following header line may be the first line:
        @QNAME FLAG RNAME POS MAPQ CIGAR MRNM MPOS ISIZE SEQ QUAL
        or
        @QNAME FLAG RNAME POS MAPQ CIGAR MRNM MPOS ISIZE SEQ QUAL OPT
        Data in the OPT column is optional and can consist of tab-separated data

        For complete details see http://samtools.sourceforge.net/SAM1.pdf

        Rules for sniffing as True:
        There must be 11 or more columns of data on each line
        Columns 2 (FLAG), 4(POS), 5 (MAPQ), 8 (MPOS), and 9 (ISIZE) must be numbers (9 can be negative)
        We will only check that up to the first 5 alignments are correctly formatted.

        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Sam().sniff( fname )
        False
        >>> fname = get_test_fname( '1.sam' )
        >>> Sam().sniff( fname )
        True
        """
        try:
            fh = open( filename )
            try:
                count = 0
                while True:
                    line = fh.readline()
                    line = line.strip()
                    if not line:
                        # EOF — an empty/blank line also ends the scan
                        break
                    if line[0] != '@':
                        linePieces = line.split('\t')
                        if len(linePieces) < 11:
                            return False
                        try:
                            # FLAG, POS, MAPQ, MPOS, ISIZE must parse as integers
                            int(linePieces[1])
                            int(linePieces[3])
                            int(linePieces[4])
                            int(linePieces[7])
                            int(linePieces[8])
                        except ValueError:
                            return False
                        count += 1
                        if count == 5:
                            return True
            finally:
                # Previously the handle leaked on every early return; close it always
                fh.close()
            # Fewer than 5 alignments, but at least one valid one, is still SAM
            if count < 5 and count > 0:
                return True
        except Exception:
            pass
        return False
---|
358 | |
---|
class Pileup( Tabular ):
    """Tab delimited data in pileup (6- or 10-column) format"""
    file_ext = "pileup"

    # Metadata elements identifying the genomic coordinate columns
    MetadataElement( name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", param=metadata.ColumnParameter )
    MetadataElement( name="baseCol", default=3, desc="Reference base column", param=metadata.ColumnParameter )

    def init_meta( self, dataset, copy_from=None ):
        Tabular.init_meta( self, dataset, copy_from=copy_from )

    def set_peek( self, dataset, line_count=None, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s genomic coordinates" % util.commaify( str( dataset.metadata.data_lines ) )
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata
                    dataset.blurb = "? genomic coordinates"
            else:
                dataset.blurb = "%s genomic coordinates" % util.commaify( str( line_count ) )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def make_html_table( self, dataset, skipchars=None ):
        """Create HTML table, used for displaying peek"""
        if skipchars is None:  # avoid mutable default argument
            skipchars = []
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # Generate column header, labelling the chrom/start/base columns
            out.append('<tr>')
            for i in range( 1, dataset.metadata.columns + 1 ):
                if i == dataset.metadata.chromCol:
                    out.append( '<th>%s.Chrom</th>' % i )
                elif i == dataset.metadata.startCol:
                    out.append( '<th>%s.Start</th>' % i )
                elif i == dataset.metadata.baseCol:
                    out.append( '<th>%s.Base</th>' % i )
                else:
                    out.append( '<th>%s</th>' % i )
            out.append('</tr>')
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % str( exc )
        return out

    def repair_methods( self, dataset ):
        """Return options for removing errors along with a description"""
        return [ ("lines", "Remove erroneous lines") ]

    def sniff( self, filename ):
        """
        Checks for 'pileup-ness'

        There are two main types of pileup: 6-column and 10-column. For both,
        the first three and last two columns are the same. We only check the
        first three to allow for some personalization of the format.

        >>> fname = get_test_fname( 'interval.interval' )
        >>> Pileup().sniff( fname )
        False
        >>> fname = get_test_fname( '6col.pileup' )
        >>> Pileup().sniff( fname )
        True
        >>> fname = get_test_fname( '10col.pileup' )
        >>> Pileup().sniff( fname )
        True
        """
        headers = get_headers( filename, '\t' )
        try:
            for hdr in headers:
                if hdr and not hdr[0].startswith( '#' ):
                    if len( hdr ) < 3:
                        return False
                    # chrom start is in column 1 (with 0-based columns)
                    # and reference base is in column 2
                    try:
                        int( hdr[1] )
                    except ValueError:
                        return False
                    # Explicit membership test instead of `assert`, which is
                    # stripped (and so never checked) under `python -O`
                    if hdr[2] not in [ 'A', 'C', 'G', 'T', 'N', 'a', 'c', 'g', 't', 'n' ]:
                        return False
            return True
        except Exception:
            return False
---|
451 | |
---|
class Eland( Tabular ):
    """Tab delimited data with the Eland file extension."""
    file_ext = 'eland'

    def sniff( self, filename ):
        # No reliable content-based detection; never auto-detect this type.
        return False
---|
457 | |
---|
class ElandMulti( Tabular ):
    """Tab delimited data with the Eland-multi file extension."""
    file_ext = 'elandmulti'

    def sniff( self, filename ):
        # No reliable content-based detection; never auto-detect this type.
        return False
---|
463 | |
---|
class Vcf( Tabular ):
    """ Variant Call Format for describing SNPs and other simple genome variations. """

    file_ext = 'vcf'
    column_names = [ 'Chrom', 'Pos', 'ID', 'Ref', 'Alt', 'Qual', 'Filter', 'Info', 'Format', 'data' ]

    MetadataElement( name="columns", default=10, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=['str','int','str','str','str','int','str','list','str','str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
    MetadataElement( name="viz_filter_cols", default=[5], param=metadata.ColumnParameter, multiple=True )

    def sniff( self, filename ):
        """Determines whether the file is in VCF format."""
        try:
            # If reader can read and parse every line of the file, it's VCF.
            # Iterate the reader directly instead of materializing a throwaway list.
            for line in galaxy_utils.sequence.vcf.Reader( open( filename ) ):
                pass
            return True
        except Exception:
            return False

    def make_html_table( self, dataset, skipchars=None ):
        """Create HTML table, used for displaying peek"""
        if skipchars is None:  # avoid mutable default argument
            skipchars = []
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # Generate column header using the named VCF columns
            out.append( '<tr>' )
            for i, name in enumerate( self.column_names ):
                out.append( '<th>%s.%s</th>' % ( str( i + 1 ), name ) )
            # Close the header row; it was previously left open, producing malformed HTML
            out.append( '</tr>' )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % exc
        return out

    def get_track_type( self ):
        return "FeatureTrack", {"data": "interval_index", "index": "summary_tree"}
---|