[2] | 1 | """ |
---|
| 2 | Tabular datatype |
---|
| 3 | |
---|
| 4 | """ |
---|
| 5 | import pkg_resources |
---|
| 6 | pkg_resources.require( "bx-python" ) |
---|
| 7 | |
---|
| 8 | import logging |
---|
| 9 | import data |
---|
| 10 | from galaxy import util |
---|
| 11 | from cgi import escape |
---|
| 12 | from galaxy.datatypes import metadata |
---|
| 13 | from galaxy.datatypes.metadata import MetadataElement |
---|
| 14 | import galaxy_utils.sequence.vcf |
---|
| 15 | from sniff import * |
---|
| 16 | |
---|
| 17 | log = logging.getLogger(__name__) |
---|
| 18 | |
---|
class Tabular( data.Text ):
    """Generic tab delimited data; subclasses refine column names and types."""

    # Metadata elements describing the tabular structure
    MetadataElement( name="comment_lines", default=0, desc="Number of comment lines", readonly=False, optional=True, no_value=0 )
    MetadataElement( name="columns", default=0, desc="Number of columns", readonly=True, visible=False, no_value=0 )
    MetadataElement( name="column_types", default=[], desc="Column types", param=metadata.ColumnTypesParameter, readonly=True, visible=False, no_value=[] )

    def init_meta( self, dataset, copy_from=None ):
        data.Text.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite=True, skip=None, max_data_lines=None, **kwd ):
        """
        Tries to determine the number of columns as well as those columns
        that contain numerical values in the dataset.  A skip parameter is
        used because various tabular data types reuse this function, and
        their data type classes are responsible to determine how many invalid
        comment lines should be skipped. Using None for skip will cause skip
        to be zero, but the first line will be processed as a header. A
        max_data_lines parameter is used because various tabular data types
        reuse this function, and their data type classes are responsible to
        determine how many data lines should be processed to ensure that the
        non-optional metadata parameters are properly set; if used, optional
        metadata parameters will be set to None, unless the entire file has
        already been read. Using None (default) for max_data_lines will
        process all data lines.

        Items of interest:
        1. We treat 'overwrite' as always True (we always want to set tabular metadata when called).
        2. If a tabular file has no data, it will have one column of type 'str'.
        3. We used to check only the first 100 lines when setting metadata and this class's
           set_peek() method read the entire file to determine the number of lines in the file.
           Since metadata can now be processed on cluster nodes, we've merged the line count portion
           of the set_peek() processing here, and we now check the entire contents of the file.
        """
        # Store original skip value to check with later
        requested_skip = skip
        if skip is None:
            skip = 0
        column_type_set_order = [ 'int', 'float', 'list', 'str' ]  # Order to set column types in
        default_column_type = column_type_set_order[-1]  # Default column type is lowest in list
        column_type_compare_order = list( column_type_set_order )  # Order to compare column types
        column_type_compare_order.reverse()
        def type_overrules_type( column_type1, column_type2 ):
            # True when column_type1 is a more general type than column_type2,
            # per column_type_compare_order (str > list > float > int).
            if column_type1 is None or column_type1 == column_type2:
                return False
            if column_type2 is None:
                return True
            for column_type in column_type_compare_order:
                if column_type1 == column_type:
                    return True
                if column_type2 == column_type:
                    return False
            # Neither column type was found in our ordered list, this cannot happen
            raise ValueError( "Tried to compare unknown column types" )
        def is_int( column_text ):
            try:
                int( column_text )
                return True
            except ValueError:
                return False
        def is_float( column_text ):
            try:
                float( column_text )
                return True
            except ValueError:
                if column_text.strip().lower() == 'na':
                    return True  # na is special cased to be a float
                return False
        def is_list( column_text ):
            return "," in column_text
        def is_str( column_text ):
            # Anything, except an empty string, is True
            if column_text == "":
                return False
            return True
        # Map each column type name to its checking function; an explicit dict
        # is used rather than a fragile locals()[ "is_%s" % type ] lookup.
        is_column_type = {
            'int': is_int,
            'float': is_float,
            'list': is_list,
            'str': is_str,
        }
        def guess_column_type( column_text ):
            # Return the most specific type (per column_type_set_order) that
            # accepts column_text, or None when even 'str' rejects it (empty).
            for column_type in column_type_set_order:
                if is_column_type[column_type]( column_text ):
                    return column_type
            return None
        data_lines = 0
        comment_lines = 0
        column_types = []
        first_line_column_types = [default_column_type]  # default value is one column of type str
        if dataset.has_data():
            # NOTE: if skip > num_check_lines, we won't detect any metadata, and will use default
            dataset_fh = open( dataset.file_name )
            i = 0
            while True:
                line = dataset_fh.readline()
                if not line:
                    break
                line = line.rstrip( '\r\n' )
                if i < skip or not line or line.startswith( '#' ):
                    # We'll call blank lines comments
                    comment_lines += 1
                else:
                    data_lines += 1
                    fields = line.split( '\t' )
                    for field_count, field in enumerate( fields ):
                        if field_count >= len( column_types ):  # found a previously unknown column, we append None
                            column_types.append( None )
                        column_type = guess_column_type( field )
                        if type_overrules_type( column_type, column_types[field_count] ):
                            column_types[field_count] = column_type
                    if i == 0 and requested_skip is None:
                        # This is our first line, people seem to like to upload files that have a header line, but do not
                        # start with '#' (i.e. all column types would then most likely be detected as str).  We will assume
                        # that the first line is always a header (this was previous behavior - it was always skipped).  When
                        # the requested skip is None, we only use the data from the first line if we have no other data for
                        # a column.  This is far from perfect, as
                        # 1,2,3	1.1	2.2	qwerty
                        # 0	0	1,2,3
                        # will be detected as
                        # "column_types": ["int", "int", "float", "list"]
                        # instead of
                        # "column_types": ["list", "float", "float", "str"]  *** would seem to be the 'Truth' by manual
                        # observation that the first line should be included as data.  The old method would have detected as
                        # "column_types": ["int", "int", "str", "list"]
                        first_line_column_types = column_types
                        column_types = [ None for col in first_line_column_types ]
                    if max_data_lines is not None and data_lines >= max_data_lines:
                        if dataset_fh.tell() != dataset.get_size():
                            data_lines = None  # Clear optional data_lines metadata value
                            comment_lines = None  # Clear optional comment_lines metadata value; additional comment lines could appear below this point
                        break
                i += 1
            dataset_fh.close()

        # We error on the larger number of columns.
        # First we pad our column_types by using data from the first line.
        if len( first_line_column_types ) > len( column_types ):
            for column_type in first_line_column_types[len( column_types ):]:
                column_types.append( column_type )
        # Now we fill any unknown (None) column_types with data from the first line.
        for i in range( len( column_types ) ):
            if column_types[i] is None:
                if len( first_line_column_types ) <= i or first_line_column_types[i] is None:
                    column_types[i] = default_column_type
                else:
                    column_types[i] = first_line_column_types[i]
        # Set the discovered metadata values for the dataset
        dataset.metadata.data_lines = data_lines
        dataset.metadata.comment_lines = comment_lines
        dataset.metadata.column_types = column_types
        dataset.metadata.columns = len( column_types )

    def make_html_table( self, dataset, skipchars=[] ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            out.append( '<tr>' )
            # Generate column header
            for i in range( 1, dataset.metadata.columns + 1 ):
                out.append( '<th>%s</th>' % str( i ) )
            out.append( '</tr>' )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % str( exc )
        return out

    def _flush_comments( self, comments, out ):
        """Append each queued comment line as a full-width, escaped table row."""
        while len( comments ) > 0:
            out.append( '<tr><td colspan="100%">' )
            out.append( '%s</td></tr>' % escape( comments.pop( 0 ) ) )

    def make_html_peek_rows( self, dataset, skipchars=[] ):
        out = [""]
        comments = []
        if not dataset.peek:
            dataset.set_peek()
        data = dataset.peek
        lines = data.splitlines()
        for line in lines:
            line = line.rstrip( '\r\n' )
            if not line:
                continue
            comment = False
            for skipchar in skipchars:
                if line.startswith( skipchar ):
                    comments.append( line )
                    comment = True
                    break
            if comment:
                continue
            elems = line.split( '\t' )
            if len( elems ) != dataset.metadata.columns:
                # We may have an invalid comment line or invalid data
                comments.append( line )
                comment = True
                continue
            self._flush_comments( comments, out )  # Keep comments
            out.append( '<tr>' )
            for elem in elems:  # valid data
                elem = escape( elem )
                out.append( '<td>%s</td>' % elem )
            out.append( '</tr>' )
        # Peek may consist only of comments
        self._flush_comments( comments, out )
        return "".join( out )

    def set_peek( self, dataset, line_count=None, is_multi_byte=False ):
        data.Text.set_peek( self, dataset, line_count=line_count, is_multi_byte=is_multi_byte )
        if dataset.metadata.comment_lines:
            dataset.blurb = "%s, %s comments" % ( dataset.blurb, util.commaify( str( dataset.metadata.comment_lines ) ) )

    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return self.make_html_table( dataset )

    def displayable( self, dataset ):
        # Any metadata access failure means the dataset is not displayable.
        try:
            return dataset.has_data() \
                and dataset.state == dataset.states.OK \
                and dataset.metadata.columns > 0 \
                and dataset.metadata.data_lines > 0
        except Exception:
            return False

    def as_gbrowse_display_file( self, dataset, **kwd ):
        return open( dataset.file_name )

    def as_ucsc_display_file( self, dataset, **kwd ):
        return open( dataset.file_name )
---|
| 245 | |
---|
class Taxonomy( Tabular ):
    """Tabular data whose columns follow the standard taxonomy ranks."""

    def __init__( self, **kwd ):
        """Initialize taxonomy datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Name', 'TaxId', 'Root', 'Superkingdom', 'Kingdom', 'Subkingdom',
                             'Superphylum', 'Phylum', 'Subphylum', 'Superclass', 'Class', 'Subclass',
                             'Superorder', 'Order', 'Suborder', 'Superfamily', 'Family', 'Subfamily',
                             'Tribe', 'Subtribe', 'Genus', 'Subgenus', 'Species', 'Subspecies'
                             ]

    def make_html_table( self, dataset, skipchars=[] ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # Generate column header
            out.append( '<tr>' )
            for i, name in enumerate( self.column_names ):
                out.append( '<th>%s.%s</th>' % ( str( i + 1 ), name ) )
            # This data type requires at least 24 columns in the data;
            # number any extra columns beyond the named ones.
            if dataset.metadata.columns - len( self.column_names ) > 0:
                for i in range( len( self.column_names ), dataset.metadata.columns ):
                    out.append( '<th>%s</th>' % str( i + 1 ) )
            out.append( '</tr>' )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % exc
        return out
---|
| 275 | |
---|
class Sam( Tabular ):
    """Sequence Alignment/Map format alignments."""
    file_ext = 'sam'

    def __init__( self, **kwd ):
        """Initialize sam datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['QNAME', 'FLAG', 'RNAME', 'POS', 'MAPQ', 'CIGAR',
                             'MRNM', 'MPOS', 'ISIZE', 'SEQ', 'QUAL', 'OPT'
                             ]

    def make_html_table( self, dataset, skipchars=[] ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # Generate column header
            out.append( '<tr>' )
            for i, name in enumerate( self.column_names ):
                out.append( '<th>%s.%s</th>' % ( str( i + 1 ), name ) )
            # This data type requires at least 11 columns in the data;
            # number any extra columns beyond the named ones.
            if dataset.metadata.columns - len( self.column_names ) > 0:
                for i in range( len( self.column_names ), dataset.metadata.columns ):
                    out.append( '<th>%s</th>' % str( i + 1 ) )
            out.append( '</tr>' )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % exc
        return out

    def sniff( self, filename ):
        """
        Determines whether the file is in SAM format

        A file in SAM format consists of lines of tab-separated data.
        The following header line may be the first line:
        @QNAME	FLAG	RNAME	POS	MAPQ	CIGAR	MRNM	MPOS	ISIZE	SEQ	QUAL
        or
        @QNAME	FLAG	RNAME	POS	MAPQ	CIGAR	MRNM	MPOS	ISIZE	SEQ	QUAL	OPT
        Data in the OPT column is optional and can consist of tab-separated data

        For complete details see http://samtools.sourceforge.net/SAM1.pdf

        Rules for sniffing as True:
            There must be 11 or more columns of data on each line
            Columns 2 (FLAG), 4(POS), 5 (MAPQ), 8 (MPOS), and 9 (ISIZE) must be numbers (9 can be negative)
            We will only check that up to the first 5 alignments are correctly formatted.

        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Sam().sniff( fname )
        False
        >>> fname = get_test_fname( '1.sam' )
        >>> Sam().sniff( fname )
        True
        """
        try:
            fh = open( filename )
            try:
                count = 0
                while True:
                    line = fh.readline()
                    if not line:
                        break  # EOF (test before stripping so a blank line does not end the scan)
                    line = line.strip()
                    if line and line[0] != '@':
                        line_pieces = line.split( '\t' )
                        if len( line_pieces ) < 11:
                            return False
                        try:
                            # FLAG, POS, MAPQ, MPOS and ISIZE must all be integers
                            int( line_pieces[1] )
                            int( line_pieces[3] )
                            int( line_pieces[4] )
                            int( line_pieces[7] )
                            int( line_pieces[8] )
                        except ValueError:
                            return False
                        count += 1
                        if count == 5:
                            return True
                # Fewer than 5 alignments seen, but all of them were valid
                if 0 < count < 5:
                    return True
            finally:
                # Close the handle on every exit path (the original leaked it on early returns)
                fh.close()
        except Exception:
            pass
        return False
---|
| 358 | |
---|
class Pileup( Tabular ):
    """Tab delimited data in pileup (6- or 10-column) format"""
    file_ext = "pileup"

    # Metadata elements locating the genomic coordinate columns
    MetadataElement( name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", param=metadata.ColumnParameter )
    MetadataElement( name="baseCol", default=3, desc="Reference base column", param=metadata.ColumnParameter )

    def init_meta( self, dataset, copy_from=None ):
        Tabular.init_meta( self, dataset, copy_from=copy_from )

    def set_peek( self, dataset, line_count=None, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s genomic coordinates" % util.commaify( str( dataset.metadata.data_lines ) )
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata
                    dataset.blurb = "? genomic coordinates"
            else:
                dataset.blurb = "%s genomic coordinates" % util.commaify( str( line_count ) )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def make_html_table( self, dataset, skipchars=[] ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # Generate column header, labelling the coordinate columns
            out.append( '<tr>' )
            for i in range( 1, dataset.metadata.columns + 1 ):
                if i == dataset.metadata.chromCol:
                    out.append( '<th>%s.Chrom</th>' % i )
                elif i == dataset.metadata.startCol:
                    out.append( '<th>%s.Start</th>' % i )
                elif i == dataset.metadata.baseCol:
                    out.append( '<th>%s.Base</th>' % i )
                else:
                    out.append( '<th>%s</th>' % i )
            out.append( '</tr>' )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % str( exc )
        return out

    def repair_methods( self, dataset ):
        """Return options for removing errors along with a description"""
        return [ ("lines", "Remove erroneous lines") ]

    def sniff( self, filename ):
        """
        Checks for 'pileup-ness'

        There are two main types of pileup: 6-column and 10-column. For both,
        the first three and last two columns are the same. We only check the
        first three to allow for some personalization of the format.

        >>> fname = get_test_fname( 'interval.interval' )
        >>> Pileup().sniff( fname )
        False
        >>> fname = get_test_fname( '6col.pileup' )
        >>> Pileup().sniff( fname )
        True
        >>> fname = get_test_fname( '10col.pileup' )
        >>> Pileup().sniff( fname )
        True
        """
        headers = get_headers( filename, '\t' )
        try:
            for hdr in headers:
                if hdr and not hdr[0].startswith( '#' ):
                    if len( hdr ) < 3:
                        return False
                    # Chrom start is in column 1 (with 0-based columns)
                    # and the reference base is in column 2
                    try:
                        int( hdr[1] )
                    except ValueError:
                        return False
                    if hdr[2] not in [ 'A', 'C', 'G', 'T', 'N', 'a', 'c', 'g', 't', 'n' ]:
                        return False
            return True
        except Exception:
            return False
---|
| 451 | |
---|
class Eland( Tabular ):
    """Tabular data in Eland export format."""
    file_ext = 'eland'

    def sniff( self, filename ):
        # Auto-detection is not implemented for this format.
        return False
---|
| 457 | |
---|
class ElandMulti( Tabular ):
    """Tabular data in Eland multi format."""
    file_ext = 'elandmulti'

    def sniff( self, filename ):
        # Auto-detection is not implemented for this format.
        return False
---|
| 463 | |
---|
class Vcf( Tabular ):
    """ Variant Call Format for describing SNPs and other simple genome variations. """

    file_ext = 'vcf'
    column_names = [ 'Chrom', 'Pos', 'ID', 'Ref', 'Alt', 'Qual', 'Filter', 'Info', 'Format', 'data' ]

    MetadataElement( name="columns", default=10, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=['str', 'int', 'str', 'str', 'str', 'int', 'str', 'list', 'str', 'str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
    MetadataElement( name="viz_filter_cols", default=[5], param=metadata.ColumnParameter, multiple=True )

    def sniff( self, filename ):
        try:
            fh = open( filename )
            try:
                # If the reader can read and parse every line, it's VCF.
                # Iterate the reader directly rather than materializing the
                # whole file into a list first.
                for line in galaxy_utils.sequence.vcf.Reader( fh ):
                    pass
                return True
            finally:
                # Close the handle on every exit path (the original leaked it)
                fh.close()
        except Exception:
            return False

    def make_html_table( self, dataset, skipchars=[] ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # Generate column header
            out.append( '<tr>' )
            for i, name in enumerate( self.column_names ):
                out.append( '<th>%s.%s</th>' % ( str( i + 1 ), name ) )
            # Close the header row (the original omitted this, producing malformed HTML)
            out.append( '</tr>' )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % exc
        return out

    def get_track_type( self ):
        return "FeatureTrack", {"data": "interval_index", "index": "summary_tree"}
---|