Context Navigation

genetics.py @ 2

リビジョン 2, 32.0 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central

Rev	行番号
"""
rgenetics datatypes
Use at your peril
Ross Lazarus
for the rgenetics and galaxy projects

genome graphs datatypes derived from Interval datatypes
genome graphs datasets have a header row with appropriate column names
The first column is always the marker - eg column name = rs, first row = rs12345 if the rows are snps
subsequent row values must all be numeric! Will fail if there are any non-numeric (eg '+' or 'NA') values
ross lazarus for rgenetics
august 20 2007
"""
	14
	15	import logging, os, sys, time, tempfile, shutil, string, glob
	16	import data
	17	from galaxy import util
	18	from cgi import escape
	19	import urllib, binascii
	20	from galaxy.web import url_for
	21	from galaxy.datatypes import metadata
	22	from galaxy.datatypes.metadata import MetadataElement
	23	from galaxy.datatypes.data import Text
	24	from galaxy.datatypes.tabular import Tabular
	25	from galaxy.datatypes.images import Html
	26	from galaxy.datatypes.interval import Interval
	27	from galaxy.util.hash_util import *
	28
	29	gal_Log = logging.getLogger(__name__)
	30	verbose = False
	31
	32	class GenomeGraphs( Tabular ):
	33	"""
	34	Tab delimited data containing a marker id and any number of numeric values
	35	"""
	36
	37	MetadataElement( name="markerCol", default=1, desc="Marker ID column", param=metadata.ColumnParameter )
	38	MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True )
	39	MetadataElement( name="column_types", default=[], desc="Column types", readonly=True, visible=False )
	40	file_ext = 'gg'
	41
	42	def __init__(self, **kwd):
	43	"""
	44	Initialize gg datatype, by adding UCSC display apps
	45	"""
	46	Tabular.__init__(self, **kwd)
	47	self.add_display_app ( 'ucsc', 'Genome Graph', 'as_ucsc_display_file', 'ucsc_links' )
	48
	49
	50	def set_meta(self,dataset,**kwd):
	51	Tabular.set_meta( self, dataset, **kwd)
	52	dataset.metadata.markerCol = 1
	53	header = file(dataset.file_name,'r').readlines()[0].strip().split('\t')
	54	dataset.metadata.columns = len(header)
	55	t = ['numeric' for x in header]
	56	t[0] = 'string'
	57	dataset.metadata.column_types = t
	58	return True
	59
	60	def as_ucsc_display_file( self, dataset, **kwd ):
	61	"""
	62	Returns file
	63	"""
	64	return file(dataset.file_name,'r')
	65
	66	def ucsc_links( self, dataset, type, app, base_url ):
	67	"""
	68	from the ever-helpful angie hinrichs angie@soe.ucsc.edu
	69	a genome graphs call looks like this
	70	http://genome.ucsc.edu/cgi-bin/hgGenome?clade=mammal&org=Human&db=hg18&hgGenome_dataSetName=dname
	71	&hgGenome_dataSetDescription=test&hgGenome_formatType=best%20guess&hgGenome_markerType=best%20guess
	72	&hgGenome_columnLabels=best%20guess&hgGenome_maxVal=&hgGenome_labelVals=
	73	&hgGenome_maxGapToFill=25000000&hgGenome_uploadFile=http://galaxy.esphealth.org/datasets/333/display/index
	74	&hgGenome_doSubmitUpload=submit
	75	Galaxy gives this for an interval file
	76	http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg18&position=chr1:1-1000&hgt.customText=
	77	http%3A%2F%2Fgalaxy.esphealth.org%2Fdisplay_as%3Fid%3D339%26display_app%3Ducsc
	78	"""
	79	ret_val = []
	80	ggtail = 'hgGenome_doSubmitUpload=submit'
	81	if not dataset.dbkey:
	82	dataset.dbkey = 'hg18' # punt!
	83	if dataset.has_data():
	84	for site_name, site_url in util.get_ucsc_by_build(dataset.dbkey):
	85	if site_name in app.config.ucsc_display_sites:
	86	site_url = site_url.replace('/hgTracks?','/hgGenome?') # for genome graphs
	87	internal_url = "%s" % url_for( controller='dataset',
	88	dataset_id=dataset.id, action='display_at', filename='ucsc_' + site_name )
	89	display_url = "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at" % (base_url, url_for( controller='root' ), dataset.id, type)
	90	display_url = urllib.quote_plus( display_url )
	91	# was display_url = urllib.quote_plus( "%s/display_as?id=%i&display_app=%s" % (base_url, dataset.id, type) )
	92	#redirect_url = urllib.quote_plus( "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s" % (site_url, dataset.dbkey, chrom, start, stop) )
	93	sl = ["%sdb=%s" % (site_url,dataset.dbkey ),]
	94	#sl.append("&hgt.customText=%s")
	95	sl.append("&hgGenome_dataSetName=%s&hgGenome_dataSetDescription=%s" % (dataset.name, 'GalaxyGG_data'))
	96	sl.append("&hgGenome_formatType=best guess&hgGenome_markerType=best guess")
	97	sl.append("&hgGenome_columnLabels=first row&hgGenome_maxVal=&hgGenome_labelVals=")
	98	sl.append("&hgGenome_doSubmitUpload=submit")
	99	sl.append("&hgGenome_maxGapToFill=25000000&hgGenome_uploadFile=%s" % display_url)
	100	s = ''.join(sl)
	101	s = urllib.quote_plus(s)
	102	redirect_url = s
	103	log.debug('## rg gg ucsc rdurl=%s; s = %s' % (redirect_url,s))
	104	link = '%s?redirect_url=%s&display_url=%s' % ( internal_url, redirect_url, display_url )
	105	ret_val.append( (site_name, link) )
	106	return ret_val
	107
	108	def make_html_table( self, dataset, skipchars=[] ):
	109	"""
	110	Create HTML table, used for displaying peek
	111	"""
	112	npeek = 5
	113	out = ['<table cellspacing="0" cellpadding="3">']
	114	f = open(dataset.file_name,'r')
	115	d = f.readlines()[:5]
	116	if len(d) == 0:
	117	out = "Cannot find anything to parse in %s" % dataset.name
	118	return out
	119	hasheader = 0
	120	try:
	121	test = ['%f' % x for x in d[0][1:]] # first is name - see if starts all numerics
	122	except:
	123	hasheader = 1
	124	try:
	125	# Generate column header
	126	out.append( '<tr>' )
	127	if hasheader:
	128	for i, name in enumerate(d[0].split() ):
	129	out.append( '<th>%s.%s</th>' % ( str( i+1 ), name ) )
	130	d.pop(0)
	131	out.append('</tr>')
	132	for row in d:
	133	out.append('<tr>')
	134	out.append(''.join(['<td>%s</td>' % x for x in row.split()]))
	135	out.append('</tr>')
	136	out.append( '</table>' )
	137	out = "".join( out )
	138	except Exception, exc:
	139	out = "Can't create peek %s" % exc
	140	return out
	141
	142	def validate( self, dataset ):
	143	"""
	144	Validate a gg file - all numeric after header row
	145	"""
	146	errors = list()
	147	infile = open(dataset.file_name, "r")
	148	header= infile.next() # header
	149	for i,row in enumerate(infile):
	150	ll = row.strip().split('\t')[1:] # first is alpha feature identifier
	151	badvals = []
	152	for j,x in enumerate(ll):
	153	try:
	154	x = float(x)
	155	except:
	156	badval.append('col%d:%s' % (j+1,x))
	157	if len(badvals) > 0:
	158	errors.append('row %d, %s' % (' '.join(badvals)))
	159	return errors
	160
	161	def sniff( self, filename ):
	162	"""
	163	Determines whether the file is in gg format
	164	"""
	165	f = open(filename,'r')
	166	headers = f.readline().split()
	167	rows = [f.readline().split()[1:] for x in range(3)] # small sample
	168	#headers = get_headers( filename, '\t' )
	169	for row in rows:
	170	try:
	171	nums = [float(x) for x in row] # first col has been removed
	172	except:
	173	return false
	174	return true
	175
	176	def get_mime(self):
	177	"""Returns the mime type of the datatype"""
	178	return 'application/vnd.ms-excel'
	179
	180
	181	class rgTabList(Tabular):
	182	"""
	183	for sampleid and for featureid lists of exclusions or inclusions in the clean tool
	184	featureid subsets on statistical criteria -> specialized display such as gg
	185	"""
	186	file_ext = "rgTList"
	187
	188
	189	def __init__(self, **kwd):
	190	"""
	191	Initialize featurelistt datatype
	192	"""
	193	Tabular.__init__( self, **kwd )
	194	self.column_names = []
	195
	196	def make_html_table( self, dataset, skipchars=[] ):
	197	"""
	198	Create HTML table, used for displaying peek
	199	"""
	200	out = ['<table cellspacing="0" cellpadding="3">']
	201	comments = []
	202	try:
	203	# Generate column header
	204	out.append( '<tr>' )
	205	for i, name in enumerate( self.column_names ):
	206	out.append( '<th>%s.%s</th>' % ( str( i+1 ), name ) )
	207	if dataset.metadata.columns - len( self.column_names ) > 0:
	208	for i in range( len( self.column_names ), dataset.metadata.columns ):
	209	out.append( '<th>%s</th>' % str( i+1 ) )
	210	out.append( '</tr>' )
	211	out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
	212	out.append( '</table>' )
	213	out = "".join( out )
	214	except Exception, exc:
	215	out = "Can't create peek %s" % exc
	216	return out
	217
	218	def get_mime(self):
	219	"""Returns the mime type of the datatype"""
	220	return 'text/html'
	221
	222
	223	class rgSampleList(rgTabList):
	224	"""
	225	for sampleid exclusions or inclusions in the clean tool
	226	output from QC eg excess het, gender error, ibd pair member,eigen outlier,excess mendel errors,...
	227	since they can be uploaded, should be flexible
	228	but they are persistent at least
	229	same infrastructure for expression?
	230	"""
	231	file_ext = "rgSList"
	232
	233	def __init__(self, **kwd):
	234	"""
	235	Initialize samplelist datatype
	236	"""
	237	rgTabList.__init__( self, **kwd )
	238	self.column_names[0] = 'FID'
	239	self.column_names[1] = 'IID'
	240	# this is what Plink wants as at 2009
	241
	242	def sniff(self,filename):
	243	infile = open(dataset.file_name, "r")
	244	header= infile.next() # header
	245	if header[0] == 'FID' and header[1] == 'IID':
	246	return True
	247	else:
	248	return False
	249
	250	class rgFeatureList( rgTabList ):
	251	"""
	252	for featureid lists of exclusions or inclusions in the clean tool
	253	output from QC eg low maf, high missingness, bad hwe in controls, excess mendel errors,...
	254	featureid subsets on statistical criteria -> specialized display such as gg
	255	same infrastructure for expression?
	256	"""
	257	file_ext = "rgFList"
	258
	259	def __init__(self, **kwd):
	260	"""Initialize featurelist datatype"""
	261	rgTabList.__init__( self, **kwd )
	262	for i,s in enumerate(['#FeatureId', 'Chr', 'Genpos', 'Mappos']):
	263	self.column_names[i] = s
	264
	265
	266	class Rgenetics(Html):
	267	"""
	268	base class to use for rgenetics datatypes
	269	derived from html - composite datatype elements
	270	stored in extra files path
	271	"""
	272
	273	MetadataElement( name="base_name", desc="base name for all transformed versions of this genetic dataset", default='RgeneticsData',
	274	readonly=True, set_in_upload=True)
	275
	276	composite_type = 'auto_primary_file'
	277	allow_datatype_change = False
	278	file_ext = 'rgenetics'
	279
	280	def generate_primary_file( self, dataset = None ):
	281	rval = ['<html><head><title>Rgenetics Galaxy Composite Dataset </title></head><p/>']
	282	rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
	283	for composite_name, composite_file in self.get_composite_files( dataset = dataset ).iteritems():
	284	fn = composite_name
	285	opt_text = ''
	286	if composite_file.optional:
	287	opt_text = ' (optional)'
	288	if composite_file.get('description'):
	289	rval.append( '<li><a href="%s" type="application/binary">%s (%s)</a>%s</li>' % ( fn, fn, composite_file.get('description'), opt_text ) )
	290	else:
	291	rval.append( '<li><a href="%s" type="application/binary">%s</a>%s</li>' % ( fn, fn, opt_text ) )
	292	rval.append( '</ul></div></html>' )
	293	return "\n".join( rval )
	294
	295	def regenerate_primary_file(self,dataset):
	296	"""
	297	cannot do this until we are setting metadata
	298	"""
	299	bn = dataset.metadata.base_name
	300	efp = dataset.extra_files_path
	301	flist = os.listdir(efp)
	302	rval = ['<html><head><title>Files for Composite Dataset %s</title></head><body><p/>Composite %s contains:<p/><ul>' % (dataset.name,dataset.name)]
	303	for i,fname in enumerate(flist):
	304	sfname = os.path.split(fname)[-1]
	305	f,e = os.path.splitext(fname)
	306	rval.append( '<li><a href="%s">%s</a></li>' % ( sfname, sfname) )
	307	rval.append( '</ul></body></html>' )
	308	f = file(dataset.file_name,'w')
	309	f.write("\n".join( rval ))
	310	f.write('\n')
	311	f.close()
	312
	313	def get_mime(self):
	314	"""Returns the mime type of the datatype"""
	315	return 'text/html'
	316
	317
	318	def set_meta( self, dataset, **kwd ):
	319
	320	"""
	321	for lped/pbed eg
	322
	323	"""
	324	Html.set_meta( self, dataset, **kwd )
	325	if kwd.get('overwrite') == False:
	326	if verbose:
	327	gal_Log.debug('@@@ rgenetics set_meta called with overwrite = False')
	328	return True
	329	try:
	330	efp = dataset.extra_files_path
	331	except:
	332	if verbose:
	333	gal_Log.debug('@@@rgenetics set_meta failed %s - dataset %s has no efp ?' % (sys.exc_info()[0], dataset.name))
	334	return False
	335	try:
	336	flist = os.listdir(efp)
	337	except:
	338	if verbose: gal_Log.debug('@@@rgenetics set_meta failed %s - dataset %s has no efp ?' % (sys.exc_info()[0],dataset.name))
	339	return False
	340	if len(flist) == 0:
	341	if verbose:
	342	gal_Log.debug('@@@rgenetics set_meta failed - %s efp %s is empty?' % (dataset.name,efp))
	343	return False
	344	self.regenerate_primary_file(dataset)
	345	if not dataset.info:
	346	dataset.info = 'Galaxy genotype datatype object'
	347	if not dataset.blurb:
	348	dataset.blurb = 'Composite file - Rgenetics Galaxy toolkit'
	349	return True
	350
	351
	352
	353	class SNPMatrix(Rgenetics):
	354	"""
	355	BioC SNPMatrix Rgenetics data collections
	356	"""
	357	file_ext="snpmatrix"
	358
	359	def set_peek( self, dataset, **kwd ):
	360	if not dataset.dataset.purged:
	361	dataset.peek = "Binary RGenetics file"
	362	dataset.blurb = data.nice_size( dataset.get_size() )
	363	else:
	364	dataset.peek = 'file does not exist'
	365	dataset.blurb = 'file purged from disk'
	366
	367	def sniff(self,filename):
	368	""" need to check the file header hex code
	369	"""
	370	infile = open(dataset.file_name, "b")
	371	head = infile.read(16)
	372	head = [hex(x) for x in head]
	373	if head <> '':
	374	return False
	375	else:
	376	return True
	377
	378
	379	class Lped(Rgenetics):
	380	"""
	381	linkage pedigree (ped,map) Rgenetics data collections
	382	"""
	383	file_ext="lped"
	384
	385	def __init__( self, **kwd ):
	386	Rgenetics.__init__(self, **kwd)
	387	self.add_composite_file( '%s.ped', description = 'Pedigree File', substitute_name_with_metadata = 'base_name', is_binary = False )
	388	self.add_composite_file( '%s.map', description = 'Map File', substitute_name_with_metadata = 'base_name', is_binary = False )
	389
	390
	391	class Pphe(Rgenetics):
	392	"""
	393	Plink phenotype file - header must have FID\tIID... Rgenetics data collections
	394	"""
	395	file_ext="pphe"
	396
	397	def __init__( self, **kwd ):
	398	Rgenetics.__init__(self, **kwd)
	399	self.add_composite_file( '%s.pphe', description = 'Plink Phenotype File', substitute_name_with_metadata = 'base_name', is_binary = False )
	400
	401
	402
	403
	404	class Fphe(Rgenetics):
	405	"""
	406	fbat pedigree file - mad format with ! as first char on header row
	407	Rgenetics data collections
	408	"""
	409	file_ext="fphe"
	410
	411	def __init__( self, **kwd ):
	412	Rgenetics.__init__(self, **kwd)
	413	self.add_composite_file( '%s.fphe', description = 'FBAT Phenotype File', substitute_name_with_metadata = 'base_name' )
	414
	415	class Phe(Rgenetics):
	416	"""
	417	Phenotype file
	418	"""
	419	file_ext="phe"
	420
	421	def __init__( self, **kwd ):
	422	Rgenetics.__init__(self, **kwd)
	423	self.add_composite_file( '%s.phe', description = 'Phenotype File', substitute_name_with_metadata = 'base_name',
	424	is_binary = False )
	425
	426
	427
	428	class Fped(Rgenetics):
	429	"""
	430	FBAT pedigree format - single file, map is header row of rs numbers. Strange.
	431	Rgenetics data collections
	432	"""
	433	file_ext="fped"
	434
	435	def __init__( self, **kwd ):
	436	Rgenetics.__init__(self, **kwd)
	437	self.add_composite_file( '%s.fped', description = 'FBAT format pedfile', substitute_name_with_metadata = 'base_name',
	438	is_binary = False )
	439
	440
	441	class Pbed(Rgenetics):
	442	"""
	443	Plink Binary compressed 2bit/geno Rgenetics data collections
	444	"""
	445	file_ext="pbed"
	446
	447	def __init__( self, **kwd ):
	448	Rgenetics.__init__(self, **kwd)
	449	self.add_composite_file( '%s.bim', substitute_name_with_metadata = 'base_name', is_binary = False )
	450	self.add_composite_file( '%s.bed', substitute_name_with_metadata = 'base_name', is_binary = True )
	451	self.add_composite_file( '%s.fam', substitute_name_with_metadata = 'base_name', is_binary = False )
	452
	453	class ldIndep(Rgenetics):
	454	"""
	455	LD (a good measure of redundancy of information) depleted Plink Binary compressed 2bit/geno
	456	This is really a plink binary, but some tools work better with less redundancy so are constrained to
	457	these files
	458	"""
	459	file_ext="ldreduced"
	460
	461	def __init__( self, **kwd ):
	462	Rgenetics.__init__(self, **kwd)
	463	self.add_composite_file( '%s.bim', substitute_name_with_metadata = 'base_name', is_binary = False )
	464	self.add_composite_file( '%s.bed', substitute_name_with_metadata = 'base_name', is_binary = True )
	465	self.add_composite_file( '%s.fam', substitute_name_with_metadata = 'base_name', is_binary = False )
	466
	467
	468	class Eigenstratgeno(Rgenetics):
	469	"""
	470	Eigenstrat format - may be able to get rid of this
	471	if we move to shellfish
	472	Rgenetics data collections
	473	"""
	474	file_ext="eigenstratgeno"
	475
	476	def __init__( self, **kwd ):
	477	Rgenetics.__init__(self, **kwd)
	478	self.add_composite_file( '%s.eigenstratgeno', substitute_name_with_metadata = 'base_name', is_binary = False )
	479	self.add_composite_file( '%s.ind', substitute_name_with_metadata = 'base_name', is_binary = False )
	480	self.add_composite_file( '%s.map', substitute_name_with_metadata = 'base_name', is_binary = False )
	481
	482
	483
	484	class Eigenstratpca(Rgenetics):
	485	"""
	486	Eigenstrat PCA file for case control adjustment
	487	Rgenetics data collections
	488	"""
	489	file_ext="eigenstratpca"
	490
	491	def __init__( self, **kwd ):
	492	Rgenetics.__init__(self, **kwd)
	493	self.add_composite_file( '%s.eigenstratpca', description = 'Eigenstrat PCA file', substitute_name_with_metadata = 'base_name' )
	494
	495
	496	class Snptest(Rgenetics):
	497	"""
	498	BioC snptest Rgenetics data collections
	499	"""
	500	file_ext="snptest"
	501
	502
	503	class Pheno(Tabular):
	504	"""
	505	base class for pheno files
	506	"""
	507	file_ext = 'pheno'
	508
	509
	510	class RexpBase( Html ):
	511	"""
	512	base class for BioC data structures in Galaxy
	513	must be constructed with the pheno data in place since that
	514	goes into the metadata for each instance
	515	"""
	516	MetadataElement( name="columns", default=0, desc="Number of columns", visible=True )
	517	MetadataElement( name="column_names", default=[], desc="Column names", visible=True )
	518	MetadataElement(name="pheCols",default=[],desc="Select list for potentially interesting variables",visible=True)
	519	MetadataElement( name="base_name",
	520	desc="base name for all transformed versions of this expression dataset", default='rexpression', set_in_upload=True)
	521	MetadataElement( name="pheno_path", desc="Path to phenotype data for this experiment", default="rexpression.pheno", visible=True)
	522	file_ext = 'rexpbase'
	523	html_table = None
	524	is_binary = True
	525	composite_type = 'auto_primary_file'
	526	allow_datatype_change = False
	527
	528
	529	def __init__( self, **kwd ):
	530	Html.__init__(self,**kwd)
	531	self.add_composite_file( '%s.pheno', description = 'Phenodata tab text file',
	532	substitute_name_with_metadata = 'base_name', is_binary=False)
	533
	534	def generate_primary_file( self, dataset = None ):
	535	"""
	536	This is called only at upload to write the html file
	537	cannot rename the datasets here - they come with the default unfortunately
	538	"""
	539	return '<html><head></head><body>AutoGenerated Primary File for Composite Dataset</body></html>'
	540
	541	def get_mime(self):
	542	"""Returns the mime type of the datatype"""
	543	return 'text/html'
	544
	545	def get_phecols(self, phenolist=[], maxConc=20):
	546	"""
	547	sept 2009: cannot use whitespace to split - make a more complex structure here
	548	and adjust the methods that rely on this structure
	549	return interesting phenotype column names for an rexpression eset or affybatch
	550	to use in array subsetting and so on. Returns a data structure for a
	551	dynamic Galaxy select parameter.
	552	A column with only 1 value doesn't change, so is not interesting for
	553	analysis. A column with a different value in every row is equivalent to a unique
	554	identifier so is also not interesting for anova or limma analysis - both these
	555	are removed after the concordance (count of unique terms) is constructed for each
	556	column. Then a complication - each remaining pair of columns is tested for
	557	redundancy - if two columns are always paired, then only one is needed :)
	558	"""
	559	for nrows,row in enumerate(phenolist): # construct concordance
	560	if len(row.strip()) == 0:
	561	break
	562	row = row.strip().split('\t')
	563	if nrows == 0: # set up from header
	564	head = row
	565	totcols = len(row)
	566	concordance = [{} for x in head] # list of dicts
	567	else:
	568	for col,code in enumerate(row): # keep column order correct
	569	if col >= totcols:
	570	gal_Log.warning('### get_phecols error in pheno file - row %d col %d (%s) longer than header %s' % (nrows, col, row, head))
	571	else:
	572	concordance[col].setdefault(code,0) # first one is zero
	573	concordance[col][code] += 1
	574	useCols = []
	575	useConc = [] # columns of interest to keep
	576	nrows = len(phenolist)
	577	nrows -= 1 # drop head from count
	578	for c,conc in enumerate(concordance): # c is column number
	579	if (len(conc) > 1) and (len(conc) < min(nrows,maxConc)): # not all same and not all different!!
	580	useConc.append(conc) # keep concordance
	581	useCols.append(c) # keep column
	582	nuse = len(useCols)
	583	# now to check for pairs of concordant columns - drop one of these.
	584	delme = []
	585	p = phenolist[1:] # drop header
	586	plist = [x.strip().split('\t') for x in p] # list of lists
	587	phe = [[x[i] for i in useCols] for x in plist if len(x) >= totcols] # strip unused data
	588	for i in range(0,(nuse-1)): # for each interesting column
	589	for j in range(i+1,nuse):
	590	kdict = {}
	591	for row in phe: # row is a list of lists
	592	k = '%s%s' % (row[i],row[j]) # composite key
	593	kdict[k] = k
	594	if (len(kdict.keys()) == len(concordance[useCols[j]])): # i and j are always matched
	595	delme.append(j)
	596	delme = list(set(delme)) # remove dupes
	597	listCol = []
	598	delme.sort()
	599	delme.reverse() # must delete from far end!
	600	for i in delme:
	601	del useConc[i] # get rid of concordance
	602	del useCols[i] # and usecols entry
	603	for i,conc in enumerate(useConc): # these are all unique columns for the design matrix
	604	ccounts = [(conc.get(code,0),code) for code in conc.keys()] # decorate
	605	ccounts.sort()
	606	cc = [(x[1],x[0]) for x in ccounts] # list of code count tuples
	607	codeDetails = (head[useCols[i]],cc) # ('foo',[('a',3),('b',11),..])
	608	listCol.append(codeDetails)
	609	if len(listCol) > 0:
	610	res = listCol
	611	# metadata.pheCols becomes [('bar;22,zot;113','foo'), ...]
	612	else:
	613	res = [('no usable phenotype columns found',[('?',0),]),]
	614	return res
	615
	616
	617
	618	def get_pheno(self,dataset):
	619	"""
	620	expects a .pheno file in the extra_files_dir - ugh
	621	note that R is wierd and adds the row.name in
	622	the header so the columns are all wrong - unless you tell it not to.
	623	A file can be written as
	624	write.table(file='foo.pheno',pData(foo),sep='\t',quote=F,row.names=F)
	625	"""
	626	p = file(dataset.metadata.pheno_path,'r').readlines()
	627	if len(p) > 0: # should only need to fix an R pheno file once
	628	head = p[0].strip().split('\t')
	629	line1 = p[1].strip().split('\t')
	630	if len(head) < len(line1):
	631	head.insert(0,'ChipFileName') # fix R write.table b0rken-ness
	632	p[0] = '\t'.join(head)
	633	else:
	634	p = []
	635	return '\n'.join(p)
	636
	637	def set_peek( self, dataset, **kwd ):
	638	"""
	639	expects a .pheno file in the extra_files_dir - ugh
	640	note that R is wierd and does not include the row.name in
	641	the header. why?"""
	642	if not dataset.dataset.purged:
	643	pp = os.path.join(dataset.extra_files_path,'%s.pheno' % dataset.metadata.base_name)
	644	try:
	645	p = file(pp,'r').readlines()
	646	except:
	647	p = ['##failed to find %s' % pp,]
	648	dataset.peek = ''.join(p[:5])
	649	dataset.blurb = 'Galaxy Rexpression composite file'
	650	else:
	651	dataset.peek = 'file does not exist\n'
	652	dataset.blurb = 'file purged from disk'
	653
	654	def get_peek( self, dataset ):
	655	"""
	656	expects a .pheno file in the extra_files_dir - ugh
	657	"""
	658	pp = os.path.join(dataset.extra_files_path,'%s.pheno' % dataset.metadata.base_name)
	659	try:
	660	p = file(pp,'r').readlines()
	661	except:
	662	p = ['##failed to find %s' % pp]
	663	return ''.join(p[:5])
	664
	665	def get_file_peek(self,filename):
	666	"""
	667	can't really peek at a filename - need the extra_files_path and such?
	668	"""
	669	h = '## rexpression get_file_peek: no file found'
	670	try:
	671	h = file(filename,'r').readlines()
	672	except:
	673	pass
	674	return ''.join(h[:5])
	675
	676	def regenerate_primary_file(self,dataset):
	677	"""
	678	cannot do this until we are setting metadata
	679	"""
	680	bn = dataset.metadata.base_name
	681	flist = os.listdir(dataset.extra_files_path)
	682	rval = ['<html><head><title>Files for Composite Dataset %s</title></head><p/>Comprises the following files:<p/><ul>' % (bn)]
	683	for i,fname in enumerate(flist):
	684	sfname = os.path.split(fname)[-1]
	685	rval.append( '<li><a href="%s">%s</a>' % ( sfname, sfname ) )
	686	rval.append( '</ul></html>' )
	687	f = file(dataset.file_name,'w')
	688	f.write("\n".join( rval ))
	689	f.write('\n')
	690	f.close()
	691
	692	def init_meta( self, dataset, copy_from=None ):
	693	if copy_from:
	694	dataset.metadata = copy_from.metadata
	695
	696	def set_meta( self, dataset, **kwd ):
	697
	698	"""
	699	NOTE we apply the tabular machinary to the phenodata extracted
	700	from a BioC eSet or affybatch.
	701
	702	"""
	703	Html.set_meta(self, dataset, **kwd)
	704	try:
	705	flist = os.listdir(dataset.extra_files_path)
	706	except:
	707	if verbose:
	708	gal_Log.debug('@@@rexpression set_meta failed - no dataset?')
	709	return False
	710	bn = dataset.metadata.base_name
	711	if not bn:
	712	for f in flist:
	713	n = os.path.splitext(f)[0]
	714	bn = n
	715	dataset.metadata.base_name = bn
	716	if not bn:
	717	bn = '?'
	718	dataset.metadata.base_name = bn
	719	pn = '%s.pheno' % (bn)
	720	pp = os.path.join(dataset.extra_files_path,pn)
	721	dataset.metadata.pheno_path=pp
	722	try:
	723	pf = file(pp,'r').readlines() # read the basename.phenodata in the extra_files_path
	724	except:
	725	pf = None
	726	if pf:
	727	h = pf[0].strip()
	728	h = h.split('\t') # hope is header
	729	h = [escape(x) for x in h]
	730	dataset.metadata.column_names = h
	731	dataset.metadata.columns = len(h)
	732	dataset.peek = ''.join(pf[:5])
	733	else:
	734	dataset.metadata.column_names = []
	735	dataset.metadata.columns = 0
	736	dataset.peek = 'No pheno file found'
	737	if pf and len(pf) > 1:
	738	dataset.metadata.pheCols = self.get_phecols(phenolist=pf)
	739	else:
	740	dataset.metadata.pheCols = [('','No useable phenotypes found',False),]
	741	#self.regenerate_primary_file(dataset)
	742	if not dataset.info:
	743	dataset.info = 'Galaxy Expression datatype object'
	744	if not dataset.blurb:
	745	dataset.blurb = 'R loadable BioC expression object for the Rexpression Galaxy toolkit'
	746	return True
	747
	748	def make_html_table( self, pp='nothing supplied from peek\n'):
	749	"""
	750	Create HTML table, used for displaying peek
	751	"""
	752	out = ['<table cellspacing="0" cellpadding="3">',]
	753	p = pp.split('\n')
	754	try:
	755	# Generate column header
	756	for i,row in enumerate(p):
	757	lrow = row.strip().split('\t')
	758	if i == 0:
	759	orow = ['<th>%s</th>' % escape(x) for x in lrow]
	760	orow.insert(0,'<tr>')
	761	orow.append('</tr>')
	762	else:
	763	orow = ['<td>%s</td>' % escape(x) for x in lrow]
	764	orow.insert(0,'<tr>')
	765	orow.append('</tr>')
	766	out.append(''.join(orow))
	767	out.append( '</table>' )
	768	out = "\n".join( out )
	769	except Exception, exc:
	770	out = "Can't create html table %s" % str( exc )
	771	return out
	772
	773	def display_peek( self, dataset ):
	774	"""
	775	Returns formatted html of peek
	776	"""
	777	out=self.make_html_table(dataset.peek)
	778	return out
	779
	780	def get_mime(self):
	781	"""
	782	Returns the mime type of the datatype
	783	"""
	784	return 'text/html'
	785
	786
	787	class Affybatch( RexpBase ):
	788	"""
	789	derived class for BioC data structures in Galaxy
	790	"""
	791
	792	file_ext = "affybatch"
	793
	794	def __init__( self, **kwd ):
	795	RexpBase.__init__(self, **kwd)
	796	self.add_composite_file( '%s.affybatch', description = 'AffyBatch R object saved to file',
	797	substitute_name_with_metadata = 'base_name', is_binary=True )
	798
	799	class Eset( RexpBase ):
	800	"""
	801	derived class for BioC data structures in Galaxy
	802	"""
	803	file_ext = "eset"
	804
	805	def __init__( self, **kwd ):
	806	RexpBase.__init__(self, **kwd)
	807	self.add_composite_file( '%s.eset', description = 'ESet R object saved to file',
	808	substitute_name_with_metadata = 'base_name', is_binary = True )
	809
	810
	811	class MAlist( RexpBase ):
	812	"""
	813	derived class for BioC data structures in Galaxy
	814	"""
	815	file_ext = "malist"
	816
	817	def __init__( self, **kwd ):
	818	RexpBase.__init__(self, **kwd)
	819	self.add_composite_file( '%s.malist', description = 'MAlist R object saved to file',
	820	substitute_name_with_metadata = 'base_name', is_binary = True )
	821
	822
	823	if __name__ == '__main__':
	824	import doctest, sys
	825	doctest.testmod(sys.modules[__name__])
	826

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/lib/galaxy/datatypes/genetics.py @ 2

異なるフォーマットでダウンロード: