Context Navigation

genetics.py

リビジョン 2, 32.0 KB (コミッタ: hatakeyama, 15 年前)
import galaxy-central

行番号
1	"""
2	rgenetics datatypes
3	Use at your peril
4	Ross Lazarus
5	for the rgenetics and galaxy projects
6
7	genome graphs datatypes derived from Interval datatypes
8	genome graphs datasets have a header row with appropriate columnames
9	The first column is always the marker - eg columname = rs, first row= rs12345 if the rows are snps
10	subsequent row values are all numeric ! Will fail if any non numeric (eg '+' or 'NA') values
11	ross lazarus for rgenetics
12	august 20 2007
13	"""
14
15	import logging, os, sys, time, tempfile, shutil, string, glob
16	import data
17	from galaxy import util
18	from cgi import escape
19	import urllib, binascii
20	from galaxy.web import url_for
21	from galaxy.datatypes import metadata
22	from galaxy.datatypes.metadata import MetadataElement
23	from galaxy.datatypes.data import Text
24	from galaxy.datatypes.tabular import Tabular
25	from galaxy.datatypes.images import Html
26	from galaxy.datatypes.interval import Interval
27	from galaxy.util.hash_util import *
28
# Set to True to have the set_meta methods below emit debug messages.
verbose = False
# Logger named after this module; used by the datatypes defined below.
gal_Log = logging.getLogger(__name__)
31
class GenomeGraphs( Tabular ):
    """
    Tab delimited data containing a marker id and any number of numeric values.

    The first row is a header of column names, the first column holds the
    marker id and every other value in the data rows must be numeric.
    """

    MetadataElement( name="markerCol", default=1, desc="Marker ID column", param=metadata.ColumnParameter )
    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True )
    MetadataElement( name="column_types", default=[], desc="Column types", readonly=True, visible=False )
    file_ext = 'gg'

    def __init__(self, **kwd):
        """
        Initialize gg datatype, by adding UCSC display apps
        """
        Tabular.__init__(self, **kwd)
        self.add_display_app ( 'ucsc', 'Genome Graph', 'as_ucsc_display_file', 'ucsc_links' )

    def set_meta(self, dataset, **kwd):
        """
        Set column metadata from the header row of the dataset file.

        The marker column is always column 1 and is typed 'string'; every
        other column is typed 'numeric'. Returns True.
        """
        Tabular.set_meta( self, dataset, **kwd )
        dataset.metadata.markerCol = 1
        # Only the header line is needed - do not slurp the whole file.
        infile = open(dataset.file_name, 'r')
        try:
            first_line = infile.readline()
        finally:
            infile.close()
        if first_line:
            header = first_line.strip().split('\t')
        else:
            header = [] # empty file: no columns rather than an IndexError
        dataset.metadata.columns = len(header)
        if header:
            column_types = ['string'] + ['numeric'] * (len(header) - 1)
        else:
            column_types = []
        dataset.metadata.column_types = column_types
        return True

    def as_ucsc_display_file( self, dataset, **kwd ):
        """
        Returns the dataset file opened for reading; the caller owns the handle
        """
        return open(dataset.file_name, 'r')

    def ucsc_links( self, dataset, type, app, base_url ):
        """
        Build a list of (site_name, link) tuples for the UCSC genome graphs
        display, one for each UCSC site of the dataset build that is enabled
        in app.config.ucsc_display_sites. A missing dbkey is set to hg18.

        from the ever-helpful angie hinrichs angie@soe.ucsc.edu
        a genome graphs call looks like this
        http://genome.ucsc.edu/cgi-bin/hgGenome?clade=mammal&org=Human&db=hg18&hgGenome_dataSetName=dname
        &hgGenome_dataSetDescription=test&hgGenome_formatType=best%20guess&hgGenome_markerType=best%20guess
        &hgGenome_columnLabels=best%20guess&hgGenome_maxVal=&hgGenome_labelVals=
        &hgGenome_maxGapToFill=25000000&hgGenome_uploadFile=http://galaxy.esphealth.org/datasets/333/display/index
        &hgGenome_doSubmitUpload=submit
        Galaxy gives this for an interval file
        http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg18&position=chr1:1-1000&hgt.customText=
        http%3A%2F%2Fgalaxy.esphealth.org%2Fdisplay_as%3Fid%3D339%26display_app%3Ducsc
        """
        ret_val = []
        if not dataset.dbkey:
            dataset.dbkey = 'hg18' # punt!
        if dataset.has_data():
            for site_name, site_url in util.get_ucsc_by_build(dataset.dbkey):
                if site_name in app.config.ucsc_display_sites:
                    site_url = site_url.replace('/hgTracks?','/hgGenome?') # for genome graphs
                    internal_url = "%s" % url_for( controller='dataset',
                        dataset_id=dataset.id, action='display_at', filename='ucsc_' + site_name )
                    display_url = "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at" % (base_url, url_for( controller='root' ), dataset.id, type)
                    display_url = urllib.quote_plus( display_url )
                    sl = ["%sdb=%s" % (site_url,dataset.dbkey ),]
                    sl.append("&hgGenome_dataSetName=%s&hgGenome_dataSetDescription=%s" % (dataset.name, 'GalaxyGG_data'))
                    sl.append("&hgGenome_formatType=best guess&hgGenome_markerType=best guess")
                    sl.append("&hgGenome_columnLabels=first row&hgGenome_maxVal=&hgGenome_labelVals=")
                    sl.append("&hgGenome_doSubmitUpload=submit")
                    sl.append("&hgGenome_maxGapToFill=25000000&hgGenome_uploadFile=%s" % display_url)
                    # the whole genome graphs request is quoted so it can travel as one parameter
                    s = urllib.quote_plus(''.join(sl))
                    redirect_url = s
                    # use this module's own logger rather than a name that is not defined in this file
                    gal_Log.debug('## rg gg ucsc rdurl=%s; s = %s' % (redirect_url,s))
                    link = '%s?redirect_url=%s&display_url=%s' % ( internal_url, redirect_url, display_url )
                    ret_val.append( (site_name, link) )
        return ret_val

    def make_html_table( self, dataset, skipchars=[] ):
        """
        Create HTML table, used for displaying peek.

        Shows the first npeek lines of the file. The first line is rendered
        as a numbered header row unless every value after the marker column
        parses as a number. skipchars is accepted but not used here (and
        never mutated).
        """
        npeek = 5
        infile = open(dataset.file_name, 'r')
        try:
            d = []
            for line in infile: # read only as many lines as are shown
                d.append(line)
                if len(d) >= npeek:
                    break
        finally:
            infile.close()
        if len(d) == 0:
            return "Cannot find anything to parse in %s" % dataset.name
        # first field is the marker name - a header is assumed unless all the
        # remaining fields of the first line are numeric
        values = d[0].split()[1:]
        hasheader = 1
        if values:
            try:
                for x in values:
                    float(x)
                hasheader = 0
            except ValueError:
                hasheader = 1
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # Generate column header
            out.append( '<tr>' )
            if hasheader:
                for i, name in enumerate( d[0].split() ):
                    # file content is user supplied so escape it for html
                    out.append( '<th>%s.%s</th>' % ( str( i+1 ), escape( name ) ) )
                d.pop(0)
            out.append('</tr>')
            for row in d:
                out.append('<tr>')
                out.append(''.join(['<td>%s</td>' % escape(x) for x in row.split()]))
                out.append('</tr>')
            out.append( '</table>' )
            out = "".join( out )
        except Exception:
            out = "Can't create peek %s" % sys.exc_info()[1]
        return out

    def validate( self, dataset ):
        """
        Validate a gg file - all numeric after header row.

        Returns a list of error strings, one for each data row that has a non
        numeric value. Rows are numbered from 1 starting at the first row
        after the header, and columns from 1 starting at the first column
        after the marker id.
        """
        errors = list()
        infile = open(dataset.file_name, "r")
        try:
            infile.readline() # skip header; harmless on an empty file
            for i,row in enumerate(infile):
                ll = row.strip().split('\t')[1:] # first is alpha feature identifier
                badvals = []
                for j,x in enumerate(ll):
                    try:
                        float(x)
                    except ValueError:
                        badvals.append('col%d:%s' % (j+1,x))
                if len(badvals) > 0:
                    errors.append('row %d, %s' % (i+1, ' '.join(badvals)))
        finally:
            infile.close()
        return errors

    def sniff( self, filename ):
        """
        Determines whether the file is in gg format.

        True when the header has at least two columns, at least one of the
        next three rows has values after the marker column, and all of those
        sampled values parse as numbers.
        """
        infile = open(filename, 'r')
        try:
            headers = infile.readline().split()
            rows = [infile.readline().split()[1:] for x in range(3)] # small sample
        finally:
            infile.close()
        if len(headers) < 2:
            return False
        rows = [row for row in rows if row] # drop missing/blank rows; first col already removed
        if not rows:
            return False
        for row in rows:
            try:
                for x in row:
                    float(x)
            except ValueError:
                return False
        return True

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'application/vnd.ms-excel'
179
180
class rgTabList(Tabular):
    """
    Tabular list of sample ids or feature ids, used as exclusion or
    inclusion lists in the clean tool.
    Subclasses fill in column_names for their own header layout.
    """
    file_ext = "rgTList"

    def __init__(self, **kwd):
        """
        Set up the tabular datatype with no predefined column names
        """
        Tabular.__init__( self, **kwd )
        self.column_names = []

    def make_html_table( self, dataset, skipchars=[] ):
        """
        Build the html table shown as the dataset peek.
        Named columns get an 'index.name' heading, any further columns
        reported by the metadata get a bare index heading. Any failure is
        turned into a "Can't create peek" message.
        """
        table = ['<table cellspacing="0" cellpadding="3">']
        try:
            table.append( '<tr>' )
            table.extend( [ '<th>%s.%s</th>' % ( str( pos + 1 ), label )
                            for pos, label in enumerate( self.column_names ) ] )
            named = len( self.column_names )
            if dataset.metadata.columns - named > 0:
                # columns beyond the named ones only get their number
                table.extend( [ '<th>%s</th>' % str( pos + 1 )
                                for pos in range( named, dataset.metadata.columns ) ] )
            table.append( '</tr>' )
            table.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            table.append( '</table>' )
            html = "".join( table )
        except Exception:
            html = "Can't create peek %s" % sys.exc_info()[1]
        return html

    def get_mime(self):
        """Mime type reported for this datatype"""
        return 'text/html'
221
222
class rgSampleList(rgTabList):
    """
    for sampleid exclusions or inclusions in the clean tool
    output from QC eg excess het, gender error, ibd pair member,eigen outlier,excess mendel errors,...
    since they can be uploaded, should be flexible
    but they are persistent at least
    same infrastructure for expression?
    """
    file_ext = "rgSList"

    def __init__(self, **kwd):
        """
        Initialize samplelist datatype
        """
        rgTabList.__init__( self, **kwd )
        # rgTabList leaves column_names empty, so the names must be assigned
        # as a list - indexing into the empty list raises IndexError
        self.column_names = ['FID', 'IID']
        # this is what Plink wants as at 2009

    def sniff(self, filename):
        """
        True when the first two whitespace separated fields of the first
        line of filename are FID and IID
        """
        infile = open(filename, "r")
        try:
            header = infile.readline().split()
        finally:
            infile.close()
        return header[:2] == ['FID', 'IID']
249
class rgFeatureList( rgTabList ):
    """
    for featureid lists of exclusions or inclusions in the clean tool
    output from QC eg low maf, high missingness, bad hwe in controls, excess mendel errors,...
    featureid subsets on statistical criteria -> specialized display such as gg
    same infrastructure for expression?
    """
    file_ext = "rgFList"

    def __init__(self, **kwd):
        """Initialize featurelist datatype"""
        rgTabList.__init__( self, **kwd )
        # rgTabList leaves column_names empty, so assign the whole list -
        # item assignment into the empty list raises IndexError
        self.column_names = ['#FeatureId', 'Chr', 'Genpos', 'Mappos']
264
265
class Rgenetics(Html):
    """
    base class to use for rgenetics datatypes
    derived from html - composite datatype elements
    stored in extra files path
    """

    MetadataElement( name="base_name", desc="base name for all transformed versions of this genetic dataset", default='RgeneticsData',
        readonly=True, set_in_upload=True)

    composite_type = 'auto_primary_file'
    allow_datatype_change = False
    file_ext = 'rgenetics'

    def generate_primary_file( self, dataset = None ):
        """
        Return the html text of a primary file that lists every composite
        file of this datatype, with its description and an '(optional)'
        marker where they apply
        """
        rval = ['<html><head><title>Rgenetics Galaxy Composite Dataset </title></head><p/>']
        rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
        for composite_name, composite_file in self.get_composite_files( dataset = dataset ).iteritems():
            fn = composite_name
            opt_text = ''
            if composite_file.optional:
                opt_text = ' (optional)'
            if composite_file.get('description'):
                rval.append( '<li><a href="%s" type="application/binary">%s (%s)</a>%s</li>' % ( fn, fn, composite_file.get('description'), opt_text ) )
            else:
                rval.append( '<li><a href="%s" type="application/binary">%s</a>%s</li>' % ( fn, fn, opt_text ) )
        rval.append( '</ul></div></html>' )
        return "\n".join( rval )

    def regenerate_primary_file(self,dataset):
        """
        Rewrite the primary html file so it links to every file found in
        the dataset extra files path.
        cannot do this until we are setting metadata
        """
        flist = os.listdir(dataset.extra_files_path)
        rval = ['<html><head><title>Files for Composite Dataset %s</title></head><body><p/>Composite %s contains:<p/><ul>' % (dataset.name,dataset.name)]
        for fname in flist:
            sfname = os.path.split(fname)[-1]
            rval.append( '<li><a href="%s">%s</a></li>' % ( sfname, sfname) )
        rval.append( '</ul></body></html>' )
        f = open(dataset.file_name,'w')
        try:
            f.write("\n".join( rval ))
            f.write('\n')
        finally:
            f.close() # close even when a write fails

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/html'

    def set_meta( self, dataset, **kwd ):
        """
        for lped/pbed eg

        Regenerates the primary file from the content of the extra files
        path and fills in default info and blurb text.
        Returns True on success or when called with overwrite=False, and
        False when the extra files path is missing, unreadable or empty.
        """
        Html.set_meta( self, dataset, **kwd )
        # only an explicit overwrite=False skips the work; a missing key gives None
        if kwd.get('overwrite') == False:
            if verbose:
                gal_Log.debug('@@@ rgenetics set_meta called with overwrite = False')
            return True
        try:
            efp = dataset.extra_files_path
            flist = os.listdir(efp)
        except Exception:
            if verbose:
                gal_Log.debug('@@@rgenetics set_meta failed %s - dataset %s has no efp ?' % (sys.exc_info()[0], dataset.name))
            return False
        if len(flist) == 0:
            if verbose:
                gal_Log.debug('@@@rgenetics set_meta failed - %s efp %s is empty?' % (dataset.name,efp))
            return False
        self.regenerate_primary_file(dataset)
        if not dataset.info:
            dataset.info = 'Galaxy genotype datatype object'
        if not dataset.blurb:
            dataset.blurb = 'Composite file - Rgenetics Galaxy toolkit'
        return True
350
351
352
class SNPMatrix(Rgenetics):
    """
    BioC SNPMatrix Rgenetics data collections
    """
    file_ext="snpmatrix"

    def set_peek( self, dataset, **kwd ):
        """
        Set a fixed peek and a size blurb, or purge notices when the
        underlying dataset has been purged
        """
        if not dataset.dataset.purged:
            dataset.peek = "Binary RGenetics file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self,filename):
        """
        need to check the file header hex code

        NOTE(review): no header signature is defined in this module, so
        there is nothing to compare against and this never claims a file.
        The previous body could only raise (undefined name, invalid open
        mode) and its comparison could never have returned True either.
        """
        return False
377
378
class Lped(Rgenetics):
    """
    Rgenetics composite holding linkage pedigree data: a .ped file and a .map file
    """
    file_ext="lped"

    def __init__( self, **kwd ):
        """Register the text .ped and .map members, both named from base_name"""
        Rgenetics.__init__(self, **kwd)
        for pattern, label in ( ( '%s.ped', 'Pedigree File' ), ( '%s.map', 'Map File' ) ):
            self.add_composite_file( pattern, description = label,
                substitute_name_with_metadata = 'base_name', is_binary = False )
389
390
class Pphe(Rgenetics):
    """
    Rgenetics composite for a Plink phenotype file - header must have FID\tIID...
    """
    file_ext="pphe"

    def __init__( self, **kwd ):
        """Register the single text .pphe member, named from base_name"""
        Rgenetics.__init__(self, **kwd)
        options = dict( description = 'Plink Phenotype File',
            substitute_name_with_metadata = 'base_name', is_binary = False )
        self.add_composite_file( '%s.pphe', **options )
400
401
402
403
class Fphe(Rgenetics):
    """
    Rgenetics composite for an fbat pedigree file - mad format with ! as
    first char on header row
    """
    file_ext="fphe"

    def __init__( self, **kwd ):
        """Register the single .fphe member, named from base_name"""
        Rgenetics.__init__(self, **kwd)
        options = dict( description = 'FBAT Phenotype File',
            substitute_name_with_metadata = 'base_name' )
        self.add_composite_file( '%s.fphe', **options )
414
class Phe(Rgenetics):
    """
    Rgenetics composite for a phenotype file
    """
    file_ext="phe"

    def __init__( self, **kwd ):
        """Register the single text .phe member, named from base_name"""
        Rgenetics.__init__(self, **kwd)
        options = dict( description = 'Phenotype File',
            substitute_name_with_metadata = 'base_name', is_binary = False )
        self.add_composite_file( '%s.phe', **options )
425
426
427
class Fped(Rgenetics):
    """
    Rgenetics composite for FBAT pedigree format - single file, map is
    header row of rs numbers. Strange.
    """
    file_ext="fped"

    def __init__( self, **kwd ):
        """Register the single text .fped member, named from base_name"""
        Rgenetics.__init__(self, **kwd)
        options = dict( description = 'FBAT format pedfile',
            substitute_name_with_metadata = 'base_name', is_binary = False )
        self.add_composite_file( '%s.fped', **options )
439
440
class Pbed(Rgenetics):
    """
    Rgenetics composite for Plink Binary compressed 2bit/geno data:
    .bim, .bed and .fam files
    """
    file_ext="pbed"

    def __init__( self, **kwd ):
        """Register the .bim, .bed and .fam members; only the .bed is binary"""
        Rgenetics.__init__(self, **kwd)
        members = ( ( '%s.bim', False ), ( '%s.bed', True ), ( '%s.fam', False ) )
        for pattern, binary in members:
            self.add_composite_file( pattern, substitute_name_with_metadata = 'base_name', is_binary = binary )
452
class ldIndep(Rgenetics):
    """
    LD (a good measure of redundancy of information) depleted Plink Binary compressed 2bit/geno
    This is really a plink binary, but some tools work better with less redundancy so are
    constrained to these files
    """
    file_ext="ldreduced"

    def __init__( self, **kwd ):
        """Register the .bim, .bed and .fam members; only the .bed is binary"""
        Rgenetics.__init__(self, **kwd)
        members = ( ( '%s.bim', False ), ( '%s.bed', True ), ( '%s.fam', False ) )
        for pattern, binary in members:
            self.add_composite_file( pattern, substitute_name_with_metadata = 'base_name', is_binary = binary )
466
467
class Eigenstratgeno(Rgenetics):
    """
    Rgenetics composite in Eigenstrat format - may be able to get rid of this
    if we move to shellfish
    """
    file_ext="eigenstratgeno"

    def __init__( self, **kwd ):
        """Register the text .eigenstratgeno, .ind and .map members"""
        Rgenetics.__init__(self, **kwd)
        for pattern in ( '%s.eigenstratgeno', '%s.ind', '%s.map' ):
            self.add_composite_file( pattern, substitute_name_with_metadata = 'base_name', is_binary = False )
481
482
483
class Eigenstratpca(Rgenetics):
    """
    Rgenetics composite for an Eigenstrat PCA file for case control adjustment
    """
    file_ext="eigenstratpca"

    def __init__( self, **kwd ):
        """Register the single .eigenstratpca member, named from base_name"""
        Rgenetics.__init__(self, **kwd)
        options = dict( description = 'Eigenstrat PCA file',
            substitute_name_with_metadata = 'base_name' )
        self.add_composite_file( '%s.eigenstratpca', **options )
494
495
class Snptest(Rgenetics):
    """
    Rgenetics composite for BioC snptest data; adds only its own
    file extension to the Rgenetics base class
    """
    file_ext = "snptest"
501
502
class Pheno(Tabular):
    """
    Tabular base class for pheno files; adds only its own file extension
    """
    file_ext = "pheno"
508
509
class RexpBase( Html ):
    """
    base class for BioC data structures in Galaxy
    must be constructed with the pheno data in place since that
    goes into the metadata for each instance
    """
    MetadataElement( name="columns", default=0, desc="Number of columns", visible=True )
    MetadataElement( name="column_names", default=[], desc="Column names", visible=True )
    MetadataElement(name="pheCols",default=[],desc="Select list for potentially interesting variables",visible=True)
    MetadataElement( name="base_name",
        desc="base name for all transformed versions of this expression dataset", default='rexpression', set_in_upload=True)
    MetadataElement( name="pheno_path", desc="Path to phenotype data for this experiment", default="rexpression.pheno", visible=True)
    file_ext = 'rexpbase'
    html_table = None
    is_binary = True
    composite_type = 'auto_primary_file'
    allow_datatype_change = False

    def __init__( self, **kwd ):
        """Register the text .pheno member that every subclass shares"""
        Html.__init__(self,**kwd)
        self.add_composite_file( '%s.pheno', description = 'Phenodata tab text file',
            substitute_name_with_metadata = 'base_name', is_binary=False)

    def generate_primary_file( self, dataset = None ):
        """
        This is called only at upload to write the html file
        cannot rename the datasets here - they come with the default unfortunately
        """
        return '<html><head></head><body>AutoGenerated Primary File for Composite Dataset</body></html>'

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/html'

    def get_phecols(self, phenolist=[], maxConc=20):
        """
        sept 2009: cannot use whitespace to split - make a more complex structure here
        and adjust the methods that rely on this structure
        return interesting phenotype column names for an rexpression eset or affybatch
        to use in array subsetting and so on. Returns a data structure for a
        dynamic Galaxy select parameter.
        A column with only 1 value doesn't change, so is not interesting for
        analysis. A column with a different value in every row is equivalent to a unique
        identifier so is also not interesting for anova or limma analysis - both these
        are removed after the concordance (count of unique terms) is constructed for each
        column. Then a complication - each remaining pair of columns is tested for
        redundancy - if two columns are always paired, then only one is needed :)

        phenolist is a list of tab delimited lines, header first; it is only
        read, never changed. maxConc is the upper bound on the number of
        distinct values an interesting column may have.
        Returns a list of (column name, [(code, count), ...]) tuples, or a
        single placeholder entry when no column qualifies - including when
        phenolist is empty or starts with a blank line.
        """
        # defaults so that empty input falls through to the placeholder result
        head = []
        totcols = 0
        concordance = []
        for nrows,row in enumerate(phenolist): # construct concordance
            if len(row.strip()) == 0:
                break
            row = row.strip().split('\t')
            if nrows == 0: # set up from header
                head = row
                totcols = len(row)
                concordance = [{} for x in head] # list of dicts
            else:
                for col,code in enumerate(row): # keep column order correct
                    if col >= totcols:
                        gal_Log.warning('### get_phecols error in pheno file - row %d col %d (%s) longer than header %s' % (nrows, col, row, head))
                    else:
                        concordance[col][code] = concordance[col].get(code,0) + 1
        useCols = []
        useConc = [] # columns of interest to keep
        nrows = len(phenolist) - 1 # drop head from count
        for c,conc in enumerate(concordance): # c is column number
            if (len(conc) > 1) and (len(conc) < min(nrows,maxConc)): # not all same and not all different!!
                useConc.append(conc) # keep concordance
                useCols.append(c) # keep column
        nuse = len(useCols)
        # now to check for pairs of concordant columns - drop one of these.
        delme = []
        plist = [x.strip().split('\t') for x in phenolist[1:]] # list of lists, header dropped
        phe = [[x[i] for i in useCols] for x in plist if len(x) >= totcols] # strip unused data
        for i in range(0,(nuse-1)): # for each interesting column
            for j in range(i+1,nuse):
                # a tuple key keeps ('a','bc') distinct from ('ab','c')
                pairs = set()
                for row in phe:
                    pairs.add( (row[i],row[j]) )
                if len(pairs) == len(concordance[useCols[j]]): # i and j are always matched
                    delme.append(j)
        delme = list(set(delme)) # remove dupes
        delme.sort()
        delme.reverse() # must delete from far end!
        for i in delme:
            del useConc[i] # get rid of concordance
            del useCols[i] # and usecols entry
        listCol = []
        for i,conc in enumerate(useConc): # these are all unique columns for the design matrix
            ccounts = [(conc.get(code,0),code) for code in conc.keys()] # decorate
            ccounts.sort()
            cc = [(x[1],x[0]) for x in ccounts] # list of code count tuples
            codeDetails = (head[useCols[i]],cc) # ('foo',[('a',3),('b',11),..])
            listCol.append(codeDetails)
        if len(listCol) > 0:
            res = listCol
            # metadata.pheCols becomes [('bar;22,zot;113','foo'), ...]
        else:
            res = [('no usable phenotype columns found',[('?',0),]),]
        return res

    def get_pheno(self,dataset):
        """
        expects a .pheno file in the extra_files_dir - ugh
        note that R is wierd and adds the row.name in
        the header so the columns are all wrong - unless you tell it not to.
        A file can be written as
        write.table(file='foo.pheno',pData(foo),sep='\t',quote=F,row.names=F)

        Returns the lines of dataset.metadata.pheno_path joined with newlines,
        after adding a ChipFileName column name when the header is shorter
        than the first data row.
        NOTE(review): the lines keep their own line ends, so the join doubles
        them - left as is in case a caller depends on it; confirm.
        """
        infile = open(dataset.metadata.pheno_path,'r')
        try:
            p = infile.readlines()
        finally:
            infile.close()
        if len(p) > 1: # need a data row to compare with the header
            head = p[0].strip().split('\t')
            line1 = p[1].strip().split('\t')
            if len(head) < len(line1):
                head.insert(0,'ChipFileName') # fix R write.table b0rken-ness
                p[0] = '\t'.join(head)
        return '\n'.join(p)

    def _peek_pheno(self, dataset):
        """
        Return the first five lines of the dataset .pheno file as one string,
        or a '##failed to find' marker when the file cannot be read
        """
        pp = os.path.join(dataset.extra_files_path,'%s.pheno' % dataset.metadata.base_name)
        try:
            infile = open(pp,'r')
        except (IOError, OSError):
            return '##failed to find %s' % pp
        try:
            lines = []
            for line in infile: # stop after the lines that are shown
                lines.append(line)
                if len(lines) >= 5:
                    break
        finally:
            infile.close()
        return ''.join(lines)

    def set_peek( self, dataset, **kwd ):
        """
        expects a .pheno file in the extra_files_dir - ugh
        note that R is wierd and does not include the row.name in
        the header. why?"""
        if not dataset.dataset.purged:
            dataset.peek = self._peek_pheno(dataset)
            dataset.blurb = 'Galaxy Rexpression composite file'
        else:
            dataset.peek = 'file does not exist\n'
            dataset.blurb = 'file purged from disk'

    def get_peek( self, dataset ):
        """
        expects a .pheno file in the extra_files_dir - ugh
        """
        return self._peek_pheno(dataset)

    def get_file_peek(self,filename):
        """
        can't really peek at a filename - need the extra_files_path and such?

        Returns the first five lines of filename, or a 'no file found'
        message when it cannot be read.
        """
        # a list, so the slice below keeps the whole message and not 5 characters of it
        h = ['## rexpression get_file_peek: no file found']
        try:
            infile = open(filename,'r')
            try:
                h = infile.readlines()
            finally:
                infile.close()
        except (IOError, OSError):
            pass # best effort: fall back to the message above
        return ''.join(h[:5])

    def regenerate_primary_file(self,dataset):
        """
        Rewrite the primary html file so it links to every file found in
        the dataset extra files path.
        cannot do this until we are setting metadata
        """
        bn = dataset.metadata.base_name
        flist = os.listdir(dataset.extra_files_path)
        rval = ['<html><head><title>Files for Composite Dataset %s</title></head><p/>Comprises the following files:<p/><ul>' % (bn)]
        for fname in flist:
            sfname = os.path.split(fname)[-1]
            rval.append( '<li><a href="%s">%s</a>' % ( sfname, sfname ) )
        rval.append( '</ul></html>' )
        f = open(dataset.file_name,'w')
        try:
            f.write("\n".join( rval ))
            f.write('\n')
        finally:
            f.close() # close even when a write fails

    def init_meta( self, dataset, copy_from=None ):
        """Take over the metadata of copy_from when one is given"""
        if copy_from:
            dataset.metadata = copy_from.metadata

    def set_meta( self, dataset, **kwd ):
        """
        NOTE we apply the tabular machinary to the phenodata extracted
        from a BioC eSet or affybatch.

        Sets base_name, pheno_path, column_names, columns, pheCols and the
        peek from the .pheno file in the extra files path. Returns False when
        the extra files path cannot be listed, True otherwise.
        """
        Html.set_meta(self, dataset, **kwd)
        try:
            flist = os.listdir(dataset.extra_files_path)
        except Exception:
            if verbose:
                gal_Log.debug('@@@rexpression set_meta failed - no dataset?')
            return False
        bn = dataset.metadata.base_name
        if not bn:
            # no base name yet: the last file listed supplies it
            for f in flist:
                n = os.path.splitext(f)[0]
                bn = n
                dataset.metadata.base_name = bn
        if not bn:
            bn = '?'
            dataset.metadata.base_name = bn
        pn = '%s.pheno' % (bn)
        pp = os.path.join(dataset.extra_files_path,pn)
        dataset.metadata.pheno_path=pp
        try:
            infile = open(pp,'r') # read the basename.phenodata in the extra_files_path
            try:
                pf = infile.readlines()
            finally:
                infile.close()
        except (IOError, OSError):
            pf = None
        if pf:
            h = pf[0].strip()
            h = h.split('\t') # hope is header
            h = [escape(x) for x in h]
            dataset.metadata.column_names = h
            dataset.metadata.columns = len(h)
            dataset.peek = ''.join(pf[:5])
        else:
            dataset.metadata.column_names = []
            dataset.metadata.columns = 0
            dataset.peek = 'No pheno file found'
        if pf and len(pf) > 1:
            dataset.metadata.pheCols = self.get_phecols(phenolist=pf)
        else:
            # NOTE(review): this entry has a different shape from the
            # placeholder get_phecols returns - confirm what readers expect
            dataset.metadata.pheCols = [('','No useable phenotypes found',False),]
        if not dataset.info:
            dataset.info = 'Galaxy Expression datatype object'
        if not dataset.blurb:
            dataset.blurb = 'R loadable BioC expression object for the Rexpression Galaxy toolkit'
        return True

    def make_html_table( self, pp='nothing supplied from peek\n'):
        """
        Create HTML table, used for displaying peek.
        pp is the peek text: newline separated rows of tab separated values,
        the first row being rendered as the heading.
        """
        out = ['<table cellspacing="0" cellpadding="3">',]
        p = pp.split('\n')
        try:
            for i,row in enumerate(p):
                lrow = row.strip().split('\t')
                if i == 0: # column header
                    orow = ['<th>%s</th>' % escape(x) for x in lrow]
                else:
                    orow = ['<td>%s</td>' % escape(x) for x in lrow]
                orow.insert(0,'<tr>')
                orow.append('</tr>')
                out.append(''.join(orow))
            out.append( '</table>' )
            out = "\n".join( out )
        except Exception:
            out = "Can't create html table %s" % str( sys.exc_info()[1] )
        return out

    def display_peek( self, dataset ):
        """
        Returns formatted html of peek
        """
        return self.make_html_table(dataset.peek)
785
786
class Affybatch( RexpBase ):
    """
    RexpBase composite that adds a binary .affybatch member
    """

    file_ext = "affybatch"

    def __init__( self, **kwd ):
        """Register the binary .affybatch member, named from base_name"""
        RexpBase.__init__(self, **kwd)
        options = dict( description = 'AffyBatch R object saved to file',
            substitute_name_with_metadata = 'base_name', is_binary = True )
        self.add_composite_file( '%s.affybatch', **options )
798
class Eset( RexpBase ):
    """
    RexpBase composite that adds a binary .eset member
    """
    file_ext = "eset"

    def __init__( self, **kwd ):
        """Register the binary .eset member, named from base_name"""
        RexpBase.__init__(self, **kwd)
        options = dict( description = 'ESet R object saved to file',
            substitute_name_with_metadata = 'base_name', is_binary = True )
        self.add_composite_file( '%s.eset', **options )
809
810
class MAlist( RexpBase ):
    """
    RexpBase composite that adds a binary .malist member
    """
    file_ext = "malist"

    def __init__( self, **kwd ):
        """Register the binary .malist member, named from base_name"""
        RexpBase.__init__(self, **kwd)
        options = dict( description = 'MAlist R object saved to file',
            substitute_name_with_metadata = 'base_name', is_binary = True )
        self.add_composite_file( '%s.malist', **options )
821
822
# Run the doctests of this module when it is executed as a script.
if __name__ == '__main__':
    import doctest
    import sys
    this_module = sys.modules[__name__]
    doctest.testmod(this_module)
826

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/lib/galaxy/datatypes/genetics.py

異なるフォーマットでダウンロード: