1 | """
|
---|
2 | rgenetics datatypes
|
---|
3 | Use at your peril
|
---|
4 | Ross Lazarus
|
---|
5 | for the rgenetics and galaxy projects
|
---|
6 |
|
---|
7 | genome graphs datatypes derived from Interval datatypes
|
---|
8 | genome graphs datasets have a header row with appropriate columnames
|
---|
9 | The first column is always the marker - eg columname = rs, first row= rs12345 if the rows are snps
|
---|
10 | subsequent row values are all numeric ! Will fail if any non numeric (eg '+' or 'NA') values
|
---|
11 | ross lazarus for rgenetics
|
---|
12 | august 20 2007
|
---|
13 | """
|
---|
14 |
|
---|
15 | import logging, os, sys, time, tempfile, shutil, string, glob
|
---|
16 | import data
|
---|
17 | from galaxy import util
|
---|
18 | from cgi import escape
|
---|
19 | import urllib, binascii
|
---|
20 | from galaxy.web import url_for
|
---|
21 | from galaxy.datatypes import metadata
|
---|
22 | from galaxy.datatypes.metadata import MetadataElement
|
---|
23 | from galaxy.datatypes.data import Text
|
---|
24 | from galaxy.datatypes.tabular import Tabular
|
---|
25 | from galaxy.datatypes.images import Html
|
---|
26 | from galaxy.datatypes.interval import Interval
|
---|
27 | from galaxy.util.hash_util import *
|
---|
28 |
|
---|
29 | gal_Log = logging.getLogger(__name__)
|
---|
30 | verbose = False
|
---|
31 |
|
---|
32 | class GenomeGraphs( Tabular ):
|
---|
33 | """
|
---|
34 | Tab delimited data containing a marker id and any number of numeric values
|
---|
35 | """
|
---|
36 |
|
---|
37 | MetadataElement( name="markerCol", default=1, desc="Marker ID column", param=metadata.ColumnParameter )
|
---|
38 | MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True )
|
---|
39 | MetadataElement( name="column_types", default=[], desc="Column types", readonly=True, visible=False )
|
---|
40 | file_ext = 'gg'
|
---|
41 |
|
---|
42 | def __init__(self, **kwd):
|
---|
43 | """
|
---|
44 | Initialize gg datatype, by adding UCSC display apps
|
---|
45 | """
|
---|
46 | Tabular.__init__(self, **kwd)
|
---|
47 | self.add_display_app ( 'ucsc', 'Genome Graph', 'as_ucsc_display_file', 'ucsc_links' )
|
---|
48 |
|
---|
49 |
|
---|
50 | def set_meta(self,dataset,**kwd):
|
---|
51 | Tabular.set_meta( self, dataset, **kwd)
|
---|
52 | dataset.metadata.markerCol = 1
|
---|
53 | header = file(dataset.file_name,'r').readlines()[0].strip().split('\t')
|
---|
54 | dataset.metadata.columns = len(header)
|
---|
55 | t = ['numeric' for x in header]
|
---|
56 | t[0] = 'string'
|
---|
57 | dataset.metadata.column_types = t
|
---|
58 | return True
|
---|
59 |
|
---|
60 | def as_ucsc_display_file( self, dataset, **kwd ):
|
---|
61 | """
|
---|
62 | Returns file
|
---|
63 | """
|
---|
64 | return file(dataset.file_name,'r')
|
---|
65 |
|
---|
66 | def ucsc_links( self, dataset, type, app, base_url ):
|
---|
67 | """
|
---|
68 | from the ever-helpful angie hinrichs angie@soe.ucsc.edu
|
---|
69 | a genome graphs call looks like this
|
---|
70 | http://genome.ucsc.edu/cgi-bin/hgGenome?clade=mammal&org=Human&db=hg18&hgGenome_dataSetName=dname
|
---|
71 | &hgGenome_dataSetDescription=test&hgGenome_formatType=best%20guess&hgGenome_markerType=best%20guess
|
---|
72 | &hgGenome_columnLabels=best%20guess&hgGenome_maxVal=&hgGenome_labelVals=
|
---|
73 | &hgGenome_maxGapToFill=25000000&hgGenome_uploadFile=http://galaxy.esphealth.org/datasets/333/display/index
|
---|
74 | &hgGenome_doSubmitUpload=submit
|
---|
75 | Galaxy gives this for an interval file
|
---|
76 | http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg18&position=chr1:1-1000&hgt.customText=
|
---|
77 | http%3A%2F%2Fgalaxy.esphealth.org%2Fdisplay_as%3Fid%3D339%26display_app%3Ducsc
|
---|
78 | """
|
---|
79 | ret_val = []
|
---|
80 | ggtail = 'hgGenome_doSubmitUpload=submit'
|
---|
81 | if not dataset.dbkey:
|
---|
82 | dataset.dbkey = 'hg18' # punt!
|
---|
83 | if dataset.has_data():
|
---|
84 | for site_name, site_url in util.get_ucsc_by_build(dataset.dbkey):
|
---|
85 | if site_name in app.config.ucsc_display_sites:
|
---|
86 | site_url = site_url.replace('/hgTracks?','/hgGenome?') # for genome graphs
|
---|
87 | internal_url = "%s" % url_for( controller='dataset',
|
---|
88 | dataset_id=dataset.id, action='display_at', filename='ucsc_' + site_name )
|
---|
89 | display_url = "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at" % (base_url, url_for( controller='root' ), dataset.id, type)
|
---|
90 | display_url = urllib.quote_plus( display_url )
|
---|
91 | # was display_url = urllib.quote_plus( "%s/display_as?id=%i&display_app=%s" % (base_url, dataset.id, type) )
|
---|
92 | #redirect_url = urllib.quote_plus( "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s" % (site_url, dataset.dbkey, chrom, start, stop) )
|
---|
93 | sl = ["%sdb=%s" % (site_url,dataset.dbkey ),]
|
---|
94 | #sl.append("&hgt.customText=%s")
|
---|
95 | sl.append("&hgGenome_dataSetName=%s&hgGenome_dataSetDescription=%s" % (dataset.name, 'GalaxyGG_data'))
|
---|
96 | sl.append("&hgGenome_formatType=best guess&hgGenome_markerType=best guess")
|
---|
97 | sl.append("&hgGenome_columnLabels=first row&hgGenome_maxVal=&hgGenome_labelVals=")
|
---|
98 | sl.append("&hgGenome_doSubmitUpload=submit")
|
---|
99 | sl.append("&hgGenome_maxGapToFill=25000000&hgGenome_uploadFile=%s" % display_url)
|
---|
100 | s = ''.join(sl)
|
---|
101 | s = urllib.quote_plus(s)
|
---|
102 | redirect_url = s
|
---|
103 | log.debug('## rg gg ucsc rdurl=%s; s = %s' % (redirect_url,s))
|
---|
104 | link = '%s?redirect_url=%s&display_url=%s' % ( internal_url, redirect_url, display_url )
|
---|
105 | ret_val.append( (site_name, link) )
|
---|
106 | return ret_val
|
---|
107 |
|
---|
108 | def make_html_table( self, dataset, skipchars=[] ):
|
---|
109 | """
|
---|
110 | Create HTML table, used for displaying peek
|
---|
111 | """
|
---|
112 | npeek = 5
|
---|
113 | out = ['<table cellspacing="0" cellpadding="3">']
|
---|
114 | f = open(dataset.file_name,'r')
|
---|
115 | d = f.readlines()[:5]
|
---|
116 | if len(d) == 0:
|
---|
117 | out = "Cannot find anything to parse in %s" % dataset.name
|
---|
118 | return out
|
---|
119 | hasheader = 0
|
---|
120 | try:
|
---|
121 | test = ['%f' % x for x in d[0][1:]] # first is name - see if starts all numerics
|
---|
122 | except:
|
---|
123 | hasheader = 1
|
---|
124 | try:
|
---|
125 | # Generate column header
|
---|
126 | out.append( '<tr>' )
|
---|
127 | if hasheader:
|
---|
128 | for i, name in enumerate(d[0].split() ):
|
---|
129 | out.append( '<th>%s.%s</th>' % ( str( i+1 ), name ) )
|
---|
130 | d.pop(0)
|
---|
131 | out.append('</tr>')
|
---|
132 | for row in d:
|
---|
133 | out.append('<tr>')
|
---|
134 | out.append(''.join(['<td>%s</td>' % x for x in row.split()]))
|
---|
135 | out.append('</tr>')
|
---|
136 | out.append( '</table>' )
|
---|
137 | out = "".join( out )
|
---|
138 | except Exception, exc:
|
---|
139 | out = "Can't create peek %s" % exc
|
---|
140 | return out
|
---|
141 |
|
---|
142 | def validate( self, dataset ):
|
---|
143 | """
|
---|
144 | Validate a gg file - all numeric after header row
|
---|
145 | """
|
---|
146 | errors = list()
|
---|
147 | infile = open(dataset.file_name, "r")
|
---|
148 | header= infile.next() # header
|
---|
149 | for i,row in enumerate(infile):
|
---|
150 | ll = row.strip().split('\t')[1:] # first is alpha feature identifier
|
---|
151 | badvals = []
|
---|
152 | for j,x in enumerate(ll):
|
---|
153 | try:
|
---|
154 | x = float(x)
|
---|
155 | except:
|
---|
156 | badval.append('col%d:%s' % (j+1,x))
|
---|
157 | if len(badvals) > 0:
|
---|
158 | errors.append('row %d, %s' % (' '.join(badvals)))
|
---|
159 | return errors
|
---|
160 |
|
---|
161 | def sniff( self, filename ):
|
---|
162 | """
|
---|
163 | Determines whether the file is in gg format
|
---|
164 | """
|
---|
165 | f = open(filename,'r')
|
---|
166 | headers = f.readline().split()
|
---|
167 | rows = [f.readline().split()[1:] for x in range(3)] # small sample
|
---|
168 | #headers = get_headers( filename, '\t' )
|
---|
169 | for row in rows:
|
---|
170 | try:
|
---|
171 | nums = [float(x) for x in row] # first col has been removed
|
---|
172 | except:
|
---|
173 | return false
|
---|
174 | return true
|
---|
175 |
|
---|
176 | def get_mime(self):
|
---|
177 | """Returns the mime type of the datatype"""
|
---|
178 | return 'application/vnd.ms-excel'
|
---|
179 |
|
---|
180 |
|
---|
181 | class rgTabList(Tabular):
|
---|
182 | """
|
---|
183 | for sampleid and for featureid lists of exclusions or inclusions in the clean tool
|
---|
184 | featureid subsets on statistical criteria -> specialized display such as gg
|
---|
185 | """
|
---|
186 | file_ext = "rgTList"
|
---|
187 |
|
---|
188 |
|
---|
189 | def __init__(self, **kwd):
|
---|
190 | """
|
---|
191 | Initialize featurelistt datatype
|
---|
192 | """
|
---|
193 | Tabular.__init__( self, **kwd )
|
---|
194 | self.column_names = []
|
---|
195 |
|
---|
196 | def make_html_table( self, dataset, skipchars=[] ):
|
---|
197 | """
|
---|
198 | Create HTML table, used for displaying peek
|
---|
199 | """
|
---|
200 | out = ['<table cellspacing="0" cellpadding="3">']
|
---|
201 | comments = []
|
---|
202 | try:
|
---|
203 | # Generate column header
|
---|
204 | out.append( '<tr>' )
|
---|
205 | for i, name in enumerate( self.column_names ):
|
---|
206 | out.append( '<th>%s.%s</th>' % ( str( i+1 ), name ) )
|
---|
207 | if dataset.metadata.columns - len( self.column_names ) > 0:
|
---|
208 | for i in range( len( self.column_names ), dataset.metadata.columns ):
|
---|
209 | out.append( '<th>%s</th>' % str( i+1 ) )
|
---|
210 | out.append( '</tr>' )
|
---|
211 | out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
|
---|
212 | out.append( '</table>' )
|
---|
213 | out = "".join( out )
|
---|
214 | except Exception, exc:
|
---|
215 | out = "Can't create peek %s" % exc
|
---|
216 | return out
|
---|
217 |
|
---|
218 | def get_mime(self):
|
---|
219 | """Returns the mime type of the datatype"""
|
---|
220 | return 'text/html'
|
---|
221 |
|
---|
222 |
|
---|
223 | class rgSampleList(rgTabList):
|
---|
224 | """
|
---|
225 | for sampleid exclusions or inclusions in the clean tool
|
---|
226 | output from QC eg excess het, gender error, ibd pair member,eigen outlier,excess mendel errors,...
|
---|
227 | since they can be uploaded, should be flexible
|
---|
228 | but they are persistent at least
|
---|
229 | same infrastructure for expression?
|
---|
230 | """
|
---|
231 | file_ext = "rgSList"
|
---|
232 |
|
---|
233 | def __init__(self, **kwd):
|
---|
234 | """
|
---|
235 | Initialize samplelist datatype
|
---|
236 | """
|
---|
237 | rgTabList.__init__( self, **kwd )
|
---|
238 | self.column_names[0] = 'FID'
|
---|
239 | self.column_names[1] = 'IID'
|
---|
240 | # this is what Plink wants as at 2009
|
---|
241 |
|
---|
242 | def sniff(self,filename):
|
---|
243 | infile = open(dataset.file_name, "r")
|
---|
244 | header= infile.next() # header
|
---|
245 | if header[0] == 'FID' and header[1] == 'IID':
|
---|
246 | return True
|
---|
247 | else:
|
---|
248 | return False
|
---|
249 |
|
---|
250 | class rgFeatureList( rgTabList ):
|
---|
251 | """
|
---|
252 | for featureid lists of exclusions or inclusions in the clean tool
|
---|
253 | output from QC eg low maf, high missingness, bad hwe in controls, excess mendel errors,...
|
---|
254 | featureid subsets on statistical criteria -> specialized display such as gg
|
---|
255 | same infrastructure for expression?
|
---|
256 | """
|
---|
257 | file_ext = "rgFList"
|
---|
258 |
|
---|
259 | def __init__(self, **kwd):
|
---|
260 | """Initialize featurelist datatype"""
|
---|
261 | rgTabList.__init__( self, **kwd )
|
---|
262 | for i,s in enumerate(['#FeatureId', 'Chr', 'Genpos', 'Mappos']):
|
---|
263 | self.column_names[i] = s
|
---|
264 |
|
---|
265 |
|
---|
266 | class Rgenetics(Html):
|
---|
267 | """
|
---|
268 | base class to use for rgenetics datatypes
|
---|
269 | derived from html - composite datatype elements
|
---|
270 | stored in extra files path
|
---|
271 | """
|
---|
272 |
|
---|
273 | MetadataElement( name="base_name", desc="base name for all transformed versions of this genetic dataset", default='RgeneticsData',
|
---|
274 | readonly=True, set_in_upload=True)
|
---|
275 |
|
---|
276 | composite_type = 'auto_primary_file'
|
---|
277 | allow_datatype_change = False
|
---|
278 | file_ext = 'rgenetics'
|
---|
279 |
|
---|
280 | def generate_primary_file( self, dataset = None ):
|
---|
281 | rval = ['<html><head><title>Rgenetics Galaxy Composite Dataset </title></head><p/>']
|
---|
282 | rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
|
---|
283 | for composite_name, composite_file in self.get_composite_files( dataset = dataset ).iteritems():
|
---|
284 | fn = composite_name
|
---|
285 | opt_text = ''
|
---|
286 | if composite_file.optional:
|
---|
287 | opt_text = ' (optional)'
|
---|
288 | if composite_file.get('description'):
|
---|
289 | rval.append( '<li><a href="%s" type="application/binary">%s (%s)</a>%s</li>' % ( fn, fn, composite_file.get('description'), opt_text ) )
|
---|
290 | else:
|
---|
291 | rval.append( '<li><a href="%s" type="application/binary">%s</a>%s</li>' % ( fn, fn, opt_text ) )
|
---|
292 | rval.append( '</ul></div></html>' )
|
---|
293 | return "\n".join( rval )
|
---|
294 |
|
---|
295 | def regenerate_primary_file(self,dataset):
|
---|
296 | """
|
---|
297 | cannot do this until we are setting metadata
|
---|
298 | """
|
---|
299 | bn = dataset.metadata.base_name
|
---|
300 | efp = dataset.extra_files_path
|
---|
301 | flist = os.listdir(efp)
|
---|
302 | rval = ['<html><head><title>Files for Composite Dataset %s</title></head><body><p/>Composite %s contains:<p/><ul>' % (dataset.name,dataset.name)]
|
---|
303 | for i,fname in enumerate(flist):
|
---|
304 | sfname = os.path.split(fname)[-1]
|
---|
305 | f,e = os.path.splitext(fname)
|
---|
306 | rval.append( '<li><a href="%s">%s</a></li>' % ( sfname, sfname) )
|
---|
307 | rval.append( '</ul></body></html>' )
|
---|
308 | f = file(dataset.file_name,'w')
|
---|
309 | f.write("\n".join( rval ))
|
---|
310 | f.write('\n')
|
---|
311 | f.close()
|
---|
312 |
|
---|
313 | def get_mime(self):
|
---|
314 | """Returns the mime type of the datatype"""
|
---|
315 | return 'text/html'
|
---|
316 |
|
---|
317 |
|
---|
318 | def set_meta( self, dataset, **kwd ):
|
---|
319 |
|
---|
320 | """
|
---|
321 | for lped/pbed eg
|
---|
322 |
|
---|
323 | """
|
---|
324 | Html.set_meta( self, dataset, **kwd )
|
---|
325 | if kwd.get('overwrite') == False:
|
---|
326 | if verbose:
|
---|
327 | gal_Log.debug('@@@ rgenetics set_meta called with overwrite = False')
|
---|
328 | return True
|
---|
329 | try:
|
---|
330 | efp = dataset.extra_files_path
|
---|
331 | except:
|
---|
332 | if verbose:
|
---|
333 | gal_Log.debug('@@@rgenetics set_meta failed %s - dataset %s has no efp ?' % (sys.exc_info()[0], dataset.name))
|
---|
334 | return False
|
---|
335 | try:
|
---|
336 | flist = os.listdir(efp)
|
---|
337 | except:
|
---|
338 | if verbose: gal_Log.debug('@@@rgenetics set_meta failed %s - dataset %s has no efp ?' % (sys.exc_info()[0],dataset.name))
|
---|
339 | return False
|
---|
340 | if len(flist) == 0:
|
---|
341 | if verbose:
|
---|
342 | gal_Log.debug('@@@rgenetics set_meta failed - %s efp %s is empty?' % (dataset.name,efp))
|
---|
343 | return False
|
---|
344 | self.regenerate_primary_file(dataset)
|
---|
345 | if not dataset.info:
|
---|
346 | dataset.info = 'Galaxy genotype datatype object'
|
---|
347 | if not dataset.blurb:
|
---|
348 | dataset.blurb = 'Composite file - Rgenetics Galaxy toolkit'
|
---|
349 | return True
|
---|
350 |
|
---|
351 |
|
---|
352 |
|
---|
353 | class SNPMatrix(Rgenetics):
|
---|
354 | """
|
---|
355 | BioC SNPMatrix Rgenetics data collections
|
---|
356 | """
|
---|
357 | file_ext="snpmatrix"
|
---|
358 |
|
---|
359 | def set_peek( self, dataset, **kwd ):
|
---|
360 | if not dataset.dataset.purged:
|
---|
361 | dataset.peek = "Binary RGenetics file"
|
---|
362 | dataset.blurb = data.nice_size( dataset.get_size() )
|
---|
363 | else:
|
---|
364 | dataset.peek = 'file does not exist'
|
---|
365 | dataset.blurb = 'file purged from disk'
|
---|
366 |
|
---|
367 | def sniff(self,filename):
|
---|
368 | """ need to check the file header hex code
|
---|
369 | """
|
---|
370 | infile = open(dataset.file_name, "b")
|
---|
371 | head = infile.read(16)
|
---|
372 | head = [hex(x) for x in head]
|
---|
373 | if head <> '':
|
---|
374 | return False
|
---|
375 | else:
|
---|
376 | return True
|
---|
377 |
|
---|
378 |
|
---|
379 | class Lped(Rgenetics):
|
---|
380 | """
|
---|
381 | linkage pedigree (ped,map) Rgenetics data collections
|
---|
382 | """
|
---|
383 | file_ext="lped"
|
---|
384 |
|
---|
385 | def __init__( self, **kwd ):
|
---|
386 | Rgenetics.__init__(self, **kwd)
|
---|
387 | self.add_composite_file( '%s.ped', description = 'Pedigree File', substitute_name_with_metadata = 'base_name', is_binary = False )
|
---|
388 | self.add_composite_file( '%s.map', description = 'Map File', substitute_name_with_metadata = 'base_name', is_binary = False )
|
---|
389 |
|
---|
390 |
|
---|
391 | class Pphe(Rgenetics):
|
---|
392 | """
|
---|
393 | Plink phenotype file - header must have FID\tIID... Rgenetics data collections
|
---|
394 | """
|
---|
395 | file_ext="pphe"
|
---|
396 |
|
---|
397 | def __init__( self, **kwd ):
|
---|
398 | Rgenetics.__init__(self, **kwd)
|
---|
399 | self.add_composite_file( '%s.pphe', description = 'Plink Phenotype File', substitute_name_with_metadata = 'base_name', is_binary = False )
|
---|
400 |
|
---|
401 |
|
---|
402 |
|
---|
403 |
|
---|
404 | class Fphe(Rgenetics):
|
---|
405 | """
|
---|
406 | fbat pedigree file - mad format with ! as first char on header row
|
---|
407 | Rgenetics data collections
|
---|
408 | """
|
---|
409 | file_ext="fphe"
|
---|
410 |
|
---|
411 | def __init__( self, **kwd ):
|
---|
412 | Rgenetics.__init__(self, **kwd)
|
---|
413 | self.add_composite_file( '%s.fphe', description = 'FBAT Phenotype File', substitute_name_with_metadata = 'base_name' )
|
---|
414 |
|
---|
415 | class Phe(Rgenetics):
|
---|
416 | """
|
---|
417 | Phenotype file
|
---|
418 | """
|
---|
419 | file_ext="phe"
|
---|
420 |
|
---|
421 | def __init__( self, **kwd ):
|
---|
422 | Rgenetics.__init__(self, **kwd)
|
---|
423 | self.add_composite_file( '%s.phe', description = 'Phenotype File', substitute_name_with_metadata = 'base_name',
|
---|
424 | is_binary = False )
|
---|
425 |
|
---|
426 |
|
---|
427 |
|
---|
428 | class Fped(Rgenetics):
|
---|
429 | """
|
---|
430 | FBAT pedigree format - single file, map is header row of rs numbers. Strange.
|
---|
431 | Rgenetics data collections
|
---|
432 | """
|
---|
433 | file_ext="fped"
|
---|
434 |
|
---|
435 | def __init__( self, **kwd ):
|
---|
436 | Rgenetics.__init__(self, **kwd)
|
---|
437 | self.add_composite_file( '%s.fped', description = 'FBAT format pedfile', substitute_name_with_metadata = 'base_name',
|
---|
438 | is_binary = False )
|
---|
439 |
|
---|
440 |
|
---|
441 | class Pbed(Rgenetics):
|
---|
442 | """
|
---|
443 | Plink Binary compressed 2bit/geno Rgenetics data collections
|
---|
444 | """
|
---|
445 | file_ext="pbed"
|
---|
446 |
|
---|
447 | def __init__( self, **kwd ):
|
---|
448 | Rgenetics.__init__(self, **kwd)
|
---|
449 | self.add_composite_file( '%s.bim', substitute_name_with_metadata = 'base_name', is_binary = False )
|
---|
450 | self.add_composite_file( '%s.bed', substitute_name_with_metadata = 'base_name', is_binary = True )
|
---|
451 | self.add_composite_file( '%s.fam', substitute_name_with_metadata = 'base_name', is_binary = False )
|
---|
452 |
|
---|
453 | class ldIndep(Rgenetics):
|
---|
454 | """
|
---|
455 | LD (a good measure of redundancy of information) depleted Plink Binary compressed 2bit/geno
|
---|
456 | This is really a plink binary, but some tools work better with less redundancy so are constrained to
|
---|
457 | these files
|
---|
458 | """
|
---|
459 | file_ext="ldreduced"
|
---|
460 |
|
---|
461 | def __init__( self, **kwd ):
|
---|
462 | Rgenetics.__init__(self, **kwd)
|
---|
463 | self.add_composite_file( '%s.bim', substitute_name_with_metadata = 'base_name', is_binary = False )
|
---|
464 | self.add_composite_file( '%s.bed', substitute_name_with_metadata = 'base_name', is_binary = True )
|
---|
465 | self.add_composite_file( '%s.fam', substitute_name_with_metadata = 'base_name', is_binary = False )
|
---|
466 |
|
---|
467 |
|
---|
468 | class Eigenstratgeno(Rgenetics):
|
---|
469 | """
|
---|
470 | Eigenstrat format - may be able to get rid of this
|
---|
471 | if we move to shellfish
|
---|
472 | Rgenetics data collections
|
---|
473 | """
|
---|
474 | file_ext="eigenstratgeno"
|
---|
475 |
|
---|
476 | def __init__( self, **kwd ):
|
---|
477 | Rgenetics.__init__(self, **kwd)
|
---|
478 | self.add_composite_file( '%s.eigenstratgeno', substitute_name_with_metadata = 'base_name', is_binary = False )
|
---|
479 | self.add_composite_file( '%s.ind', substitute_name_with_metadata = 'base_name', is_binary = False )
|
---|
480 | self.add_composite_file( '%s.map', substitute_name_with_metadata = 'base_name', is_binary = False )
|
---|
481 |
|
---|
482 |
|
---|
483 |
|
---|
484 | class Eigenstratpca(Rgenetics):
|
---|
485 | """
|
---|
486 | Eigenstrat PCA file for case control adjustment
|
---|
487 | Rgenetics data collections
|
---|
488 | """
|
---|
489 | file_ext="eigenstratpca"
|
---|
490 |
|
---|
491 | def __init__( self, **kwd ):
|
---|
492 | Rgenetics.__init__(self, **kwd)
|
---|
493 | self.add_composite_file( '%s.eigenstratpca', description = 'Eigenstrat PCA file', substitute_name_with_metadata = 'base_name' )
|
---|
494 |
|
---|
495 |
|
---|
496 | class Snptest(Rgenetics):
|
---|
497 | """
|
---|
498 | BioC snptest Rgenetics data collections
|
---|
499 | """
|
---|
500 | file_ext="snptest"
|
---|
501 |
|
---|
502 |
|
---|
503 | class Pheno(Tabular):
|
---|
504 | """
|
---|
505 | base class for pheno files
|
---|
506 | """
|
---|
507 | file_ext = 'pheno'
|
---|
508 |
|
---|
509 |
|
---|
510 | class RexpBase( Html ):
|
---|
511 | """
|
---|
512 | base class for BioC data structures in Galaxy
|
---|
513 | must be constructed with the pheno data in place since that
|
---|
514 | goes into the metadata for each instance
|
---|
515 | """
|
---|
516 | MetadataElement( name="columns", default=0, desc="Number of columns", visible=True )
|
---|
517 | MetadataElement( name="column_names", default=[], desc="Column names", visible=True )
|
---|
518 | MetadataElement(name="pheCols",default=[],desc="Select list for potentially interesting variables",visible=True)
|
---|
519 | MetadataElement( name="base_name",
|
---|
520 | desc="base name for all transformed versions of this expression dataset", default='rexpression', set_in_upload=True)
|
---|
521 | MetadataElement( name="pheno_path", desc="Path to phenotype data for this experiment", default="rexpression.pheno", visible=True)
|
---|
522 | file_ext = 'rexpbase'
|
---|
523 | html_table = None
|
---|
524 | is_binary = True
|
---|
525 | composite_type = 'auto_primary_file'
|
---|
526 | allow_datatype_change = False
|
---|
527 |
|
---|
528 |
|
---|
529 | def __init__( self, **kwd ):
|
---|
530 | Html.__init__(self,**kwd)
|
---|
531 | self.add_composite_file( '%s.pheno', description = 'Phenodata tab text file',
|
---|
532 | substitute_name_with_metadata = 'base_name', is_binary=False)
|
---|
533 |
|
---|
534 | def generate_primary_file( self, dataset = None ):
|
---|
535 | """
|
---|
536 | This is called only at upload to write the html file
|
---|
537 | cannot rename the datasets here - they come with the default unfortunately
|
---|
538 | """
|
---|
539 | return '<html><head></head><body>AutoGenerated Primary File for Composite Dataset</body></html>'
|
---|
540 |
|
---|
541 | def get_mime(self):
|
---|
542 | """Returns the mime type of the datatype"""
|
---|
543 | return 'text/html'
|
---|
544 |
|
---|
545 | def get_phecols(self, phenolist=[], maxConc=20):
|
---|
546 | """
|
---|
547 | sept 2009: cannot use whitespace to split - make a more complex structure here
|
---|
548 | and adjust the methods that rely on this structure
|
---|
549 | return interesting phenotype column names for an rexpression eset or affybatch
|
---|
550 | to use in array subsetting and so on. Returns a data structure for a
|
---|
551 | dynamic Galaxy select parameter.
|
---|
552 | A column with only 1 value doesn't change, so is not interesting for
|
---|
553 | analysis. A column with a different value in every row is equivalent to a unique
|
---|
554 | identifier so is also not interesting for anova or limma analysis - both these
|
---|
555 | are removed after the concordance (count of unique terms) is constructed for each
|
---|
556 | column. Then a complication - each remaining pair of columns is tested for
|
---|
557 | redundancy - if two columns are always paired, then only one is needed :)
|
---|
558 | """
|
---|
559 | for nrows,row in enumerate(phenolist): # construct concordance
|
---|
560 | if len(row.strip()) == 0:
|
---|
561 | break
|
---|
562 | row = row.strip().split('\t')
|
---|
563 | if nrows == 0: # set up from header
|
---|
564 | head = row
|
---|
565 | totcols = len(row)
|
---|
566 | concordance = [{} for x in head] # list of dicts
|
---|
567 | else:
|
---|
568 | for col,code in enumerate(row): # keep column order correct
|
---|
569 | if col >= totcols:
|
---|
570 | gal_Log.warning('### get_phecols error in pheno file - row %d col %d (%s) longer than header %s' % (nrows, col, row, head))
|
---|
571 | else:
|
---|
572 | concordance[col].setdefault(code,0) # first one is zero
|
---|
573 | concordance[col][code] += 1
|
---|
574 | useCols = []
|
---|
575 | useConc = [] # columns of interest to keep
|
---|
576 | nrows = len(phenolist)
|
---|
577 | nrows -= 1 # drop head from count
|
---|
578 | for c,conc in enumerate(concordance): # c is column number
|
---|
579 | if (len(conc) > 1) and (len(conc) < min(nrows,maxConc)): # not all same and not all different!!
|
---|
580 | useConc.append(conc) # keep concordance
|
---|
581 | useCols.append(c) # keep column
|
---|
582 | nuse = len(useCols)
|
---|
583 | # now to check for pairs of concordant columns - drop one of these.
|
---|
584 | delme = []
|
---|
585 | p = phenolist[1:] # drop header
|
---|
586 | plist = [x.strip().split('\t') for x in p] # list of lists
|
---|
587 | phe = [[x[i] for i in useCols] for x in plist if len(x) >= totcols] # strip unused data
|
---|
588 | for i in range(0,(nuse-1)): # for each interesting column
|
---|
589 | for j in range(i+1,nuse):
|
---|
590 | kdict = {}
|
---|
591 | for row in phe: # row is a list of lists
|
---|
592 | k = '%s%s' % (row[i],row[j]) # composite key
|
---|
593 | kdict[k] = k
|
---|
594 | if (len(kdict.keys()) == len(concordance[useCols[j]])): # i and j are always matched
|
---|
595 | delme.append(j)
|
---|
596 | delme = list(set(delme)) # remove dupes
|
---|
597 | listCol = []
|
---|
598 | delme.sort()
|
---|
599 | delme.reverse() # must delete from far end!
|
---|
600 | for i in delme:
|
---|
601 | del useConc[i] # get rid of concordance
|
---|
602 | del useCols[i] # and usecols entry
|
---|
603 | for i,conc in enumerate(useConc): # these are all unique columns for the design matrix
|
---|
604 | ccounts = [(conc.get(code,0),code) for code in conc.keys()] # decorate
|
---|
605 | ccounts.sort()
|
---|
606 | cc = [(x[1],x[0]) for x in ccounts] # list of code count tuples
|
---|
607 | codeDetails = (head[useCols[i]],cc) # ('foo',[('a',3),('b',11),..])
|
---|
608 | listCol.append(codeDetails)
|
---|
609 | if len(listCol) > 0:
|
---|
610 | res = listCol
|
---|
611 | # metadata.pheCols becomes [('bar;22,zot;113','foo'), ...]
|
---|
612 | else:
|
---|
613 | res = [('no usable phenotype columns found',[('?',0),]),]
|
---|
614 | return res
|
---|
615 |
|
---|
616 |
|
---|
617 |
|
---|
618 | def get_pheno(self,dataset):
|
---|
619 | """
|
---|
620 | expects a .pheno file in the extra_files_dir - ugh
|
---|
621 | note that R is wierd and adds the row.name in
|
---|
622 | the header so the columns are all wrong - unless you tell it not to.
|
---|
623 | A file can be written as
|
---|
624 | write.table(file='foo.pheno',pData(foo),sep='\t',quote=F,row.names=F)
|
---|
625 | """
|
---|
626 | p = file(dataset.metadata.pheno_path,'r').readlines()
|
---|
627 | if len(p) > 0: # should only need to fix an R pheno file once
|
---|
628 | head = p[0].strip().split('\t')
|
---|
629 | line1 = p[1].strip().split('\t')
|
---|
630 | if len(head) < len(line1):
|
---|
631 | head.insert(0,'ChipFileName') # fix R write.table b0rken-ness
|
---|
632 | p[0] = '\t'.join(head)
|
---|
633 | else:
|
---|
634 | p = []
|
---|
635 | return '\n'.join(p)
|
---|
636 |
|
---|
637 | def set_peek( self, dataset, **kwd ):
|
---|
638 | """
|
---|
639 | expects a .pheno file in the extra_files_dir - ugh
|
---|
640 | note that R is wierd and does not include the row.name in
|
---|
641 | the header. why?"""
|
---|
642 | if not dataset.dataset.purged:
|
---|
643 | pp = os.path.join(dataset.extra_files_path,'%s.pheno' % dataset.metadata.base_name)
|
---|
644 | try:
|
---|
645 | p = file(pp,'r').readlines()
|
---|
646 | except:
|
---|
647 | p = ['##failed to find %s' % pp,]
|
---|
648 | dataset.peek = ''.join(p[:5])
|
---|
649 | dataset.blurb = 'Galaxy Rexpression composite file'
|
---|
650 | else:
|
---|
651 | dataset.peek = 'file does not exist\n'
|
---|
652 | dataset.blurb = 'file purged from disk'
|
---|
653 |
|
---|
654 | def get_peek( self, dataset ):
|
---|
655 | """
|
---|
656 | expects a .pheno file in the extra_files_dir - ugh
|
---|
657 | """
|
---|
658 | pp = os.path.join(dataset.extra_files_path,'%s.pheno' % dataset.metadata.base_name)
|
---|
659 | try:
|
---|
660 | p = file(pp,'r').readlines()
|
---|
661 | except:
|
---|
662 | p = ['##failed to find %s' % pp]
|
---|
663 | return ''.join(p[:5])
|
---|
664 |
|
---|
665 | def get_file_peek(self,filename):
|
---|
666 | """
|
---|
667 | can't really peek at a filename - need the extra_files_path and such?
|
---|
668 | """
|
---|
669 | h = '## rexpression get_file_peek: no file found'
|
---|
670 | try:
|
---|
671 | h = file(filename,'r').readlines()
|
---|
672 | except:
|
---|
673 | pass
|
---|
674 | return ''.join(h[:5])
|
---|
675 |
|
---|
676 | def regenerate_primary_file(self,dataset):
|
---|
677 | """
|
---|
678 | cannot do this until we are setting metadata
|
---|
679 | """
|
---|
680 | bn = dataset.metadata.base_name
|
---|
681 | flist = os.listdir(dataset.extra_files_path)
|
---|
682 | rval = ['<html><head><title>Files for Composite Dataset %s</title></head><p/>Comprises the following files:<p/><ul>' % (bn)]
|
---|
683 | for i,fname in enumerate(flist):
|
---|
684 | sfname = os.path.split(fname)[-1]
|
---|
685 | rval.append( '<li><a href="%s">%s</a>' % ( sfname, sfname ) )
|
---|
686 | rval.append( '</ul></html>' )
|
---|
687 | f = file(dataset.file_name,'w')
|
---|
688 | f.write("\n".join( rval ))
|
---|
689 | f.write('\n')
|
---|
690 | f.close()
|
---|
691 |
|
---|
692 | def init_meta( self, dataset, copy_from=None ):
|
---|
693 | if copy_from:
|
---|
694 | dataset.metadata = copy_from.metadata
|
---|
695 |
|
---|
696 | def set_meta( self, dataset, **kwd ):
|
---|
697 |
|
---|
698 | """
|
---|
699 | NOTE we apply the tabular machinary to the phenodata extracted
|
---|
700 | from a BioC eSet or affybatch.
|
---|
701 |
|
---|
702 | """
|
---|
703 | Html.set_meta(self, dataset, **kwd)
|
---|
704 | try:
|
---|
705 | flist = os.listdir(dataset.extra_files_path)
|
---|
706 | except:
|
---|
707 | if verbose:
|
---|
708 | gal_Log.debug('@@@rexpression set_meta failed - no dataset?')
|
---|
709 | return False
|
---|
710 | bn = dataset.metadata.base_name
|
---|
711 | if not bn:
|
---|
712 | for f in flist:
|
---|
713 | n = os.path.splitext(f)[0]
|
---|
714 | bn = n
|
---|
715 | dataset.metadata.base_name = bn
|
---|
716 | if not bn:
|
---|
717 | bn = '?'
|
---|
718 | dataset.metadata.base_name = bn
|
---|
719 | pn = '%s.pheno' % (bn)
|
---|
720 | pp = os.path.join(dataset.extra_files_path,pn)
|
---|
721 | dataset.metadata.pheno_path=pp
|
---|
722 | try:
|
---|
723 | pf = file(pp,'r').readlines() # read the basename.phenodata in the extra_files_path
|
---|
724 | except:
|
---|
725 | pf = None
|
---|
726 | if pf:
|
---|
727 | h = pf[0].strip()
|
---|
728 | h = h.split('\t') # hope is header
|
---|
729 | h = [escape(x) for x in h]
|
---|
730 | dataset.metadata.column_names = h
|
---|
731 | dataset.metadata.columns = len(h)
|
---|
732 | dataset.peek = ''.join(pf[:5])
|
---|
733 | else:
|
---|
734 | dataset.metadata.column_names = []
|
---|
735 | dataset.metadata.columns = 0
|
---|
736 | dataset.peek = 'No pheno file found'
|
---|
737 | if pf and len(pf) > 1:
|
---|
738 | dataset.metadata.pheCols = self.get_phecols(phenolist=pf)
|
---|
739 | else:
|
---|
740 | dataset.metadata.pheCols = [('','No useable phenotypes found',False),]
|
---|
741 | #self.regenerate_primary_file(dataset)
|
---|
742 | if not dataset.info:
|
---|
743 | dataset.info = 'Galaxy Expression datatype object'
|
---|
744 | if not dataset.blurb:
|
---|
745 | dataset.blurb = 'R loadable BioC expression object for the Rexpression Galaxy toolkit'
|
---|
746 | return True
|
---|
747 |
|
---|
748 | def make_html_table( self, pp='nothing supplied from peek\n'):
|
---|
749 | """
|
---|
750 | Create HTML table, used for displaying peek
|
---|
751 | """
|
---|
752 | out = ['<table cellspacing="0" cellpadding="3">',]
|
---|
753 | p = pp.split('\n')
|
---|
754 | try:
|
---|
755 | # Generate column header
|
---|
756 | for i,row in enumerate(p):
|
---|
757 | lrow = row.strip().split('\t')
|
---|
758 | if i == 0:
|
---|
759 | orow = ['<th>%s</th>' % escape(x) for x in lrow]
|
---|
760 | orow.insert(0,'<tr>')
|
---|
761 | orow.append('</tr>')
|
---|
762 | else:
|
---|
763 | orow = ['<td>%s</td>' % escape(x) for x in lrow]
|
---|
764 | orow.insert(0,'<tr>')
|
---|
765 | orow.append('</tr>')
|
---|
766 | out.append(''.join(orow))
|
---|
767 | out.append( '</table>' )
|
---|
768 | out = "\n".join( out )
|
---|
769 | except Exception, exc:
|
---|
770 | out = "Can't create html table %s" % str( exc )
|
---|
771 | return out
|
---|
772 |
|
---|
773 | def display_peek( self, dataset ):
|
---|
774 | """
|
---|
775 | Returns formatted html of peek
|
---|
776 | """
|
---|
777 | out=self.make_html_table(dataset.peek)
|
---|
778 | return out
|
---|
779 |
|
---|
780 | def get_mime(self):
|
---|
781 | """
|
---|
782 | Returns the mime type of the datatype
|
---|
783 | """
|
---|
784 | return 'text/html'
|
---|
785 |
|
---|
786 |
|
---|
787 | class Affybatch( RexpBase ):
|
---|
788 | """
|
---|
789 | derived class for BioC data structures in Galaxy
|
---|
790 | """
|
---|
791 |
|
---|
792 | file_ext = "affybatch"
|
---|
793 |
|
---|
794 | def __init__( self, **kwd ):
|
---|
795 | RexpBase.__init__(self, **kwd)
|
---|
796 | self.add_composite_file( '%s.affybatch', description = 'AffyBatch R object saved to file',
|
---|
797 | substitute_name_with_metadata = 'base_name', is_binary=True )
|
---|
798 |
|
---|
799 | class Eset( RexpBase ):
|
---|
800 | """
|
---|
801 | derived class for BioC data structures in Galaxy
|
---|
802 | """
|
---|
803 | file_ext = "eset"
|
---|
804 |
|
---|
805 | def __init__( self, **kwd ):
|
---|
806 | RexpBase.__init__(self, **kwd)
|
---|
807 | self.add_composite_file( '%s.eset', description = 'ESet R object saved to file',
|
---|
808 | substitute_name_with_metadata = 'base_name', is_binary = True )
|
---|
809 |
|
---|
810 |
|
---|
811 | class MAlist( RexpBase ):
|
---|
812 | """
|
---|
813 | derived class for BioC data structures in Galaxy
|
---|
814 | """
|
---|
815 | file_ext = "malist"
|
---|
816 |
|
---|
817 | def __init__( self, **kwd ):
|
---|
818 | RexpBase.__init__(self, **kwd)
|
---|
819 | self.add_composite_file( '%s.malist', description = 'MAlist R object saved to file',
|
---|
820 | substitute_name_with_metadata = 'base_name', is_binary = True )
|
---|
821 |
|
---|
822 |
|
---|
823 | if __name__ == '__main__':
|
---|
824 | import doctest, sys
|
---|
825 | doctest.testmod(sys.modules[__name__])
|
---|
826 |
|
---|