root/galaxy-central/lib/galaxy/datatypes/genetics.py

リビジョン 2, 32.0 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1"""
2rgenetics datatypes
3Use at your peril
4Ross Lazarus
5for the rgenetics and galaxy projects
6
7genome graphs datatypes derived from Interval datatypes
8genome graphs datasets have a header row with appropriate columnames
9The first column is always the marker - eg columname = rs, first row= rs12345 if the rows are snps
10subsequent row values are all numeric ! Will fail if any non numeric (eg '+' or 'NA') values
11ross lazarus for rgenetics
12august 20 2007
13"""
14
15import logging, os, sys, time, tempfile, shutil, string, glob
16import data
17from galaxy import util
18from cgi import escape
19import urllib, binascii
20from galaxy.web import url_for
21from galaxy.datatypes import metadata
22from galaxy.datatypes.metadata import MetadataElement
23from galaxy.datatypes.data import Text
24from galaxy.datatypes.tabular import Tabular
25from galaxy.datatypes.images import Html
26from galaxy.datatypes.interval import Interval
27from galaxy.util.hash_util import *
28
# Module-level logger named after this module; used by the datatype classes below
gal_Log = logging.getLogger(__name__)
# When True, the set_meta methods below log extra debug detail on their failure paths
verbose = False
31
class GenomeGraphs( Tabular ):
    """
    Tab delimited data containing a marker id and any number of numeric values

    The first row is a header of column names. In every following row the
    first column is the marker id and all remaining columns are expected
    to be numeric.
    """

    MetadataElement( name="markerCol", default=1, desc="Marker ID column", param=metadata.ColumnParameter )
    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True )
    MetadataElement( name="column_types", default=[], desc="Column types", readonly=True, visible=False )
    file_ext = 'gg'

    def __init__(self, **kwd):
        """
        Initialize gg datatype, by adding UCSC display apps
        """
        Tabular.__init__(self, **kwd)
        self.add_display_app ( 'ucsc', 'Genome Graph', 'as_ucsc_display_file', 'ucsc_links' )

    def set_meta(self,dataset,**kwd):
        """
        Set markerCol, columns and column_types from the header row.

        The first column is typed 'string' (the marker id) and every other
        column 'numeric'. Only the first line of the file is read. Returns True.
        """
        Tabular.set_meta( self, dataset, **kwd)
        dataset.metadata.markerCol = 1
        # readline() gives '' for an empty file instead of raising IndexError
        with open(dataset.file_name,'r') as infile:
            header = infile.readline().strip().split('\t')
        dataset.metadata.columns = len(header)
        t = ['numeric' for x in header]
        t[0] = 'string'
        dataset.metadata.column_types = t
        return True

    def as_ucsc_display_file( self, dataset, **kwd ):
        """
        Returns the dataset file opened for reading; the caller owns the handle
        """
        return open(dataset.file_name,'r')

    def ucsc_links( self, dataset, type, app, base_url ):
        """
        Build a list of (site_name, link) tuples for UCSC genome graphs display.

        from the ever-helpful angie hinrichs angie@soe.ucsc.edu
        a genome graphs call looks like this
        http://genome.ucsc.edu/cgi-bin/hgGenome?clade=mammal&org=Human&db=hg18&hgGenome_dataSetName=dname
        &hgGenome_dataSetDescription=test&hgGenome_formatType=best%20guess&hgGenome_markerType=best%20guess
        &hgGenome_columnLabels=best%20guess&hgGenome_maxVal=&hgGenome_labelVals=
        &hgGenome_maxGapToFill=25000000&hgGenome_uploadFile=http://galaxy.esphealth.org/datasets/333/display/index
        &hgGenome_doSubmitUpload=submit
                Galaxy gives this for an interval file
        http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg18&position=chr1:1-1000&hgt.customText=
        http%3A%2F%2Fgalaxy.esphealth.org%2Fdisplay_as%3Fid%3D339%26display_app%3Ducsc
        """
        ret_val = []
        if not dataset.dbkey:
            dataset.dbkey = 'hg18' # punt!
        if dataset.has_data():
            for site_name, site_url in util.get_ucsc_by_build(dataset.dbkey):
                if site_name in app.config.ucsc_display_sites:
                    site_url = site_url.replace('/hgTracks?','/hgGenome?') # for genome graphs
                    internal_url = "%s" % url_for( controller='dataset',
                            dataset_id=dataset.id, action='display_at', filename='ucsc_' + site_name )
                    display_url = "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at" % (base_url, url_for( controller='root' ), dataset.id, type)
                    display_url = urllib.quote_plus( display_url )
                    sl = ["%sdb=%s" % (site_url,dataset.dbkey ),]
                    sl.append("&hgGenome_dataSetName=%s&hgGenome_dataSetDescription=%s" % (dataset.name, 'GalaxyGG_data'))
                    sl.append("&hgGenome_formatType=best guess&hgGenome_markerType=best guess")
                    sl.append("&hgGenome_columnLabels=first row&hgGenome_maxVal=&hgGenome_labelVals=")
                    sl.append("&hgGenome_doSubmitUpload=submit")
                    sl.append("&hgGenome_maxGapToFill=25000000&hgGenome_uploadFile=%s" % display_url)
                    s = ''.join(sl)
                    s = urllib.quote_plus(s)
                    redirect_url = s
                    # the module logger is gal_Log; the name 'log' is not defined in this module
                    gal_Log.debug('## rg gg ucsc rdurl=%s; s = %s' % (redirect_url,s))
                    link = '%s?redirect_url=%s&display_url=%s' % ( internal_url, redirect_url, display_url )
                    ret_val.append( (site_name, link) )
        return ret_val

    def make_html_table( self, dataset, skipchars=[] ):
        """
        Create HTML table, used for displaying peek

        Reads at most the first npeek lines of the dataset. The first line
        is treated as a header when any field after the marker column is not
        numeric. Cell text is HTML-escaped. skipchars is accepted for
        signature compatibility and is not used.
        Returns the table as a string, or a short message if nothing can be shown.
        """
        npeek = 5
        out = ['<table cellspacing="0" cellpadding="3">']
        d = []
        with open(dataset.file_name,'r') as f:
            for line in f:
                d.append(line)
                if len(d) >= npeek:
                    break
        if len(d) == 0:
            out = "Cannot find anything to parse in %s" % dataset.name
            return out
        hasheader = 0
        try:
            # first field is the marker name - see whether the rest are all numeric
            test = [float(x) for x in d[0].split()[1:]]
        except ValueError:
            hasheader = 1
        try:
            if hasheader:
                # Generate column header
                out.append( '<tr>' )
                for i, name in enumerate( d[0].split() ):
                    out.append( '<th>%s.%s</th>' % ( str( i+1 ), escape( name ) ) )
                d.pop(0)
                out.append('</tr>')
            for row in d:
                out.append('<tr>')
                out.append(''.join(['<td>%s</td>' % escape( x ) for x in row.split()]))
                out.append('</tr>')
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % exc
        return out

    def validate( self, dataset ):
        """
        Validate a gg file - all numeric after header row

        Returns a list with one 'row R, colC:value ...' string for every
        data row holding non numeric values. R counts data rows from 1
        (header excluded) and C counts the columns after the marker column
        from 1. The list is empty when the file is clean.
        """
        errors = list()
        with open(dataset.file_name, "r") as infile:
            infile.readline() # skip header; gives '' on an empty file instead of raising
            for i,row in enumerate(infile):
                ll = row.strip().split('\t')[1:] # first is alpha feature identifier
                badvals = []
                for j,x in enumerate(ll):
                    try:
                        float(x)
                    except ValueError:
                        badvals.append('col%d:%s' % (j+1,x))
                if len(badvals) > 0:
                    errors.append('row %d, %s' % (i+1, ' '.join(badvals)))
        return errors

    def sniff( self, filename ):
        """
        Determines whether the file is in gg format

        Looks at the header line and a sample of up to three following
        lines. Returns True only when a header exists, at least one sampled
        row has values after the marker column, and all those values parse
        as floats.
        """
        with open(filename,'r') as f:
            headers = f.readline().split()
            rows = [f.readline().split()[1:] for x in range(3)] # small sample, first col removed
        if not headers:
            return False
        rows = [row for row in rows if row] # drop rows missing from short files
        if not rows:
            return False
        for row in rows:
            try:
                nums = [float(x) for x in row]
            except ValueError:
                return False
        return True

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'application/vnd.ms-excel'
179
180
class rgTabList(Tabular):
    """
    for sampleid and for featureid lists of exclusions or inclusions in the clean tool
    featureid subsets on statistical criteria -> specialized display such as gg
    """
    file_ext = "rgTList"

    def __init__(self, **kwd):
        """
        Initialize featurelist datatype with an empty list of column names
        """
        Tabular.__init__( self, **kwd )
        self.column_names = []

    def make_html_table( self, dataset, skipchars=[] ):
        """
        Create HTML table, used for displaying peek

        The header row shows self.column_names numbered from 1, followed by
        bare numbers for any columns in dataset.metadata.columns beyond the
        named ones. Data rows come from make_html_peek_rows.
        Returns the table as a string, or an error message on failure.
        """
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # Generate column header
            out.append( '<tr>' )
            for i, name in enumerate( self.column_names ):
                out.append( '<th>%s.%s</th>' % ( str( i+1 ), name ) )
            # range is empty when every column already has a name
            for i in range( len( self.column_names ), dataset.metadata.columns ):
                out.append( '<th>%s</th>' % str( i+1 ) )
            # always close the header row, not only when unnamed columns exist
            out.append( '</tr>' )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % exc
        return out

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/html'
221
222
class rgSampleList(rgTabList):
    """
    for sampleid exclusions or inclusions in the clean tool
    output from QC eg excess het, gender error, ibd pair member,eigen outlier,excess mendel errors,...
    since they can be uploaded, should be flexible
    but they are persistent at least
    same infrastructure for expression?
    """
    file_ext = "rgSList"

    def __init__(self, **kwd):
        """
        Initialize samplelist datatype
        """
        rgTabList.__init__( self, **kwd )
        # column_names starts out empty, so index assignment would raise IndexError;
        # slice assignment sets the first two names whatever the current length
        self.column_names[0:2] = ['FID', 'IID']
        # this is what Plink wants as at 2009

    def sniff(self,filename):
        """
        Returns True when the first two whitespace separated fields of the
        first line of filename are FID and IID
        """
        with open(filename, "r") as infile:
            header = infile.readline().split()
        return len(header) >= 2 and header[0] == 'FID' and header[1] == 'IID'
249       
class rgFeatureList( rgTabList ):
    """
    for featureid lists of exclusions or inclusions in the clean tool
    output from QC eg low maf, high missingness, bad hwe in controls, excess mendel errors,...
    featureid subsets on statistical criteria -> specialized display such as gg
    same infrastructure for expression?
    """
    file_ext = "rgFList"

    def __init__(self, **kwd):
        """Initialize featurelist datatype"""
        rgTabList.__init__( self, **kwd )
        # column_names starts out empty, so index assignment would raise IndexError;
        # slice assignment sets the first four names whatever the current length
        self.column_names[0:4] = ['#FeatureId', 'Chr', 'Genpos', 'Mappos']
264 
265
class Rgenetics(Html):
    """
    base class to use for rgenetics datatypes
    derived from html - composite datatype elements
    stored in extra files path
    """

    MetadataElement( name="base_name", desc="base name for all transformed versions of this genetic dataset", default='RgeneticsData',
    readonly=True, set_in_upload=True)

    composite_type = 'auto_primary_file'
    allow_datatype_change = False
    file_ext = 'rgenetics'

    def generate_primary_file( self, dataset = None ):
        """
        Return the html text of a primary file listing every composite file
        reported by get_composite_files, marking optional ones and showing
        descriptions where present
        """
        rval = ['<html><head><title>Rgenetics Galaxy Composite Dataset </title></head><p/>']
        rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
        # items() works on both python 2 and 3 dictionaries
        for composite_name, composite_file in self.get_composite_files( dataset = dataset ).items():
            fn = composite_name
            opt_text = ''
            if composite_file.optional:
                opt_text = ' (optional)'
            if composite_file.get('description'):
                rval.append( '<li><a href="%s" type="application/binary">%s (%s)</a>%s</li>' % ( fn, fn, composite_file.get('description'), opt_text ) )
            else:
                rval.append( '<li><a href="%s" type="application/binary">%s</a>%s</li>' % ( fn, fn, opt_text ) )
        rval.append( '</ul></div></html>' )
        return "\n".join( rval )

    def regenerate_primary_file(self,dataset):
        """
        cannot do this until we are setting metadata

        Rewrites dataset.file_name as an html page linking to every file
        found in dataset.extra_files_path. Entries are sorted so the page
        does not depend on directory listing order.
        """
        flist = sorted( os.listdir(dataset.extra_files_path) )
        rval = ['<html><head><title>Files for Composite Dataset %s</title></head><body><p/>Composite %s contains:<p/><ul>' % (dataset.name,dataset.name)]
        for fname in flist:
            sfname = os.path.split(fname)[-1]
            rval.append( '<li><a href="%s">%s</a></li>' % ( sfname, sfname) )
        rval.append( '</ul></body></html>' )
        # the with block closes the file even if a write fails
        with open(dataset.file_name,'w') as f:
            f.write("\n".join( rval ))
            f.write('\n')

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/html'

    def set_meta( self, dataset, **kwd ):
        """
        for lped/pbed eg

        Regenerates the primary file from the contents of the extra files
        path and fills in default info and blurb text when they are empty.
        Returns True on success, or when called with overwrite=False.
        Returns False when the extra files path is unavailable, unreadable
        or empty.
        """
        Html.set_meta( self, dataset, **kwd )
        if kwd.get('overwrite') == False:
            if verbose:
                gal_Log.debug('@@@ rgenetics set_meta called with overwrite = False')
            return True
        try:
            efp = dataset.extra_files_path
        except Exception:
            # deliberate best effort: any failure to obtain the path means no metadata
            if verbose:
                gal_Log.debug('@@@rgenetics set_meta failed %s - dataset %s has no efp ?' % (sys.exc_info()[0], dataset.name))
            return False
        try:
            flist = os.listdir(efp)
        except (OSError, TypeError):
            # missing or unreadable directory, or efp is not a usable path
            if verbose:
                gal_Log.debug('@@@rgenetics set_meta failed %s - dataset %s has no efp ?' % (sys.exc_info()[0],dataset.name))
            return False
        if len(flist) == 0:
            if verbose:
                gal_Log.debug('@@@rgenetics set_meta failed - %s efp %s is empty?' % (dataset.name,efp))
            return False
        self.regenerate_primary_file(dataset)
        if not dataset.info:
            dataset.info = 'Galaxy genotype datatype object'
        if not dataset.blurb:
            dataset.blurb = 'Composite file - Rgenetics Galaxy toolkit'
        return True
350
351
352
class SNPMatrix(Rgenetics):
    """
    BioC SNPMatrix Rgenetics data collections
    """
    file_ext="snpmatrix"

    def set_peek( self, dataset, **kwd ):
        """
        Set a fixed peek and a size blurb, or purge notices when the
        underlying dataset has been purged
        """
        if not dataset.dataset.purged:
            dataset.peek  = "Binary RGenetics file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def sniff(self,filename):
        """
        need to check the file header hex code

        No header signature is defined in this module, so content sniffing
        never claims a file: this always returns False.
        NOTE(review): the previous body could only raise (undefined name
        'dataset', invalid open mode "b"); its comparison of a list with ''
        could never have selected the True branch.
        TODO: compare the leading bytes of filename with the real signature
        once it is known.
        """
        return False
377
378
class Lped(Rgenetics):
    """
    Rgenetics composite datatype for linkage pedigree data: a ped file and a map file
    """
    file_ext = "lped"

    def __init__( self, **kwd ):
        """Register the .ped and .map composite files, both named from base_name metadata"""
        Rgenetics.__init__(self, **kwd)
        members = ( ('%s.ped', 'Pedigree File'), ('%s.map', 'Map File') )
        for pattern, label in members:
            self.add_composite_file( pattern, description = label, substitute_name_with_metadata = 'base_name', is_binary = False )
389
390
class Pphe(Rgenetics):
    """
    Rgenetics composite datatype for a Plink phenotype file, whose header must have FID\tIID...
    """
    file_ext = "pphe"

    def __init__( self, **kwd ):
        """Register the single .pphe composite file, named from base_name metadata"""
        Rgenetics.__init__(self, **kwd)
        options = dict( description = 'Plink Phenotype File', substitute_name_with_metadata = 'base_name', is_binary = False )
        self.add_composite_file( '%s.pphe', **options )
400
401
402
403
class Fphe(Rgenetics):
    """
    Rgenetics composite datatype for an fbat pedigree file
    (mad format with ! as first char on header row)
    """
    file_ext = "fphe"

    def __init__( self, **kwd ):
        """Register the single .fphe composite file, named from base_name metadata"""
        Rgenetics.__init__(self, **kwd)
        options = dict( description = 'FBAT Phenotype File', substitute_name_with_metadata = 'base_name' )
        self.add_composite_file( '%s.fphe', **options )
414
class Phe(Rgenetics):
    """
    Rgenetics composite datatype for a phenotype file
    """
    file_ext = "phe"

    def __init__( self, **kwd ):
        """Register the single .phe composite file, named from base_name metadata"""
        Rgenetics.__init__(self, **kwd)
        options = dict( description = 'Phenotype File', substitute_name_with_metadata = 'base_name', is_binary = False )
        self.add_composite_file( '%s.phe', **options )
425
426
427
class Fped(Rgenetics):
    """
    Rgenetics composite datatype for FBAT pedigree format:
    a single file whose header row of rs numbers serves as the map. Strange.
    """
    file_ext = "fped"

    def __init__( self, **kwd ):
        """Register the single .fped composite file, named from base_name metadata"""
        Rgenetics.__init__(self, **kwd)
        options = dict( description = 'FBAT format pedfile', substitute_name_with_metadata = 'base_name', is_binary = False )
        self.add_composite_file( '%s.fped', **options )
439
440
class Pbed(Rgenetics):
    """
    Rgenetics composite datatype for Plink Binary compressed 2bit/geno data
    """
    file_ext = "pbed"

    def __init__( self, **kwd ):
        """Register the .bim, .bed and .fam composite files; only .bed is flagged binary"""
        Rgenetics.__init__(self, **kwd)
        members = ( ('%s.bim', False), ('%s.bed', True), ('%s.fam', False) )
        for pattern, binary in members:
            self.add_composite_file( pattern, substitute_name_with_metadata = 'base_name', is_binary = binary )
452
class ldIndep(Rgenetics):
    """
    Plink Binary compressed 2bit/geno depleted of LD (a good measure of redundancy of information).
    Really a plink binary, but some tools work better with less redundancy
    so are constrained to these files
    """
    file_ext = "ldreduced"

    def __init__( self, **kwd ):
        """Register the .bim, .bed and .fam composite files; only .bed is flagged binary"""
        Rgenetics.__init__(self, **kwd)
        members = ( ('%s.bim', False), ('%s.bed', True), ('%s.fam', False) )
        for pattern, binary in members:
            self.add_composite_file( pattern, substitute_name_with_metadata = 'base_name', is_binary = binary )
466
467
class Eigenstratgeno(Rgenetics):
    """
    Rgenetics composite datatype for Eigenstrat format -
    may be able to get rid of this if we move to shellfish
    """
    file_ext = "eigenstratgeno"

    def __init__( self, **kwd ):
        """Register the .eigenstratgeno, .ind and .map composite files, none flagged binary"""
        Rgenetics.__init__(self, **kwd)
        for pattern in ( '%s.eigenstratgeno', '%s.ind', '%s.map' ):
            self.add_composite_file( pattern, substitute_name_with_metadata = 'base_name', is_binary = False )
481       
482
483
class Eigenstratpca(Rgenetics):
    """
    Rgenetics composite datatype for an Eigenstrat PCA file for case control adjustment
    """
    file_ext = "eigenstratpca"

    def __init__( self, **kwd ):
        """Register the single .eigenstratpca composite file, named from base_name metadata"""
        Rgenetics.__init__(self, **kwd)
        options = dict( description = 'Eigenstrat PCA file', substitute_name_with_metadata = 'base_name' )
        self.add_composite_file( '%s.eigenstratpca', **options )
494
495
class Snptest(Rgenetics):
    """
    Rgenetics composite datatype for BioC snptest data collections.
    Adds nothing to Rgenetics beyond its own file extension.
    """
    file_ext = 'snptest'
501
502
class Pheno(Tabular):
    """
    Tabular base class for pheno files.
    Adds nothing to Tabular beyond its own file extension.
    """
    file_ext = "pheno"
508
509
class RexpBase( Html ):
    """
    base class for BioC data structures in Galaxy
    must be constructed with the pheno data in place since that
    goes into the metadata for each instance
    """
    MetadataElement( name="columns", default=0, desc="Number of columns",  visible=True )
    MetadataElement( name="column_names", default=[], desc="Column names", visible=True )
    MetadataElement(name="pheCols",default=[],desc="Select list for potentially interesting variables",visible=True)
    MetadataElement( name="base_name",
    desc="base name for all transformed versions of this expression dataset", default='rexpression', set_in_upload=True)
    MetadataElement( name="pheno_path", desc="Path to phenotype data for this experiment", default="rexpression.pheno", visible=True)
    file_ext = 'rexpbase'
    html_table = None
    is_binary = True
    composite_type = 'auto_primary_file'
    allow_datatype_change = False

    def __init__( self, **kwd ):
        """Register the .pheno composite file, named from base_name metadata"""
        Html.__init__(self,**kwd)
        self.add_composite_file( '%s.pheno', description = 'Phenodata tab text file',
          substitute_name_with_metadata = 'base_name', is_binary=False)

    def generate_primary_file( self, dataset = None ):
        """
        This is called only at upload to write the html file
        cannot rename the datasets here - they come with the default unfortunately
        """
        return '<html><head></head><body>AutoGenerated Primary File for Composite Dataset</body></html>'

    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/html'

    def _first_lines( self, path, count=5 ):
        """
        Return a list of up to count leading lines (line endings kept) of
        the text file at path. Errors from open() propagate to the caller.
        """
        lines = []
        with open(path,'r') as handle:
            for line in handle:
                if len(lines) >= count:
                    break
                lines.append(line)
        return lines

    def get_phecols(self, phenolist=[], maxConc=20):
        """
        sept 2009: cannot use whitespace to split - make a more complex structure here
        and adjust the methods that rely on this structure
        return interesting phenotype column names for an rexpression eset or affybatch
        to use in array subsetting and so on. Returns a data structure for a
        dynamic Galaxy select parameter.
        A column with only 1 value doesn't change, so is not interesting for
        analysis. A column with a different value in every row is equivalent to a unique
        identifier so is also not interesting for anova or limma analysis - both these
        are removed after the concordance (count of unique terms) is constructed for each
        column. Then a complication - each remaining pair of columns is tested for
        redundancy - if two columns are always paired, then only one is needed :)

        phenolist is a list of tab delimited lines, header first. maxConc
        is the exclusive upper bound on distinct values for a usable column.
        Returns a list of (column name, [(code, count), ...]) tuples. A
        single placeholder entry is returned when no column qualifies or
        when there is no header to work from.
        """
        nothing = [('no usable phenotype columns found',[('?',0),]),]
        head = None
        totcols = 0
        concordance = []
        for nrows,row in enumerate(phenolist): # construct concordance
            if len(row.strip()) == 0:
                break
            row = row.strip().split('\t')
            if nrows == 0: # set up from header
                head = row
                totcols = len(row)
                concordance = [{} for x in head] # list of dicts
            else:
                for col,code in enumerate(row): # keep column order correct
                    if col >= totcols:
                        gal_Log.warning('### get_phecols error in pheno file - row %d col %d (%s) longer than header %s' % (nrows, col, row, head))
                    else:
                        concordance[col].setdefault(code,0) # first one is zero
                        concordance[col][code] += 1
        if head is None:
            # empty input or blank first line: nothing to analyse
            return nothing
        useCols = []
        useConc = [] # columns of interest to keep
        nrows = len(phenolist)
        nrows -= 1 # drop head from count
        for c,conc in enumerate(concordance): # c is column number
            if (len(conc) > 1) and (len(conc) < min(nrows,maxConc)): # not all same and not all different!!
                useConc.append(conc) # keep concordance
                useCols.append(c) # keep column
        nuse = len(useCols)
        # now to check for pairs of concordant columns - drop one of these.
        delme = []
        p = phenolist[1:] # drop header
        plist = [x.strip().split('\t') for x in p] # list of lists
        phe = [[x[i] for i in useCols] for x in plist if len(x) >= totcols] # strip unused data
        for i in range(0,(nuse-1)): # for each interesting column
            for j in range(i+1,nuse):
                # tuple keys keep ('a','bc') distinct from ('ab','c'); joined strings collide
                pairs = set([(row[i],row[j]) for row in phe])
                if (len(pairs) == len(concordance[useCols[j]])): # i and j are always matched
                    delme.append(j)
        delme = list(set(delme)) # remove dupes
        listCol = []
        delme.sort()
        delme.reverse() # must delete from far end!
        for i in delme:
            del useConc[i] # get rid of concordance
            del useCols[i] # and usecols entry
        for i,conc in enumerate(useConc): # these are all unique columns for the design matrix
            ccounts = [(conc.get(code,0),code) for code in conc.keys()] # decorate
            ccounts.sort()
            cc = [(x[1],x[0]) for x in ccounts] # list of code count tuples
            codeDetails = (head[useCols[i]],cc) # ('foo',[('a',3),('b',11),..])
            listCol.append(codeDetails)
        if len(listCol) > 0:
            res = listCol
            # metadata.pheCols becomes [('bar;22,zot;113','foo'), ...]
        else:
            res = nothing
        return res

    def get_pheno(self,dataset):
        """
        expects a .pheno file in the extra_files_dir - ugh
        note that R is weird and adds the row.name in
        the header so the columns are all wrong - unless you tell it not to.
        A file can be written as
        write.table(file='foo.pheno',pData(foo),sep='\t',quote=F,row.names=F)

        Returns the text of dataset.metadata.pheno_path with lines joined
        by single newlines. 'ChipFileName' is prepended to the header when
        the header has fewer fields than the first data row.
        """
        with open(dataset.metadata.pheno_path,'r') as f:
            # drop line endings here so the join below does not double every newline
            p = [line.rstrip('\r\n') for line in f]
        if len(p) > 1: # need a data row to compare with; should only need to fix an R pheno file once
            head = p[0].strip().split('\t')
            line1 = p[1].strip().split('\t')
            if len(head) < len(line1):
                head.insert(0,'ChipFileName') # fix R write.table b0rken-ness
                p[0] = '\t'.join(head)
        return '\n'.join(p)

    def set_peek( self, dataset, **kwd ):
        """
        expects a .pheno file in the extra_files_dir - ugh
        note that R is weird and does not include the row.name in
        the header. why?

        Sets dataset.peek to the first five lines of the pheno file, or to
        a failure note, and sets a fixed blurb. Purged datasets get purge
        notices instead.
        """
        if not dataset.dataset.purged:
            dataset.peek = self.get_peek( dataset )
            dataset.blurb = 'Galaxy Rexpression composite file'
        else:
            dataset.peek = 'file does not exist\n'
            dataset.blurb = 'file purged from disk'

    def get_peek( self, dataset ):
        """
        expects a .pheno file in the extra_files_dir - ugh

        Returns the first five lines of <base_name>.pheno from the extra
        files path, or a '##failed to find' note when it cannot be read.
        """
        pp = os.path.join(dataset.extra_files_path,'%s.pheno' % dataset.metadata.base_name)
        try:
            p = self._first_lines( pp )
        except (IOError, OSError):
            p = ['##failed to find %s' % pp]
        return ''.join(p)

    def get_file_peek(self,filename):
        """
        can't really peek at a filename - need the extra_files_path and such?

        Returns the first five lines of filename, or a fixed note when the
        file cannot be read.
        """
        try:
            h = self._first_lines( filename )
        except (IOError, OSError):
            # kept as a list so the whole note survives the join (a bare string was cut to 5 chars)
            h = ['## rexpression get_file_peek: no file found']
        return ''.join(h)

    def regenerate_primary_file(self,dataset):
        """
        cannot do this until we are setting metadata

        Rewrites dataset.file_name as an html page linking to every file
        found in dataset.extra_files_path. Entries are sorted so the page
        does not depend on directory listing order.
        """
        bn = dataset.metadata.base_name
        flist = sorted( os.listdir(dataset.extra_files_path) )
        rval = ['<html><head><title>Files for Composite Dataset %s</title></head><p/>Comprises the following files:<p/><ul>' % (bn)]
        for fname in flist:
            sfname = os.path.split(fname)[-1]
            rval.append( '<li><a href="%s">%s</a>' % ( sfname, sfname ) )
        rval.append( '</ul></html>' )
        # the with block closes the file even if a write fails
        with open(dataset.file_name,'w') as f:
            f.write("\n".join( rval ))
            f.write('\n')

    def init_meta( self, dataset, copy_from=None ):
        """Share copy_from's metadata object with dataset when copy_from is given"""
        if copy_from:
            dataset.metadata = copy_from.metadata

    def set_meta( self, dataset, **kwd ):
        """
        NOTE we apply the tabular machinery to the phenodata extracted
        from a BioC eSet or affybatch.

        Sets base_name (from the extra files when unset), pheno_path,
        column_names, columns, peek and pheCols from <base_name>.pheno,
        and default info and blurb text when they are empty.
        Returns False when the extra files path cannot be listed, else True.
        """
        Html.set_meta(self, dataset, **kwd)
        try:
            flist = os.listdir(dataset.extra_files_path)
        except (OSError, AttributeError, TypeError):
            # no directory, no such attribute, or an unusable path value
            if verbose:
                gal_Log.debug('@@@rexpression set_meta failed - no dataset?')
            return False
        bn = dataset.metadata.base_name
        if not bn and flist:
            # same outcome as assigning in a loop over flist: the last entry wins
            bn = os.path.splitext(flist[-1])[0]
            dataset.metadata.base_name = bn
        if not bn:
            bn = '?'
            dataset.metadata.base_name = bn
        pn = '%s.pheno' % (bn)
        pp = os.path.join(dataset.extra_files_path,pn)
        dataset.metadata.pheno_path=pp
        try:
            with open(pp,'r') as phandle:
                pf = phandle.readlines() # read the basename.phenodata in the extra_files_path
        except (IOError, OSError):
            pf = None
        if pf:
            h = pf[0].strip()
            h = h.split('\t') # hope is header
            h = [escape(x) for x in h]
            dataset.metadata.column_names = h
            dataset.metadata.columns = len(h)
            dataset.peek = ''.join(pf[:5])
        else:
            dataset.metadata.column_names = []
            dataset.metadata.columns = 0
            dataset.peek = 'No pheno file found'
        if pf and len(pf) > 1:
            dataset.metadata.pheCols = self.get_phecols(phenolist=pf)
        else:
            dataset.metadata.pheCols = [('','No useable phenotypes found',False),]
        #self.regenerate_primary_file(dataset)
        if not dataset.info:
            dataset.info = 'Galaxy Expression datatype object'
        if not dataset.blurb:
            dataset.blurb = 'R loadable BioC expression object for the Rexpression Galaxy toolkit'
        return True

    def make_html_table( self, pp='nothing supplied from peek\n'):
        """
        Create HTML table, used for displaying peek

        pp is tab delimited text. The first non blank line becomes the
        header row and later lines become data rows, with every cell
        HTML-escaped. Blank lines, such as the one left by a trailing
        newline, are skipped.
        Returns the table as a string, or an error message on failure.
        """
        out = ['<table cellspacing="0" cellpadding="3">',]
        p = [row for row in pp.split('\n') if row.strip()]
        try:
            for i,row in enumerate(p):
                lrow = row.strip().split('\t')
                if i == 0:
                    cell = '<th>%s</th>' # header row
                else:
                    cell = '<td>%s</td>'
                orow = [cell % escape(x) for x in lrow]
                out.append('<tr>%s</tr>' % ''.join(orow))
            out.append( '</table>' )
            out = "\n".join( out )
        except Exception as exc:
            out = "Can't create html table %s" % str( exc )
        return out

    def display_peek( self, dataset ):
        """
        Returns formatted html of peek
        """
        out=self.make_html_table(dataset.peek)
        return out
785   
786
class Affybatch( RexpBase ):
    """
    RexpBase subclass holding an AffyBatch R object
    """

    file_ext = 'affybatch'

    def __init__( self, **kwd ):
        """Add the binary .affybatch composite file to those RexpBase registers"""
        RexpBase.__init__(self, **kwd)
        options = dict( description = 'AffyBatch R object saved to file',
                        substitute_name_with_metadata = 'base_name', is_binary = True )
        self.add_composite_file( '%s.affybatch', **options )
798   
class Eset( RexpBase ):
    """
    RexpBase subclass holding an ESet R object
    """
    file_ext = 'eset'

    def __init__( self, **kwd ):
        """Add the binary .eset composite file to those RexpBase registers"""
        RexpBase.__init__(self, **kwd)
        options = dict( description = 'ESet R object saved to file',
                        substitute_name_with_metadata = 'base_name', is_binary = True )
        self.add_composite_file( '%s.eset', **options )
809
810
class MAlist( RexpBase ):
    """
    RexpBase subclass holding an MAlist R object
    """
    file_ext = 'malist'

    def __init__( self, **kwd ):
        """Add the binary .malist composite file to those RexpBase registers"""
        RexpBase.__init__(self, **kwd)
        options = dict( description = 'MAlist R object saved to file',
                        substitute_name_with_metadata = 'base_name', is_binary = True )
        self.add_composite_file( '%s.malist', **options )
821
822
# When this module is executed directly as a script, run its doctests
if __name__ == '__main__':
    import doctest, sys
    doctest.testmod(sys.modules[__name__])
826
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。