[2] | 1 | """
|
---|
| 2 | rgenetics datatypes
|
---|
| 3 | Use at your peril
|
---|
| 4 | Ross Lazarus
|
---|
| 5 | for the rgenetics and galaxy projects
|
---|
| 6 |
|
---|
| 7 | genome graphs datatypes derived from Interval datatypes
|
---|
| 8 | genome graphs datasets have a header row with appropriate columnames
|
---|
| 9 | The first column is always the marker - eg columname = rs, first row= rs12345 if the rows are snps
|
---|
| 10 | subsequent row values are all numeric ! Will fail if any non numeric (eg '+' or 'NA') values
|
---|
| 11 | ross lazarus for rgenetics
|
---|
| 12 | august 20 2007
|
---|
| 13 | """
|
---|
| 14 |
|
---|
| 15 | import logging, os, sys, time, tempfile, shutil, string, glob
|
---|
| 16 | import data
|
---|
| 17 | from galaxy import util
|
---|
| 18 | from cgi import escape
|
---|
| 19 | import urllib, binascii
|
---|
| 20 | from galaxy.web import url_for
|
---|
| 21 | from galaxy.datatypes import metadata
|
---|
| 22 | from galaxy.datatypes.metadata import MetadataElement
|
---|
| 23 | from galaxy.datatypes.data import Text
|
---|
| 24 | from galaxy.datatypes.tabular import Tabular
|
---|
| 25 | from galaxy.datatypes.images import Html
|
---|
| 26 | from galaxy.datatypes.interval import Interval
|
---|
| 27 | from galaxy.util.hash_util import *
|
---|
| 28 |
|
---|
| 29 | gal_Log = logging.getLogger(__name__)
|
---|
| 30 | verbose = False
|
---|
| 31 |
|
---|
| 32 | class GenomeGraphs( Tabular ):
|
---|
| 33 | """
|
---|
| 34 | Tab delimited data containing a marker id and any number of numeric values
|
---|
| 35 | """
|
---|
| 36 |
|
---|
| 37 | MetadataElement( name="markerCol", default=1, desc="Marker ID column", param=metadata.ColumnParameter )
|
---|
| 38 | MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True )
|
---|
| 39 | MetadataElement( name="column_types", default=[], desc="Column types", readonly=True, visible=False )
|
---|
| 40 | file_ext = 'gg'
|
---|
| 41 |
|
---|
| 42 | def __init__(self, **kwd):
|
---|
| 43 | """
|
---|
| 44 | Initialize gg datatype, by adding UCSC display apps
|
---|
| 45 | """
|
---|
| 46 | Tabular.__init__(self, **kwd)
|
---|
| 47 | self.add_display_app ( 'ucsc', 'Genome Graph', 'as_ucsc_display_file', 'ucsc_links' )
|
---|
| 48 |
|
---|
| 49 |
|
---|
| 50 | def set_meta(self,dataset,**kwd):
|
---|
| 51 | Tabular.set_meta( self, dataset, **kwd)
|
---|
| 52 | dataset.metadata.markerCol = 1
|
---|
| 53 | header = file(dataset.file_name,'r').readlines()[0].strip().split('\t')
|
---|
| 54 | dataset.metadata.columns = len(header)
|
---|
| 55 | t = ['numeric' for x in header]
|
---|
| 56 | t[0] = 'string'
|
---|
| 57 | dataset.metadata.column_types = t
|
---|
| 58 | return True
|
---|
| 59 |
|
---|
| 60 | def as_ucsc_display_file( self, dataset, **kwd ):
|
---|
| 61 | """
|
---|
| 62 | Returns file
|
---|
| 63 | """
|
---|
| 64 | return file(dataset.file_name,'r')
|
---|
| 65 |
|
---|
| 66 | def ucsc_links( self, dataset, type, app, base_url ):
|
---|
| 67 | """
|
---|
| 68 | from the ever-helpful angie hinrichs angie@soe.ucsc.edu
|
---|
| 69 | a genome graphs call looks like this
|
---|
| 70 | http://genome.ucsc.edu/cgi-bin/hgGenome?clade=mammal&org=Human&db=hg18&hgGenome_dataSetName=dname
|
---|
| 71 | &hgGenome_dataSetDescription=test&hgGenome_formatType=best%20guess&hgGenome_markerType=best%20guess
|
---|
| 72 | &hgGenome_columnLabels=best%20guess&hgGenome_maxVal=&hgGenome_labelVals=
|
---|
| 73 | &hgGenome_maxGapToFill=25000000&hgGenome_uploadFile=http://galaxy.esphealth.org/datasets/333/display/index
|
---|
| 74 | &hgGenome_doSubmitUpload=submit
|
---|
| 75 | Galaxy gives this for an interval file
|
---|
| 76 | http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg18&position=chr1:1-1000&hgt.customText=
|
---|
| 77 | http%3A%2F%2Fgalaxy.esphealth.org%2Fdisplay_as%3Fid%3D339%26display_app%3Ducsc
|
---|
| 78 | """
|
---|
| 79 | ret_val = []
|
---|
| 80 | ggtail = 'hgGenome_doSubmitUpload=submit'
|
---|
| 81 | if not dataset.dbkey:
|
---|
| 82 | dataset.dbkey = 'hg18' # punt!
|
---|
| 83 | if dataset.has_data():
|
---|
| 84 | for site_name, site_url in util.get_ucsc_by_build(dataset.dbkey):
|
---|
| 85 | if site_name in app.config.ucsc_display_sites:
|
---|
| 86 | site_url = site_url.replace('/hgTracks?','/hgGenome?') # for genome graphs
|
---|
| 87 | internal_url = "%s" % url_for( controller='dataset',
|
---|
| 88 | dataset_id=dataset.id, action='display_at', filename='ucsc_' + site_name )
|
---|
| 89 | display_url = "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at" % (base_url, url_for( controller='root' ), dataset.id, type)
|
---|
| 90 | display_url = urllib.quote_plus( display_url )
|
---|
| 91 | # was display_url = urllib.quote_plus( "%s/display_as?id=%i&display_app=%s" % (base_url, dataset.id, type) )
|
---|
| 92 | #redirect_url = urllib.quote_plus( "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s" % (site_url, dataset.dbkey, chrom, start, stop) )
|
---|
| 93 | sl = ["%sdb=%s" % (site_url,dataset.dbkey ),]
|
---|
| 94 | #sl.append("&hgt.customText=%s")
|
---|
| 95 | sl.append("&hgGenome_dataSetName=%s&hgGenome_dataSetDescription=%s" % (dataset.name, 'GalaxyGG_data'))
|
---|
| 96 | sl.append("&hgGenome_formatType=best guess&hgGenome_markerType=best guess")
|
---|
| 97 | sl.append("&hgGenome_columnLabels=first row&hgGenome_maxVal=&hgGenome_labelVals=")
|
---|
| 98 | sl.append("&hgGenome_doSubmitUpload=submit")
|
---|
| 99 | sl.append("&hgGenome_maxGapToFill=25000000&hgGenome_uploadFile=%s" % display_url)
|
---|
| 100 | s = ''.join(sl)
|
---|
| 101 | s = urllib.quote_plus(s)
|
---|
| 102 | redirect_url = s
|
---|
| 103 | log.debug('## rg gg ucsc rdurl=%s; s = %s' % (redirect_url,s))
|
---|
| 104 | link = '%s?redirect_url=%s&display_url=%s' % ( internal_url, redirect_url, display_url )
|
---|
| 105 | ret_val.append( (site_name, link) )
|
---|
| 106 | return ret_val
|
---|
| 107 |
|
---|
| 108 | def make_html_table( self, dataset, skipchars=[] ):
|
---|
| 109 | """
|
---|
| 110 | Create HTML table, used for displaying peek
|
---|
| 111 | """
|
---|
| 112 | npeek = 5
|
---|
| 113 | out = ['<table cellspacing="0" cellpadding="3">']
|
---|
| 114 | f = open(dataset.file_name,'r')
|
---|
| 115 | d = f.readlines()[:5]
|
---|
| 116 | if len(d) == 0:
|
---|
| 117 | out = "Cannot find anything to parse in %s" % dataset.name
|
---|
| 118 | return out
|
---|
| 119 | hasheader = 0
|
---|
| 120 | try:
|
---|
| 121 | test = ['%f' % x for x in d[0][1:]] # first is name - see if starts all numerics
|
---|
| 122 | except:
|
---|
| 123 | hasheader = 1
|
---|
| 124 | try:
|
---|
| 125 | # Generate column header
|
---|
| 126 | out.append( '<tr>' )
|
---|
| 127 | if hasheader:
|
---|
| 128 | for i, name in enumerate(d[0].split() ):
|
---|
| 129 | out.append( '<th>%s.%s</th>' % ( str( i+1 ), name ) )
|
---|
| 130 | d.pop(0)
|
---|
| 131 | out.append('</tr>')
|
---|
| 132 | for row in d:
|
---|
| 133 | out.append('<tr>')
|
---|
| 134 | out.append(''.join(['<td>%s</td>' % x for x in row.split()]))
|
---|
| 135 | out.append('</tr>')
|
---|
| 136 | out.append( '</table>' )
|
---|
| 137 | out = "".join( out )
|
---|
| 138 | except Exception, exc:
|
---|
| 139 | out = "Can't create peek %s" % exc
|
---|
| 140 | return out
|
---|
| 141 |
|
---|
| 142 | def validate( self, dataset ):
|
---|
| 143 | """
|
---|
| 144 | Validate a gg file - all numeric after header row
|
---|
| 145 | """
|
---|
| 146 | errors = list()
|
---|
| 147 | infile = open(dataset.file_name, "r")
|
---|
| 148 | header= infile.next() # header
|
---|
| 149 | for i,row in enumerate(infile):
|
---|
| 150 | ll = row.strip().split('\t')[1:] # first is alpha feature identifier
|
---|
| 151 | badvals = []
|
---|
| 152 | for j,x in enumerate(ll):
|
---|
| 153 | try:
|
---|
| 154 | x = float(x)
|
---|
| 155 | except:
|
---|
| 156 | badval.append('col%d:%s' % (j+1,x))
|
---|
| 157 | if len(badvals) > 0:
|
---|
| 158 | errors.append('row %d, %s' % (' '.join(badvals)))
|
---|
| 159 | return errors
|
---|
| 160 |
|
---|
| 161 | def sniff( self, filename ):
|
---|
| 162 | """
|
---|
| 163 | Determines whether the file is in gg format
|
---|
| 164 | """
|
---|
| 165 | f = open(filename,'r')
|
---|
| 166 | headers = f.readline().split()
|
---|
| 167 | rows = [f.readline().split()[1:] for x in range(3)] # small sample
|
---|
| 168 | #headers = get_headers( filename, '\t' )
|
---|
| 169 | for row in rows:
|
---|
| 170 | try:
|
---|
| 171 | nums = [float(x) for x in row] # first col has been removed
|
---|
| 172 | except:
|
---|
| 173 | return false
|
---|
| 174 | return true
|
---|
| 175 |
|
---|
| 176 | def get_mime(self):
|
---|
| 177 | """Returns the mime type of the datatype"""
|
---|
| 178 | return 'application/vnd.ms-excel'
|
---|
| 179 |
|
---|
| 180 |
|
---|
| 181 | class rgTabList(Tabular):
|
---|
| 182 | """
|
---|
| 183 | for sampleid and for featureid lists of exclusions or inclusions in the clean tool
|
---|
| 184 | featureid subsets on statistical criteria -> specialized display such as gg
|
---|
| 185 | """
|
---|
| 186 | file_ext = "rgTList"
|
---|
| 187 |
|
---|
| 188 |
|
---|
| 189 | def __init__(self, **kwd):
|
---|
| 190 | """
|
---|
| 191 | Initialize featurelistt datatype
|
---|
| 192 | """
|
---|
| 193 | Tabular.__init__( self, **kwd )
|
---|
| 194 | self.column_names = []
|
---|
| 195 |
|
---|
| 196 | def make_html_table( self, dataset, skipchars=[] ):
|
---|
| 197 | """
|
---|
| 198 | Create HTML table, used for displaying peek
|
---|
| 199 | """
|
---|
| 200 | out = ['<table cellspacing="0" cellpadding="3">']
|
---|
| 201 | comments = []
|
---|
| 202 | try:
|
---|
| 203 | # Generate column header
|
---|
| 204 | out.append( '<tr>' )
|
---|
| 205 | for i, name in enumerate( self.column_names ):
|
---|
| 206 | out.append( '<th>%s.%s</th>' % ( str( i+1 ), name ) )
|
---|
| 207 | if dataset.metadata.columns - len( self.column_names ) > 0:
|
---|
| 208 | for i in range( len( self.column_names ), dataset.metadata.columns ):
|
---|
| 209 | out.append( '<th>%s</th>' % str( i+1 ) )
|
---|
| 210 | out.append( '</tr>' )
|
---|
| 211 | out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
|
---|
| 212 | out.append( '</table>' )
|
---|
| 213 | out = "".join( out )
|
---|
| 214 | except Exception, exc:
|
---|
| 215 | out = "Can't create peek %s" % exc
|
---|
| 216 | return out
|
---|
| 217 |
|
---|
| 218 | def get_mime(self):
|
---|
| 219 | """Returns the mime type of the datatype"""
|
---|
| 220 | return 'text/html'
|
---|
| 221 |
|
---|
| 222 |
|
---|
| 223 | class rgSampleList(rgTabList):
|
---|
| 224 | """
|
---|
| 225 | for sampleid exclusions or inclusions in the clean tool
|
---|
| 226 | output from QC eg excess het, gender error, ibd pair member,eigen outlier,excess mendel errors,...
|
---|
| 227 | since they can be uploaded, should be flexible
|
---|
| 228 | but they are persistent at least
|
---|
| 229 | same infrastructure for expression?
|
---|
| 230 | """
|
---|
| 231 | file_ext = "rgSList"
|
---|
| 232 |
|
---|
| 233 | def __init__(self, **kwd):
|
---|
| 234 | """
|
---|
| 235 | Initialize samplelist datatype
|
---|
| 236 | """
|
---|
| 237 | rgTabList.__init__( self, **kwd )
|
---|
| 238 | self.column_names[0] = 'FID'
|
---|
| 239 | self.column_names[1] = 'IID'
|
---|
| 240 | # this is what Plink wants as at 2009
|
---|
| 241 |
|
---|
| 242 | def sniff(self,filename):
|
---|
| 243 | infile = open(dataset.file_name, "r")
|
---|
| 244 | header= infile.next() # header
|
---|
| 245 | if header[0] == 'FID' and header[1] == 'IID':
|
---|
| 246 | return True
|
---|
| 247 | else:
|
---|
| 248 | return False
|
---|
| 249 |
|
---|
| 250 | class rgFeatureList( rgTabList ):
|
---|
| 251 | """
|
---|
| 252 | for featureid lists of exclusions or inclusions in the clean tool
|
---|
| 253 | output from QC eg low maf, high missingness, bad hwe in controls, excess mendel errors,...
|
---|
| 254 | featureid subsets on statistical criteria -> specialized display such as gg
|
---|
| 255 | same infrastructure for expression?
|
---|
| 256 | """
|
---|
| 257 | file_ext = "rgFList"
|
---|
| 258 |
|
---|
| 259 | def __init__(self, **kwd):
|
---|
| 260 | """Initialize featurelist datatype"""
|
---|
| 261 | rgTabList.__init__( self, **kwd )
|
---|
| 262 | for i,s in enumerate(['#FeatureId', 'Chr', 'Genpos', 'Mappos']):
|
---|
| 263 | self.column_names[i] = s
|
---|
| 264 |
|
---|
| 265 |
|
---|
| 266 | class Rgenetics(Html):
|
---|
| 267 | """
|
---|
| 268 | base class to use for rgenetics datatypes
|
---|
| 269 | derived from html - composite datatype elements
|
---|
| 270 | stored in extra files path
|
---|
| 271 | """
|
---|
| 272 |
|
---|
| 273 | MetadataElement( name="base_name", desc="base name for all transformed versions of this genetic dataset", default='RgeneticsData',
|
---|
| 274 | readonly=True, set_in_upload=True)
|
---|
| 275 |
|
---|
| 276 | composite_type = 'auto_primary_file'
|
---|
| 277 | allow_datatype_change = False
|
---|
| 278 | file_ext = 'rgenetics'
|
---|
| 279 |
|
---|
| 280 | def generate_primary_file( self, dataset = None ):
|
---|
| 281 | rval = ['<html><head><title>Rgenetics Galaxy Composite Dataset </title></head><p/>']
|
---|
| 282 | rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
|
---|
| 283 | for composite_name, composite_file in self.get_composite_files( dataset = dataset ).iteritems():
|
---|
| 284 | fn = composite_name
|
---|
| 285 | opt_text = ''
|
---|
| 286 | if composite_file.optional:
|
---|
| 287 | opt_text = ' (optional)'
|
---|
| 288 | if composite_file.get('description'):
|
---|
| 289 | rval.append( '<li><a href="%s" type="application/binary">%s (%s)</a>%s</li>' % ( fn, fn, composite_file.get('description'), opt_text ) )
|
---|
| 290 | else:
|
---|
| 291 | rval.append( '<li><a href="%s" type="application/binary">%s</a>%s</li>' % ( fn, fn, opt_text ) )
|
---|
| 292 | rval.append( '</ul></div></html>' )
|
---|
| 293 | return "\n".join( rval )
|
---|
| 294 |
|
---|
| 295 | def regenerate_primary_file(self,dataset):
|
---|
| 296 | """
|
---|
| 297 | cannot do this until we are setting metadata
|
---|
| 298 | """
|
---|
| 299 | bn = dataset.metadata.base_name
|
---|
| 300 | efp = dataset.extra_files_path
|
---|
| 301 | flist = os.listdir(efp)
|
---|
| 302 | rval = ['<html><head><title>Files for Composite Dataset %s</title></head><body><p/>Composite %s contains:<p/><ul>' % (dataset.name,dataset.name)]
|
---|
| 303 | for i,fname in enumerate(flist):
|
---|
| 304 | sfname = os.path.split(fname)[-1]
|
---|
| 305 | f,e = os.path.splitext(fname)
|
---|
| 306 | rval.append( '<li><a href="%s">%s</a></li>' % ( sfname, sfname) )
|
---|
| 307 | rval.append( '</ul></body></html>' )
|
---|
| 308 | f = file(dataset.file_name,'w')
|
---|
| 309 | f.write("\n".join( rval ))
|
---|
| 310 | f.write('\n')
|
---|
| 311 | f.close()
|
---|
| 312 |
|
---|
| 313 | def get_mime(self):
|
---|
| 314 | """Returns the mime type of the datatype"""
|
---|
| 315 | return 'text/html'
|
---|
| 316 |
|
---|
| 317 |
|
---|
| 318 | def set_meta( self, dataset, **kwd ):
|
---|
| 319 |
|
---|
| 320 | """
|
---|
| 321 | for lped/pbed eg
|
---|
| 322 |
|
---|
| 323 | """
|
---|
| 324 | Html.set_meta( self, dataset, **kwd )
|
---|
| 325 | if kwd.get('overwrite') == False:
|
---|
| 326 | if verbose:
|
---|
| 327 | gal_Log.debug('@@@ rgenetics set_meta called with overwrite = False')
|
---|
| 328 | return True
|
---|
| 329 | try:
|
---|
| 330 | efp = dataset.extra_files_path
|
---|
| 331 | except:
|
---|
| 332 | if verbose:
|
---|
| 333 | gal_Log.debug('@@@rgenetics set_meta failed %s - dataset %s has no efp ?' % (sys.exc_info()[0], dataset.name))
|
---|
| 334 | return False
|
---|
| 335 | try:
|
---|
| 336 | flist = os.listdir(efp)
|
---|
| 337 | except:
|
---|
| 338 | if verbose: gal_Log.debug('@@@rgenetics set_meta failed %s - dataset %s has no efp ?' % (sys.exc_info()[0],dataset.name))
|
---|
| 339 | return False
|
---|
| 340 | if len(flist) == 0:
|
---|
| 341 | if verbose:
|
---|
| 342 | gal_Log.debug('@@@rgenetics set_meta failed - %s efp %s is empty?' % (dataset.name,efp))
|
---|
| 343 | return False
|
---|
| 344 | self.regenerate_primary_file(dataset)
|
---|
| 345 | if not dataset.info:
|
---|
| 346 | dataset.info = 'Galaxy genotype datatype object'
|
---|
| 347 | if not dataset.blurb:
|
---|
| 348 | dataset.blurb = 'Composite file - Rgenetics Galaxy toolkit'
|
---|
| 349 | return True
|
---|
| 350 |
|
---|
| 351 |
|
---|
| 352 |
|
---|
| 353 | class SNPMatrix(Rgenetics):
|
---|
| 354 | """
|
---|
| 355 | BioC SNPMatrix Rgenetics data collections
|
---|
| 356 | """
|
---|
| 357 | file_ext="snpmatrix"
|
---|
| 358 |
|
---|
| 359 | def set_peek( self, dataset, **kwd ):
|
---|
| 360 | if not dataset.dataset.purged:
|
---|
| 361 | dataset.peek = "Binary RGenetics file"
|
---|
| 362 | dataset.blurb = data.nice_size( dataset.get_size() )
|
---|
| 363 | else:
|
---|
| 364 | dataset.peek = 'file does not exist'
|
---|
| 365 | dataset.blurb = 'file purged from disk'
|
---|
| 366 |
|
---|
| 367 | def sniff(self,filename):
|
---|
| 368 | """ need to check the file header hex code
|
---|
| 369 | """
|
---|
| 370 | infile = open(dataset.file_name, "b")
|
---|
| 371 | head = infile.read(16)
|
---|
| 372 | head = [hex(x) for x in head]
|
---|
| 373 | if head <> '':
|
---|
| 374 | return False
|
---|
| 375 | else:
|
---|
| 376 | return True
|
---|
| 377 |
|
---|
| 378 |
|
---|
| 379 | class Lped(Rgenetics):
|
---|
| 380 | """
|
---|
| 381 | linkage pedigree (ped,map) Rgenetics data collections
|
---|
| 382 | """
|
---|
| 383 | file_ext="lped"
|
---|
| 384 |
|
---|
| 385 | def __init__( self, **kwd ):
|
---|
| 386 | Rgenetics.__init__(self, **kwd)
|
---|
| 387 | self.add_composite_file( '%s.ped', description = 'Pedigree File', substitute_name_with_metadata = 'base_name', is_binary = False )
|
---|
| 388 | self.add_composite_file( '%s.map', description = 'Map File', substitute_name_with_metadata = 'base_name', is_binary = False )
|
---|
| 389 |
|
---|
| 390 |
|
---|
| 391 | class Pphe(Rgenetics):
|
---|
| 392 | """
|
---|
| 393 | Plink phenotype file - header must have FID\tIID... Rgenetics data collections
|
---|
| 394 | """
|
---|
| 395 | file_ext="pphe"
|
---|
| 396 |
|
---|
| 397 | def __init__( self, **kwd ):
|
---|
| 398 | Rgenetics.__init__(self, **kwd)
|
---|
| 399 | self.add_composite_file( '%s.pphe', description = 'Plink Phenotype File', substitute_name_with_metadata = 'base_name', is_binary = False )
|
---|
| 400 |
|
---|
| 401 |
|
---|
| 402 |
|
---|
| 403 |
|
---|
| 404 | class Fphe(Rgenetics):
|
---|
| 405 | """
|
---|
| 406 | fbat pedigree file - mad format with ! as first char on header row
|
---|
| 407 | Rgenetics data collections
|
---|
| 408 | """
|
---|
| 409 | file_ext="fphe"
|
---|
| 410 |
|
---|
| 411 | def __init__( self, **kwd ):
|
---|
| 412 | Rgenetics.__init__(self, **kwd)
|
---|
| 413 | self.add_composite_file( '%s.fphe', description = 'FBAT Phenotype File', substitute_name_with_metadata = 'base_name' )
|
---|
| 414 |
|
---|
| 415 | class Phe(Rgenetics):
|
---|
| 416 | """
|
---|
| 417 | Phenotype file
|
---|
| 418 | """
|
---|
| 419 | file_ext="phe"
|
---|
| 420 |
|
---|
| 421 | def __init__( self, **kwd ):
|
---|
| 422 | Rgenetics.__init__(self, **kwd)
|
---|
| 423 | self.add_composite_file( '%s.phe', description = 'Phenotype File', substitute_name_with_metadata = 'base_name',
|
---|
| 424 | is_binary = False )
|
---|
| 425 |
|
---|
| 426 |
|
---|
| 427 |
|
---|
| 428 | class Fped(Rgenetics):
|
---|
| 429 | """
|
---|
| 430 | FBAT pedigree format - single file, map is header row of rs numbers. Strange.
|
---|
| 431 | Rgenetics data collections
|
---|
| 432 | """
|
---|
| 433 | file_ext="fped"
|
---|
| 434 |
|
---|
| 435 | def __init__( self, **kwd ):
|
---|
| 436 | Rgenetics.__init__(self, **kwd)
|
---|
| 437 | self.add_composite_file( '%s.fped', description = 'FBAT format pedfile', substitute_name_with_metadata = 'base_name',
|
---|
| 438 | is_binary = False )
|
---|
| 439 |
|
---|
| 440 |
|
---|
| 441 | class Pbed(Rgenetics):
|
---|
| 442 | """
|
---|
| 443 | Plink Binary compressed 2bit/geno Rgenetics data collections
|
---|
| 444 | """
|
---|
| 445 | file_ext="pbed"
|
---|
| 446 |
|
---|
| 447 | def __init__( self, **kwd ):
|
---|
| 448 | Rgenetics.__init__(self, **kwd)
|
---|
| 449 | self.add_composite_file( '%s.bim', substitute_name_with_metadata = 'base_name', is_binary = False )
|
---|
| 450 | self.add_composite_file( '%s.bed', substitute_name_with_metadata = 'base_name', is_binary = True )
|
---|
| 451 | self.add_composite_file( '%s.fam', substitute_name_with_metadata = 'base_name', is_binary = False )
|
---|
| 452 |
|
---|
| 453 | class ldIndep(Rgenetics):
|
---|
| 454 | """
|
---|
| 455 | LD (a good measure of redundancy of information) depleted Plink Binary compressed 2bit/geno
|
---|
| 456 | This is really a plink binary, but some tools work better with less redundancy so are constrained to
|
---|
| 457 | these files
|
---|
| 458 | """
|
---|
| 459 | file_ext="ldreduced"
|
---|
| 460 |
|
---|
| 461 | def __init__( self, **kwd ):
|
---|
| 462 | Rgenetics.__init__(self, **kwd)
|
---|
| 463 | self.add_composite_file( '%s.bim', substitute_name_with_metadata = 'base_name', is_binary = False )
|
---|
| 464 | self.add_composite_file( '%s.bed', substitute_name_with_metadata = 'base_name', is_binary = True )
|
---|
| 465 | self.add_composite_file( '%s.fam', substitute_name_with_metadata = 'base_name', is_binary = False )
|
---|
| 466 |
|
---|
| 467 |
|
---|
| 468 | class Eigenstratgeno(Rgenetics):
|
---|
| 469 | """
|
---|
| 470 | Eigenstrat format - may be able to get rid of this
|
---|
| 471 | if we move to shellfish
|
---|
| 472 | Rgenetics data collections
|
---|
| 473 | """
|
---|
| 474 | file_ext="eigenstratgeno"
|
---|
| 475 |
|
---|
| 476 | def __init__( self, **kwd ):
|
---|
| 477 | Rgenetics.__init__(self, **kwd)
|
---|
| 478 | self.add_composite_file( '%s.eigenstratgeno', substitute_name_with_metadata = 'base_name', is_binary = False )
|
---|
| 479 | self.add_composite_file( '%s.ind', substitute_name_with_metadata = 'base_name', is_binary = False )
|
---|
| 480 | self.add_composite_file( '%s.map', substitute_name_with_metadata = 'base_name', is_binary = False )
|
---|
| 481 |
|
---|
| 482 |
|
---|
| 483 |
|
---|
| 484 | class Eigenstratpca(Rgenetics):
|
---|
| 485 | """
|
---|
| 486 | Eigenstrat PCA file for case control adjustment
|
---|
| 487 | Rgenetics data collections
|
---|
| 488 | """
|
---|
| 489 | file_ext="eigenstratpca"
|
---|
| 490 |
|
---|
| 491 | def __init__( self, **kwd ):
|
---|
| 492 | Rgenetics.__init__(self, **kwd)
|
---|
| 493 | self.add_composite_file( '%s.eigenstratpca', description = 'Eigenstrat PCA file', substitute_name_with_metadata = 'base_name' )
|
---|
| 494 |
|
---|
| 495 |
|
---|
| 496 | class Snptest(Rgenetics):
|
---|
| 497 | """
|
---|
| 498 | BioC snptest Rgenetics data collections
|
---|
| 499 | """
|
---|
| 500 | file_ext="snptest"
|
---|
| 501 |
|
---|
| 502 |
|
---|
| 503 | class Pheno(Tabular):
|
---|
| 504 | """
|
---|
| 505 | base class for pheno files
|
---|
| 506 | """
|
---|
| 507 | file_ext = 'pheno'
|
---|
| 508 |
|
---|
| 509 |
|
---|
| 510 | class RexpBase( Html ):
|
---|
| 511 | """
|
---|
| 512 | base class for BioC data structures in Galaxy
|
---|
| 513 | must be constructed with the pheno data in place since that
|
---|
| 514 | goes into the metadata for each instance
|
---|
| 515 | """
|
---|
| 516 | MetadataElement( name="columns", default=0, desc="Number of columns", visible=True )
|
---|
| 517 | MetadataElement( name="column_names", default=[], desc="Column names", visible=True )
|
---|
| 518 | MetadataElement(name="pheCols",default=[],desc="Select list for potentially interesting variables",visible=True)
|
---|
| 519 | MetadataElement( name="base_name",
|
---|
| 520 | desc="base name for all transformed versions of this expression dataset", default='rexpression', set_in_upload=True)
|
---|
| 521 | MetadataElement( name="pheno_path", desc="Path to phenotype data for this experiment", default="rexpression.pheno", visible=True)
|
---|
| 522 | file_ext = 'rexpbase'
|
---|
| 523 | html_table = None
|
---|
| 524 | is_binary = True
|
---|
| 525 | composite_type = 'auto_primary_file'
|
---|
| 526 | allow_datatype_change = False
|
---|
| 527 |
|
---|
| 528 |
|
---|
| 529 | def __init__( self, **kwd ):
|
---|
| 530 | Html.__init__(self,**kwd)
|
---|
| 531 | self.add_composite_file( '%s.pheno', description = 'Phenodata tab text file',
|
---|
| 532 | substitute_name_with_metadata = 'base_name', is_binary=False)
|
---|
| 533 |
|
---|
| 534 | def generate_primary_file( self, dataset = None ):
|
---|
| 535 | """
|
---|
| 536 | This is called only at upload to write the html file
|
---|
| 537 | cannot rename the datasets here - they come with the default unfortunately
|
---|
| 538 | """
|
---|
| 539 | return '<html><head></head><body>AutoGenerated Primary File for Composite Dataset</body></html>'
|
---|
| 540 |
|
---|
| 541 | def get_mime(self):
|
---|
| 542 | """Returns the mime type of the datatype"""
|
---|
| 543 | return 'text/html'
|
---|
| 544 |
|
---|
| 545 | def get_phecols(self, phenolist=[], maxConc=20):
|
---|
| 546 | """
|
---|
| 547 | sept 2009: cannot use whitespace to split - make a more complex structure here
|
---|
| 548 | and adjust the methods that rely on this structure
|
---|
| 549 | return interesting phenotype column names for an rexpression eset or affybatch
|
---|
| 550 | to use in array subsetting and so on. Returns a data structure for a
|
---|
| 551 | dynamic Galaxy select parameter.
|
---|
| 552 | A column with only 1 value doesn't change, so is not interesting for
|
---|
| 553 | analysis. A column with a different value in every row is equivalent to a unique
|
---|
| 554 | identifier so is also not interesting for anova or limma analysis - both these
|
---|
| 555 | are removed after the concordance (count of unique terms) is constructed for each
|
---|
| 556 | column. Then a complication - each remaining pair of columns is tested for
|
---|
| 557 | redundancy - if two columns are always paired, then only one is needed :)
|
---|
| 558 | """
|
---|
| 559 | for nrows,row in enumerate(phenolist): # construct concordance
|
---|
| 560 | if len(row.strip()) == 0:
|
---|
| 561 | break
|
---|
| 562 | row = row.strip().split('\t')
|
---|
| 563 | if nrows == 0: # set up from header
|
---|
| 564 | head = row
|
---|
| 565 | totcols = len(row)
|
---|
| 566 | concordance = [{} for x in head] # list of dicts
|
---|
| 567 | else:
|
---|
| 568 | for col,code in enumerate(row): # keep column order correct
|
---|
| 569 | if col >= totcols:
|
---|
| 570 | gal_Log.warning('### get_phecols error in pheno file - row %d col %d (%s) longer than header %s' % (nrows, col, row, head))
|
---|
| 571 | else:
|
---|
| 572 | concordance[col].setdefault(code,0) # first one is zero
|
---|
| 573 | concordance[col][code] += 1
|
---|
| 574 | useCols = []
|
---|
| 575 | useConc = [] # columns of interest to keep
|
---|
| 576 | nrows = len(phenolist)
|
---|
| 577 | nrows -= 1 # drop head from count
|
---|
| 578 | for c,conc in enumerate(concordance): # c is column number
|
---|
| 579 | if (len(conc) > 1) and (len(conc) < min(nrows,maxConc)): # not all same and not all different!!
|
---|
| 580 | useConc.append(conc) # keep concordance
|
---|
| 581 | useCols.append(c) # keep column
|
---|
| 582 | nuse = len(useCols)
|
---|
| 583 | # now to check for pairs of concordant columns - drop one of these.
|
---|
| 584 | delme = []
|
---|
| 585 | p = phenolist[1:] # drop header
|
---|
| 586 | plist = [x.strip().split('\t') for x in p] # list of lists
|
---|
| 587 | phe = [[x[i] for i in useCols] for x in plist if len(x) >= totcols] # strip unused data
|
---|
| 588 | for i in range(0,(nuse-1)): # for each interesting column
|
---|
| 589 | for j in range(i+1,nuse):
|
---|
| 590 | kdict = {}
|
---|
| 591 | for row in phe: # row is a list of lists
|
---|
| 592 | k = '%s%s' % (row[i],row[j]) # composite key
|
---|
| 593 | kdict[k] = k
|
---|
| 594 | if (len(kdict.keys()) == len(concordance[useCols[j]])): # i and j are always matched
|
---|
| 595 | delme.append(j)
|
---|
| 596 | delme = list(set(delme)) # remove dupes
|
---|
| 597 | listCol = []
|
---|
| 598 | delme.sort()
|
---|
| 599 | delme.reverse() # must delete from far end!
|
---|
| 600 | for i in delme:
|
---|
| 601 | del useConc[i] # get rid of concordance
|
---|
| 602 | del useCols[i] # and usecols entry
|
---|
| 603 | for i,conc in enumerate(useConc): # these are all unique columns for the design matrix
|
---|
| 604 | ccounts = [(conc.get(code,0),code) for code in conc.keys()] # decorate
|
---|
| 605 | ccounts.sort()
|
---|
| 606 | cc = [(x[1],x[0]) for x in ccounts] # list of code count tuples
|
---|
| 607 | codeDetails = (head[useCols[i]],cc) # ('foo',[('a',3),('b',11),..])
|
---|
| 608 | listCol.append(codeDetails)
|
---|
| 609 | if len(listCol) > 0:
|
---|
| 610 | res = listCol
|
---|
| 611 | # metadata.pheCols becomes [('bar;22,zot;113','foo'), ...]
|
---|
| 612 | else:
|
---|
| 613 | res = [('no usable phenotype columns found',[('?',0),]),]
|
---|
| 614 | return res
|
---|
| 615 |
|
---|
| 616 |
|
---|
| 617 |
|
---|
| 618 | def get_pheno(self,dataset):
|
---|
| 619 | """
|
---|
| 620 | expects a .pheno file in the extra_files_dir - ugh
|
---|
| 621 | note that R is wierd and adds the row.name in
|
---|
| 622 | the header so the columns are all wrong - unless you tell it not to.
|
---|
| 623 | A file can be written as
|
---|
| 624 | write.table(file='foo.pheno',pData(foo),sep='\t',quote=F,row.names=F)
|
---|
| 625 | """
|
---|
| 626 | p = file(dataset.metadata.pheno_path,'r').readlines()
|
---|
| 627 | if len(p) > 0: # should only need to fix an R pheno file once
|
---|
| 628 | head = p[0].strip().split('\t')
|
---|
| 629 | line1 = p[1].strip().split('\t')
|
---|
| 630 | if len(head) < len(line1):
|
---|
| 631 | head.insert(0,'ChipFileName') # fix R write.table b0rken-ness
|
---|
| 632 | p[0] = '\t'.join(head)
|
---|
| 633 | else:
|
---|
| 634 | p = []
|
---|
| 635 | return '\n'.join(p)
|
---|
| 636 |
|
---|
| 637 | def set_peek( self, dataset, **kwd ):
|
---|
| 638 | """
|
---|
| 639 | expects a .pheno file in the extra_files_dir - ugh
|
---|
| 640 | note that R is wierd and does not include the row.name in
|
---|
| 641 | the header. why?"""
|
---|
| 642 | if not dataset.dataset.purged:
|
---|
| 643 | pp = os.path.join(dataset.extra_files_path,'%s.pheno' % dataset.metadata.base_name)
|
---|
| 644 | try:
|
---|
| 645 | p = file(pp,'r').readlines()
|
---|
| 646 | except:
|
---|
| 647 | p = ['##failed to find %s' % pp,]
|
---|
| 648 | dataset.peek = ''.join(p[:5])
|
---|
| 649 | dataset.blurb = 'Galaxy Rexpression composite file'
|
---|
| 650 | else:
|
---|
| 651 | dataset.peek = 'file does not exist\n'
|
---|
| 652 | dataset.blurb = 'file purged from disk'
|
---|
| 653 |
|
---|
| 654 | def get_peek( self, dataset ):
|
---|
| 655 | """
|
---|
| 656 | expects a .pheno file in the extra_files_dir - ugh
|
---|
| 657 | """
|
---|
| 658 | pp = os.path.join(dataset.extra_files_path,'%s.pheno' % dataset.metadata.base_name)
|
---|
| 659 | try:
|
---|
| 660 | p = file(pp,'r').readlines()
|
---|
| 661 | except:
|
---|
| 662 | p = ['##failed to find %s' % pp]
|
---|
| 663 | return ''.join(p[:5])
|
---|
| 664 |
|
---|
| 665 | def get_file_peek(self,filename):
|
---|
| 666 | """
|
---|
| 667 | can't really peek at a filename - need the extra_files_path and such?
|
---|
| 668 | """
|
---|
| 669 | h = '## rexpression get_file_peek: no file found'
|
---|
| 670 | try:
|
---|
| 671 | h = file(filename,'r').readlines()
|
---|
| 672 | except:
|
---|
| 673 | pass
|
---|
| 674 | return ''.join(h[:5])
|
---|
| 675 |
|
---|
| 676 | def regenerate_primary_file(self,dataset):
|
---|
| 677 | """
|
---|
| 678 | cannot do this until we are setting metadata
|
---|
| 679 | """
|
---|
| 680 | bn = dataset.metadata.base_name
|
---|
| 681 | flist = os.listdir(dataset.extra_files_path)
|
---|
| 682 | rval = ['<html><head><title>Files for Composite Dataset %s</title></head><p/>Comprises the following files:<p/><ul>' % (bn)]
|
---|
| 683 | for i,fname in enumerate(flist):
|
---|
| 684 | sfname = os.path.split(fname)[-1]
|
---|
| 685 | rval.append( '<li><a href="%s">%s</a>' % ( sfname, sfname ) )
|
---|
| 686 | rval.append( '</ul></html>' )
|
---|
| 687 | f = file(dataset.file_name,'w')
|
---|
| 688 | f.write("\n".join( rval ))
|
---|
| 689 | f.write('\n')
|
---|
| 690 | f.close()
|
---|
| 691 |
|
---|
| 692 | def init_meta( self, dataset, copy_from=None ):
|
---|
| 693 | if copy_from:
|
---|
| 694 | dataset.metadata = copy_from.metadata
|
---|
| 695 |
|
---|
| 696 | def set_meta( self, dataset, **kwd ):
|
---|
| 697 |
|
---|
| 698 | """
|
---|
| 699 | NOTE we apply the tabular machinary to the phenodata extracted
|
---|
| 700 | from a BioC eSet or affybatch.
|
---|
| 701 |
|
---|
| 702 | """
|
---|
| 703 | Html.set_meta(self, dataset, **kwd)
|
---|
| 704 | try:
|
---|
| 705 | flist = os.listdir(dataset.extra_files_path)
|
---|
| 706 | except:
|
---|
| 707 | if verbose:
|
---|
| 708 | gal_Log.debug('@@@rexpression set_meta failed - no dataset?')
|
---|
| 709 | return False
|
---|
| 710 | bn = dataset.metadata.base_name
|
---|
| 711 | if not bn:
|
---|
| 712 | for f in flist:
|
---|
| 713 | n = os.path.splitext(f)[0]
|
---|
| 714 | bn = n
|
---|
| 715 | dataset.metadata.base_name = bn
|
---|
| 716 | if not bn:
|
---|
| 717 | bn = '?'
|
---|
| 718 | dataset.metadata.base_name = bn
|
---|
| 719 | pn = '%s.pheno' % (bn)
|
---|
| 720 | pp = os.path.join(dataset.extra_files_path,pn)
|
---|
| 721 | dataset.metadata.pheno_path=pp
|
---|
| 722 | try:
|
---|
| 723 | pf = file(pp,'r').readlines() # read the basename.phenodata in the extra_files_path
|
---|
| 724 | except:
|
---|
| 725 | pf = None
|
---|
| 726 | if pf:
|
---|
| 727 | h = pf[0].strip()
|
---|
| 728 | h = h.split('\t') # hope is header
|
---|
| 729 | h = [escape(x) for x in h]
|
---|
| 730 | dataset.metadata.column_names = h
|
---|
| 731 | dataset.metadata.columns = len(h)
|
---|
| 732 | dataset.peek = ''.join(pf[:5])
|
---|
| 733 | else:
|
---|
| 734 | dataset.metadata.column_names = []
|
---|
| 735 | dataset.metadata.columns = 0
|
---|
| 736 | dataset.peek = 'No pheno file found'
|
---|
| 737 | if pf and len(pf) > 1:
|
---|
| 738 | dataset.metadata.pheCols = self.get_phecols(phenolist=pf)
|
---|
| 739 | else:
|
---|
| 740 | dataset.metadata.pheCols = [('','No useable phenotypes found',False),]
|
---|
| 741 | #self.regenerate_primary_file(dataset)
|
---|
| 742 | if not dataset.info:
|
---|
| 743 | dataset.info = 'Galaxy Expression datatype object'
|
---|
| 744 | if not dataset.blurb:
|
---|
| 745 | dataset.blurb = 'R loadable BioC expression object for the Rexpression Galaxy toolkit'
|
---|
| 746 | return True
|
---|
| 747 |
|
---|
| 748 | def make_html_table( self, pp='nothing supplied from peek\n'):
|
---|
| 749 | """
|
---|
| 750 | Create HTML table, used for displaying peek
|
---|
| 751 | """
|
---|
| 752 | out = ['<table cellspacing="0" cellpadding="3">',]
|
---|
| 753 | p = pp.split('\n')
|
---|
| 754 | try:
|
---|
| 755 | # Generate column header
|
---|
| 756 | for i,row in enumerate(p):
|
---|
| 757 | lrow = row.strip().split('\t')
|
---|
| 758 | if i == 0:
|
---|
| 759 | orow = ['<th>%s</th>' % escape(x) for x in lrow]
|
---|
| 760 | orow.insert(0,'<tr>')
|
---|
| 761 | orow.append('</tr>')
|
---|
| 762 | else:
|
---|
| 763 | orow = ['<td>%s</td>' % escape(x) for x in lrow]
|
---|
| 764 | orow.insert(0,'<tr>')
|
---|
| 765 | orow.append('</tr>')
|
---|
| 766 | out.append(''.join(orow))
|
---|
| 767 | out.append( '</table>' )
|
---|
| 768 | out = "\n".join( out )
|
---|
| 769 | except Exception, exc:
|
---|
| 770 | out = "Can't create html table %s" % str( exc )
|
---|
| 771 | return out
|
---|
| 772 |
|
---|
| 773 | def display_peek( self, dataset ):
|
---|
| 774 | """
|
---|
| 775 | Returns formatted html of peek
|
---|
| 776 | """
|
---|
| 777 | out=self.make_html_table(dataset.peek)
|
---|
| 778 | return out
|
---|
| 779 |
|
---|
| 780 | def get_mime(self):
|
---|
| 781 | """
|
---|
| 782 | Returns the mime type of the datatype
|
---|
| 783 | """
|
---|
| 784 | return 'text/html'
|
---|
| 785 |
|
---|
| 786 |
|
---|
| 787 | class Affybatch( RexpBase ):
|
---|
| 788 | """
|
---|
| 789 | derived class for BioC data structures in Galaxy
|
---|
| 790 | """
|
---|
| 791 |
|
---|
| 792 | file_ext = "affybatch"
|
---|
| 793 |
|
---|
| 794 | def __init__( self, **kwd ):
|
---|
| 795 | RexpBase.__init__(self, **kwd)
|
---|
| 796 | self.add_composite_file( '%s.affybatch', description = 'AffyBatch R object saved to file',
|
---|
| 797 | substitute_name_with_metadata = 'base_name', is_binary=True )
|
---|
| 798 |
|
---|
| 799 | class Eset( RexpBase ):
|
---|
| 800 | """
|
---|
| 801 | derived class for BioC data structures in Galaxy
|
---|
| 802 | """
|
---|
| 803 | file_ext = "eset"
|
---|
| 804 |
|
---|
| 805 | def __init__( self, **kwd ):
|
---|
| 806 | RexpBase.__init__(self, **kwd)
|
---|
| 807 | self.add_composite_file( '%s.eset', description = 'ESet R object saved to file',
|
---|
| 808 | substitute_name_with_metadata = 'base_name', is_binary = True )
|
---|
| 809 |
|
---|
| 810 |
|
---|
| 811 | class MAlist( RexpBase ):
|
---|
| 812 | """
|
---|
| 813 | derived class for BioC data structures in Galaxy
|
---|
| 814 | """
|
---|
| 815 | file_ext = "malist"
|
---|
| 816 |
|
---|
| 817 | def __init__( self, **kwd ):
|
---|
| 818 | RexpBase.__init__(self, **kwd)
|
---|
| 819 | self.add_composite_file( '%s.malist', description = 'MAlist R object saved to file',
|
---|
| 820 | substitute_name_with_metadata = 'base_name', is_binary = True )
|
---|
| 821 |
|
---|
| 822 |
|
---|
| 823 | if __name__ == '__main__':
|
---|
| 824 | import doctest, sys
|
---|
| 825 | doctest.testmod(sys.modules[__name__])
|
---|
| 826 |
|
---|