[2] | 1 | """ |
---|
| 2 | Interval datatypes |
---|
| 3 | """ |
---|
| 4 | |
---|
| 5 | import pkg_resources |
---|
| 6 | pkg_resources.require( "bx-python" ) |
---|
| 7 | |
---|
| 8 | import logging, os, sys, time, tempfile, shutil |
---|
| 9 | import data |
---|
| 10 | from galaxy import util |
---|
| 11 | from galaxy.datatypes.sniff import * |
---|
| 12 | from galaxy.web import url_for |
---|
| 13 | from cgi import escape |
---|
| 14 | import urllib |
---|
| 15 | from bx.intervals.io import * |
---|
| 16 | from galaxy.datatypes import metadata |
---|
| 17 | from galaxy.datatypes.metadata import MetadataElement |
---|
| 18 | from galaxy.datatypes.tabular import Tabular |
---|
| 19 | import math |
---|
| 20 | |
---|
log = logging.getLogger( __name__ )

# Maps each metadata column name to the header words that may denote it.
# Within each list the aliases appear in decreasing order of priority.
alias_spec = {
    'chromCol'  : [ 'chrom' , 'CHROMOSOME' , 'CHROM', 'Chromosome Name' ],
    'startCol'  : [ 'start' , 'START', 'chromStart', 'txStart', 'Start Position (bp)' ],
    'endCol'    : [ 'end'   , 'END'  , 'STOP', 'chromEnd', 'txEnd', 'End Position (bp)' ],
    'strandCol' : [ 'strand', 'STRAND', 'Strand' ],
    'nameCol'   : [ 'name', 'NAME', 'Name', 'name2', 'NAME2', 'Name2', 'Ensembl Gene ID', 'Ensembl Transcript ID', 'Ensembl Peptide ID' ]
}

# Inverted table for faster lookups: header word -> metadata column name.
alias_helper = dict( ( header_word, col_name )
                     for col_name, header_words in alias_spec.items()
                     for header_word in header_words )

# Viewport generation limits: if a single line exceeds
# VIEWPORT_MAX_READS_PER_LINE * VIEWPORT_READLINE_BUFFER_SIZE bytes,
# no viewport is generated for that dataset.
VIEWPORT_READLINE_BUFFER_SIZE = 1048576 # 1MB
VIEWPORT_MAX_READS_PER_LINE = 10
---|
class Interval( Tabular ):
    """Tab delimited data containing interval information"""
    file_ext = "interval"

    """Add metadata elements"""
    # Column numbers below are 1-based; 0 (the no_value) means "not set".
    MetadataElement( name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", param=metadata.ColumnParameter )
    MetadataElement( name="endCol", default=3, desc="End column", param=metadata.ColumnParameter )
    MetadataElement( name="strandCol", desc="Strand column (click box & select)", param=metadata.ColumnParameter, optional=True, no_value=0 )
    MetadataElement( name="nameCol", desc="Name/Identifier column (click box & select)", param=metadata.ColumnParameter, optional=True, no_value=0 )
    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False )

    def __init__(self, **kwd):
        """Initialize interval datatype, by adding UCSC display apps"""
        Tabular.__init__(self, **kwd)
        self.add_display_app ( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )
    def init_meta( self, dataset, copy_from=None ):
        # Delegates to the Tabular base; kept as an explicit override point.
        Tabular.init_meta( self, dataset, copy_from=copy_from )
    def set_peek( self, dataset, line_count=None, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s regions" % util.commaify( str( dataset.metadata.data_lines ) )
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata
                    dataset.blurb = "? regions"
            else:
                dataset.blurb = "%s regions" % util.commaify( str( line_count ) )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
    def set_meta( self, dataset, overwrite = True, first_line_is_header = False, **kwd ):
        """Tries to guess from the line the location number of the column for the chromosome, region start-end and strand"""
        Tabular.set_meta( self, dataset, overwrite = overwrite, skip = 0 )
        if dataset.has_data():
            empty_line_count = 0
            num_check_lines = 100 # only check up to this many non empty lines
            for i, line in enumerate( file( dataset.file_name ) ):
                line = line.rstrip( '\r\n' )
                if line:
                    if ( first_line_is_header or line[0] == '#' ):
                        # Header line: map each recognized header word to the
                        # corresponding metadata column via alias_spec.
                        self.init_meta( dataset )
                        line = line.strip( '#' )
                        elems = line.split( '\t' )
                        for meta_name, header_list in alias_spec.iteritems():
                            for header_val in header_list:
                                if header_val in elems:
                                    #found highest priority header to meta_name
                                    setattr( dataset.metadata, meta_name, elems.index( header_val ) + 1 )
                                    break #next meta_name
                        break # Our metadata is set, so break out of the outer loop
                    else:
                        # Header lines in Interval files are optional. For example, BED is Interval but has no header.
                        # We'll make a best guess at the location of the metadata columns.
                        metadata_is_set = False
                        elems = line.split( '\t' )
                        if len( elems ) > 2:
                            # NOTE(review): the loop variable shadows the builtin
                            # 'str' for the remainder of this method.
                            for str in data.col1_startswith:
                                if line.lower().startswith( str ):
                                    if overwrite or not dataset.metadata.element_is_set( 'chromCol' ):
                                        dataset.metadata.chromCol = 1
                                    try:
                                        int( elems[1] )
                                        if overwrite or not dataset.metadata.element_is_set( 'startCol' ):
                                            dataset.metadata.startCol = 2
                                    except:
                                        pass # Metadata default will be used
                                    try:
                                        int( elems[2] )
                                        if overwrite or not dataset.metadata.element_is_set( 'endCol' ):
                                            dataset.metadata.endCol = 3
                                    except:
                                        pass # Metadata default will be used
                                    #we no longer want to guess that this column is the 'name', name must now be set manually for interval files
                                    #we will still guess at the strand, as we can make a more educated guess
                                    #if len( elems ) > 3:
                                    #    try:
                                    #        int( elems[3] )
                                    #    except:
                                    #        if overwrite or not dataset.metadata.element_is_set( 'nameCol' ):
                                    #            dataset.metadata.nameCol = 4
                                    if len( elems ) < 6 or elems[5] not in data.valid_strand:
                                        if overwrite or not dataset.metadata.element_is_set( 'strandCol' ):
                                            dataset.metadata.strandCol = 0
                                    else:
                                        if overwrite or not dataset.metadata.element_is_set( 'strandCol' ):
                                            dataset.metadata.strandCol = 6
                                    metadata_is_set = True
                                    break
                        if metadata_is_set or ( i - empty_line_count ) > num_check_lines:
                            break # Our metadata is set or we examined 100 non-empty lines, so break out of the outer loop
                else:
                    empty_line_count += 1
    def displayable( self, dataset ):
        # A dataset can be displayed externally only when it has data, is in
        # the OK state, and the chrom/start/end metadata columns are all set.
        try:
            return dataset.has_data() \
                and dataset.state == dataset.states.OK \
                and dataset.metadata.columns > 0 \
                and dataset.metadata.data_lines > 0 \
                and dataset.metadata.chromCol \
                and dataset.metadata.startCol \
                and dataset.metadata.endCol
        except:
            return False

    def get_estimated_display_viewport( self, dataset, chrom_col = None, start_col = None, end_col = None ):
        """Return a chrom, start, stop tuple for viewing a file."""
        viewport_feature_count = 100 # viewport should check at least 100 features; excludes comment lines
        max_line_count = max( viewport_feature_count, 500 ) # maximum number of lines to check; includes comment lines
        if not self.displayable( dataset ):
            return ( None, None, None )
        try:
            # If column indexes were not passed, determine from metadata
            # (metadata columns are 1-based; convert to 0-based here).
            if chrom_col is None:
                chrom_col = int( dataset.metadata.chromCol ) - 1
            if start_col is None:
                start_col = int( dataset.metadata.startCol ) - 1
            if end_col is None:
                end_col = int( dataset.metadata.endCol ) - 1
            # Scan lines of file to find a reasonable chromosome and range
            chrom = None
            start = sys.maxint
            end = 0
            max_col = max( chrom_col, start_col, end_col )
            fh = open( dataset.file_name )
            while True:
                # Bounded read so one pathological line cannot consume memory.
                line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                # Stop if at end of file
                if not line:
                    break
                # Skip comment lines
                if not line.startswith( '#' ):
                    try:
                        fields = line.rstrip().split( '\t' )
                        if len( fields ) > max_col:
                            # Only accumulate over lines matching the first
                            # chromosome encountered.
                            if chrom is None or chrom == fields[ chrom_col ]:
                                start = min( start, int( fields[ start_col ] ) )
                                end = max( end, int( fields[ end_col ] ) )
                                # Set chrom last, in case start and end are not integers
                                chrom = fields[ chrom_col ]
                            viewport_feature_count -= 1
                    except Exception, e:
                        # Most likely a non-integer field has been encountered
                        # for start / stop. Just ignore and make sure we finish
                        # reading the line and decrementing the counters.
                        pass
                # Make sure we are at the next new line; a line without a
                # newline means the buffered readline stopped mid-line.
                readline_count = VIEWPORT_MAX_READS_PER_LINE
                while line.rstrip( '\n\r' ) == line:
                    assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id )
                    line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                    if not line: break #EOF
                    readline_count -= 1
                max_line_count -= 1
                if not viewport_feature_count or not max_line_count:
                    #exceeded viewport or total line count to check
                    break
            if chrom is not None:
                return ( chrom, str( start ), str( end ) ) # Necessary to return strings?
        except Exception, e:
            # Unexpected error, possibly missing metadata
            log.exception( "Exception caught attempting to generate viewport for dataset '%d'", dataset.id )
        return ( None, None, None )

    def as_ucsc_display_file( self, dataset, **kwd ):
        """Returns file contents with only the bed data"""
        fd, temp_name = tempfile.mkstemp()
        # Convert 1-based metadata columns to 0-based indexes; strand/name
        # fall back to 0 (unset) and become -1 after conversion.
        c, s, e, t, n = dataset.metadata.chromCol, dataset.metadata.startCol, dataset.metadata.endCol, dataset.metadata.strandCol or 0, dataset.metadata.nameCol or 0
        c, s, e, t, n = int(c)-1, int(s)-1, int(e)-1, int(t)-1, int(n)-1
        if t >= 0: # strand column (should) exists
            for i, elems in enumerate( util.file_iter(dataset.file_name) ):
                strand = "+"
                name = "region_%i" % i
                if n >= 0 and n < len( elems ): name = elems[n]
                if t<len(elems): strand = elems[t]
                # Emit BED6: chrom, start, end, name, score (always '0'), strand.
                tmp = [ elems[c], elems[s], elems[e], name, '0', strand ]
                os.write(fd, '%s\n' % '\t'.join(tmp) )
        elif n >= 0: # name column (should) exists
            for i, elems in enumerate( util.file_iter(dataset.file_name) ):
                name = "region_%i" % i
                if n >= 0 and n < len( elems ): name = elems[n]
                tmp = [ elems[c], elems[s], elems[e], name ]
                os.write(fd, '%s\n' % '\t'.join(tmp) )
        else:
            # Minimal BED3 output.
            for elems in util.file_iter(dataset.file_name):
                tmp = [ elems[c], elems[s], elems[e] ]
                os.write(fd, '%s\n' % '\t'.join(tmp) )
        os.close(fd)
        return open(temp_name)
    def make_html_table( self, dataset, skipchars=[] ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        comments = []
        try:
            # Generate column header; columns matching the configured
            # metadata columns get descriptive labels.
            out.append('<tr>')
            for i in range( 1, dataset.metadata.columns+1 ):
                if i == dataset.metadata.chromCol:
                    out.append( '<th>%s.Chrom</th>' % i )
                elif i == dataset.metadata.startCol:
                    out.append( '<th>%s.Start</th>' % i )
                elif i == dataset.metadata.endCol:
                    out.append( '<th>%s.End</th>' % i )
                elif dataset.metadata.strandCol and i == dataset.metadata.strandCol:
                    out.append( '<th>%s.Strand</th>' % i )
                elif dataset.metadata.nameCol and i == dataset.metadata.nameCol:
                    out.append( '<th>%s.Name</th>' % i )
                else:
                    out.append( '<th>%s</th>' % i )
            out.append('</tr>')
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception, exc:
            out = "Can't create peek %s" % str( exc )
        return out
    def ucsc_links( self, dataset, type, app, base_url ):
        """
        Generate links to UCSC genome browser sites based on the dbkey
        and content of dataset.
        """
        # Filter UCSC sites to only those that are supported by this build and
        # enabled.
        valid_sites = [ ( name, url )
                        for name, url in util.get_ucsc_by_build( dataset.dbkey )
                        if name in app.config.ucsc_display_sites ]
        if not valid_sites:
            return []
        # If there are any valid sites, we need to generate the estimated
        # viewport
        chrom, start, stop = self.get_estimated_display_viewport( dataset )
        if chrom is None:
            return []
        # Accumulate links for valid sites
        ret_val = []
        for site_name, site_url in valid_sites:
            internal_url = url_for( controller='/dataset', dataset_id=dataset.id,
                                    action='display_at', filename='ucsc_' + site_name )
            # HACK: UCSC doesn't support https, so force http even if our URL
            #       scheme is https.  Making this work requires additional
            #       hackery in your upstream proxy.  If UCSC ever supports
            #       https, remove this hack.
            if base_url.startswith( 'https://' ):
                base_url = base_url.replace( 'https', 'http', 1 )
            display_url = urllib.quote_plus( "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at"
                                             % (base_url, url_for( controller='root' ), dataset.id, type) )
            redirect_url = urllib.quote_plus( "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s"
                                              % (site_url, dataset.dbkey, chrom, start, stop ) )
            link = '%s?redirect_url=%s&display_url=%s' % ( internal_url, redirect_url, display_url )
            ret_val.append( ( site_name, link ) )
        return ret_val
    def validate( self, dataset ):
        """Validate an interval file using the bx GenomicIntervalReader"""
        errors = list()
        # Metadata columns are 1-based; bx expects 0-based indexes.
        c, s, e, t = dataset.metadata.chromCol, dataset.metadata.startCol, dataset.metadata.endCol, dataset.metadata.strandCol
        c, s, e, t = int(c)-1, int(s)-1, int(e)-1, int(t)-1
        infile = open(dataset.file_name, "r")
        # GenomicIntervalReader / ParseError come from the bx.intervals.io
        # star import at the top of the file.
        reader = GenomicIntervalReader(
            infile,
            chrom_col = c,
            start_col = s,
            end_col = e,
            strand_col = t)

        # Collect one ParseError per bad line; stop (and close) at EOF.
        while True:
            try:
                reader.next()
            except ParseError, e:
                errors.append(e)
            except StopIteration:
                infile.close()
                return errors

    def repair_methods( self, dataset ):
        """Return options for removing errors along with a description"""
        return [("lines","Remove erroneous lines")]

    def sniff( self, filename ):
        """
        Checks for 'intervalness'

        This format is mostly used by galaxy itself.  Valid interval files should include
        a valid header comment, but this seems to be loosely regulated.

        >>> fname = get_test_fname( 'test_space.txt' )
        >>> Interval().sniff( fname )
        False
        >>> fname = get_test_fname( 'interval.interval' )
        >>> Interval().sniff( fname )
        True
        """
        headers = get_headers( filename, '\t' )
        try:
            """
            If we got here, we already know the file is_column_based and is not bed,
            so we'll just look for some valid data.
            """
            for hdr in headers:
                if hdr and not hdr[0].startswith( '#' ):
                    if len(hdr) < 3:
                        return False
                    try:
                        # Assume chrom start and end are in column positions 1 and 2
                        # respectively ( for 0 based columns )
                        check = int( hdr[1] )
                        check = int( hdr[2] )
                    except:
                        return False
            return True
        except:
            return False

    def get_track_window(self, dataset, data, start, end):
        """
        Assumes the incoming track data is sorted already.
        """
        # Keep every record overlapping the half-open [start, end) window.
        window = list()
        for record in data:
            fields = record.rstrip("\n\r").split("\t")
            record_chrom = fields[dataset.metadata.chromCol-1]
            record_start = int(fields[dataset.metadata.startCol-1])
            record_end = int(fields[dataset.metadata.endCol-1])
            if record_start < end and record_end > start:
                window.append( (record_chrom, record_start, record_end) ) #Yes I did want to use a generator here, but it doesn't work downstream
        return window

    def get_track_resolution( self, dataset, start, end):
        # Interval data has no inherent resolution concept.
        return None
---|
| 377 | |
---|
class BedGraph( Interval ):
    """Tab delimited chrom/start/end/datavalue dataset"""

    file_ext = "bedgraph"

    def get_track_type( self ):
        # Bedgraph renders as a continuous line track backed by an array tree.
        return "LineTrack", {"data": "array_tree"}

    def as_ucsc_display_file( self, dataset, **kwd ):
        """
        Returns file contents as is with no modifications.
        TODO: this is a functional stub and will need to be enhanced moving forward to provide additional support for bedgraph.
        """
        return open( dataset.file_name )

    def get_estimated_display_viewport( self, dataset, chrom_col = 0, start_col = 1, end_col = 2 ):
        """
        Set viewport based on dataset's first 100 lines.
        """
        # Bedgraph column positions are fixed, so default indexes are supplied
        # here instead of being derived from metadata.
        return Interval.get_estimated_display_viewport( self, dataset,
                                                        chrom_col = chrom_col,
                                                        start_col = start_col,
                                                        end_col = end_col )
---|
| 398 | |
---|
class Bed( Interval ):
    """Tab delimited data in BED format"""
    file_ext = "bed"

    """Add metadata elements"""
    MetadataElement( name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", param=metadata.ColumnParameter )
    MetadataElement( name="endCol", default=3, desc="End column", param=metadata.ColumnParameter )
    MetadataElement( name="strandCol", desc="Strand column (click box & select)", param=metadata.ColumnParameter, optional=True, no_value=0 )
    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="viz_filter_cols", default=[4], param=metadata.ColumnParameter, multiple=True )
    ###do we need to repeat these? they are the same as should be inherited from interval type

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Sets the metadata information for datasets previously determined to be in bed format."""
        i = 0
        if dataset.has_data():
            # Scan until the first data line whose first column looks like a
            # chromosome; BED column positions are fixed, so only name and
            # strand need to be resolved.
            for i, line in enumerate( file(dataset.file_name) ):
                metadata_set = False
                line = line.rstrip('\r\n')
                if line and not line.startswith('#'):
                    elems = line.split('\t')
                    if len(elems) > 2:
                        for startswith in data.col1_startswith:
                            if line.lower().startswith( startswith ):
                                if len( elems ) > 3:
                                    if overwrite or not dataset.metadata.element_is_set( 'nameCol' ):
                                        dataset.metadata.nameCol = 4
                                if len(elems) < 6:
                                    if overwrite or not dataset.metadata.element_is_set( 'strandCol' ):
                                        dataset.metadata.strandCol = 0
                                else:
                                    if overwrite or not dataset.metadata.element_is_set( 'strandCol' ):
                                        dataset.metadata.strandCol = 6
                                metadata_set = True
                                break
                if metadata_set: break
        # 'i' is the index of the first recognized data line; earlier lines
        # are skipped by the Tabular metadata scan.
        Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i )

    def as_ucsc_display_file( self, dataset, **kwd ):
        """Returns file contents with only the bed data. If bed 6+, treat as interval."""
        # Inspect only the first data line; if it violates strict BED, fall
        # back to the Interval conversion which rebuilds clean BED output.
        for line in open(dataset.file_name):
            line = line.strip()
            if line == "" or line.startswith("#"):
                continue
            fields = line.split('\t')
            """check to see if this file doesn't conform to strict genome browser accepted bed"""
            try:
                if len(fields) > 12:
                    return Interval.as_ucsc_display_file(self, dataset) #too many fields
                if len(fields) > 6:
                    int(fields[6])
                if len(fields) > 7:
                    int(fields[7])
                if len(fields) > 8:
                    if int(fields[8]) != 0:
                        return Interval.as_ucsc_display_file(self, dataset)
                if len(fields) > 9:
                    int(fields[9])
                if len(fields) > 10:
                    fields2 = fields[10].rstrip(",").split(",") #remove trailing comma and split on comma
                    for field in fields2:
                        int(field)
                if len(fields) > 11:
                    fields2 = fields[11].rstrip(",").split(",") #remove trailing comma and split on comma
                    for field in fields2:
                        int(field)
            except: return Interval.as_ucsc_display_file(self, dataset)
            #only check first line for proper form
            break

        # First line conformed: serve the file unchanged.
        try: return open(dataset.file_name)
        except: return "This item contains no content"

    def sniff( self, filename ):
        """
        Checks for 'bedness'

        BED lines have three required fields and nine additional optional fields.
        The number of fields per line must be consistent throughout any single set of data in
        an annotation track.  The order of the optional fields is binding: lower-numbered
        fields must always be populated if higher-numbered fields are used.  The data type of
        all 12 columns is:
        1-str, 2-int, 3-int, 4-str, 5-int, 6-str, 7-int, 8-int, 9-int or list, 10-int, 11-list, 12-list

        For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format1

        >>> fname = get_test_fname( 'test_tab.bed' )
        >>> Bed().sniff( fname )
        True
        >>> fname = get_test_fname( 'interval1.bed' )
        >>> Bed().sniff( fname )
        True
        >>> fname = get_test_fname( 'complete.bed' )
        >>> Bed().sniff( fname )
        True
        """
        headers = get_headers( filename, '\t' )
        try:
            if not headers: return False
            for hdr in headers:
                if (hdr[0] == '' or hdr[0].startswith( '#' )):
                    continue
                valid_col1 = False
                if len(hdr) < 3 or len(hdr) > 12:
                    return False
                # NOTE(review): the loop variable shadows the builtin 'str'
                # for the remainder of this method.
                for str in data.col1_startswith:
                    if hdr[0].lower().startswith(str):
                        valid_col1 = True
                        break
                if valid_col1:
                    try:
                        int( hdr[1] )
                        int( hdr[2] )
                    except:
                        return False
                    if len( hdr ) > 4:
                        #hdr[3] is a string, 'name', which defines the name of the BED line - difficult to test for this.
                        #hdr[4] is an int, 'score', a score between 0 and 1000.
                        try:
                            if int( hdr[4] ) < 0 or int( hdr[4] ) > 1000: return False
                        except:
                            return False
                    if len( hdr ) > 5:
                        #hdr[5] is strand
                        if hdr[5] not in data.valid_strand: return False
                    if len( hdr ) > 6:
                        #hdr[6] is thickStart, the starting position at which the feature is drawn thickly.
                        try: int( hdr[6] )
                        except: return False
                    if len( hdr ) > 7:
                        #hdr[7] is thickEnd, the ending position at which the feature is drawn thickly
                        try: int( hdr[7] )
                        except: return False
                    if len( hdr ) > 8:
                        #hdr[8] is itemRgb, an RGB value of the form R,G,B (e.g. 255,0,0).  However, this could also be an int (e.g., 0)
                        try: int( hdr[8] )
                        except:
                            try: hdr[8].split(',')
                            except: return False
                    if len( hdr ) > 9:
                        #hdr[9] is blockCount, the number of blocks (exons) in the BED line.
                        try: block_count = int( hdr[9] )
                        except: return False
                        if len( hdr ) > 10:
                            #hdr[10] is blockSizes - A comma-separated list of the block sizes.
                            #Sometimes the blosck_sizes and block_starts lists end in extra commas
                            try: block_sizes = hdr[10].rstrip(',').split(',')
                            except: return False
                        if len( hdr ) > 11:
                            #hdr[11] is blockStarts - A comma-separated list of block starts.
                            try: block_starts = hdr[11].rstrip(',').split(',')
                            except: return False
                            if len(block_sizes) != block_count or len(block_starts) != block_count: return False
                else: return False
            return True
        except: return False

    def get_track_type( self ):
        # BED renders as a feature track backed by an interval index with a
        # summary tree for zoomed-out views.
        return "FeatureTrack", {"data": "interval_index", "index": "summary_tree"}
---|
| 559 | |
---|
class BedStrict( Bed ):
    """Tab delimited data in strict BED format - no non-standard columns allowed"""

    file_ext = "bedstrict"

    # Users may not convert this datatype to another one.
    allow_datatype_change = False

    # All metadata elements are read-only for strict BED.
    MetadataElement( name="chromCol", default=1, desc="Chrom column", readonly=True, param=metadata.MetadataParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", readonly=True, param=metadata.MetadataParameter ) #TODO: start and end should be able to be set to these or the proper thick[start/end]?
    MetadataElement( name="endCol", default=3, desc="End column", readonly=True, param=metadata.MetadataParameter )
    MetadataElement( name="strandCol", desc="Strand column (click box & select)", readonly=True, param=metadata.MetadataParameter, no_value=0, optional=True )
    MetadataElement( name="nameCol", desc="Name/Identifier column (click box & select)", readonly=True, param=metadata.MetadataParameter, no_value=0, optional=True )
    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False )

    def __init__( self, **kwd ):
        Tabular.__init__( self, **kwd )
        # Strict BED relies solely on new-style display applications.
        self.clear_display_apps()

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Derive name/strand columns from the column count alone."""
        # The column count must be known before the positions can be fixed.
        Tabular.set_meta( self, dataset, overwrite = overwrite, **kwd)
        column_count = dataset.metadata.columns
        if column_count >= 4:
            dataset.metadata.nameCol = 4
        if column_count >= 6:
            dataset.metadata.strandCol = 6

    def sniff( self, filename ):
        # Never sniffed: confirming strictness would mean validating the
        # entire file aggressively.
        return False
---|
| 589 | |
---|
class Bed6( BedStrict ):
    """Tab delimited data in strict BED format - no non-standard columns allowed; column count forced to 6"""

    # Galaxy file extension for the six-column strict BED variant.
    file_ext = "bed6"
---|
| 594 | |
---|
class Bed12( BedStrict ):
    """Tab delimited data in strict BED format - no non-standard columns allowed; column count forced to 12"""

    # Galaxy file extension for the twelve-column strict BED variant.
    file_ext = "bed12"
---|
| 599 | |
---|
class _RemoteCallMixin:
    def _get_remote_call_url( self, redirect_url, site_name, dataset, type, app, base_url ):
        """Retrieve the URL to call out to an external site and retrieve data.

        The external request is routed through this Galaxy instance, which
        makes the data available, and is then redirected to the remote site
        with a link back to the available information.
        """
        filename = '%s_%s' % ( type, site_name )
        internal_url = "%s" % url_for( controller='dataset', dataset_id=dataset.id, action='display_at', filename=filename )
        # A deployment may override the callback address seen by the remote site.
        base_url = app.config.get( "display_at_callback", base_url )
        # Remote sites generally cannot call back over https; force http.
        if base_url.startswith( 'https://' ):
            base_url = base_url.replace( 'https', 'http', 1 )
        display_as = "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at" % ( base_url, url_for( controller='root' ), dataset.id, type )
        display_url = urllib.quote_plus( display_as )
        return '%s?redirect_url=%s&display_url=%s' % ( internal_url, redirect_url, display_url )
---|
| 615 | |
---|
class Gff( Tabular, _RemoteCallMixin ):
    """Tab delimited data in Gff format"""
    file_ext = "gff"
    column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ]

    """Add metadata elements"""
    MetadataElement( name="columns", default=9, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=['str','str','str','int','int','int','str','str','str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )

    def __init__( self, **kwd ):
        """Initialize datatype, by adding GBrowse display app"""
        Tabular.__init__(self, **kwd)
        # The link-generator methods named here ('ucsc_links', 'gbrowse_links')
        # are defined on this class below.
        self.add_display_app( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )
        self.add_display_app( 'gbrowse', 'display in Gbrowse', 'as_gbrowse_display_file', 'gbrowse_links' )
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Set tabular metadata, skipping past any leading comment/header lines.

        The index of the first line that looks like GFF data (9 tab separated
        fields with integer start and end) is passed to Tabular.set_meta as
        the number of initial lines to skip.
        """
        i = 0
        for i, line in enumerate( file ( dataset.file_name ) ):
            line = line.rstrip('\r\n')
            if line and not line.startswith( '#' ):
                elems = line.split( '\t' )
                if len(elems) == 9:
                    try:
                        # Fields 4 and 5 (start/end) must parse as integers on a data line.
                        int( elems[3] )
                        int( elems[4] )
                        break
                    except:
                        pass
        Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i )
    def make_html_table( self, dataset, skipchars=[] ):
        """Create HTML table, used for displaying peek"""
        # NOTE(review): mutable default for skipchars is shared across calls; it
        # is only read here so this looks benign, but worth confirming.
        out = ['<table cellspacing="0" cellpadding="3">']
        comments = []
        try:
            # Generate column header
            out.append( '<tr>' )
            for i, name in enumerate( self.column_names ):
                out.append( '<th>%s.%s</th>' % ( str( i+1 ), name ) )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception, exc:
            # Peek rendering is best effort; show the error text instead of failing.
            out = "Can't create peek %s" % exc
        return out
    def get_estimated_display_viewport( self, dataset ):
        """
        Return a chrom, start, stop tuple for viewing a file. There are slight differences between gff 2 and gff 3
        formats. This function should correctly handle both...
        """
        viewport_feature_count = 100 # viewport should check at least 100 features; excludes comment lines
        max_line_count = max( viewport_feature_count, 500 ) # maximum number of lines to check; includes comment lines
        if self.displayable( dataset ):
            try:
                seqid = None
                start = sys.maxint
                stop = 0
                fh = open( dataset.file_name )
                while True:
                    # Bounded readline so a single enormous line cannot stall us.
                    line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                    if not line: break #EOF
                    try:
                        if line.startswith( '##sequence-region' ): # ##sequence-region IV 6000000 6030000
                            elems = line.rstrip( '\n\r' ).split()
                            if len( elems ) > 3:
                                # line looks like:
                                # ##sequence-region ctg123 1 1497228
                                seqid = elems[1] # IV
                                start = int( elems[2] )# 6000000
                                stop = int( elems[3] ) # 6030000
                                break #use location declared in file
                            elif len( elems ) == 2 and elems[1].find( '..' ) > 0:
                                # line looks like this:
                                # ##sequence-region X:120000..140000
                                elems = elems[1].split( ':' )
                                seqid = elems[0]
                                start = int( elems[1].split( '..' )[0] )
                                stop = int( elems[1].split( '..' )[1] )
                                break #use location declared in file
                            else:
                                # NOTE(review): log.exception outside an except block logs a
                                # spurious traceback; log.warning appears intended here — confirm.
                                log.exception( "line (%s) uses an unsupported ##sequence-region definition." % str( line ) )
                                #break #no break, if bad definition, we try another method
                        elif line.startswith("browser position"):
                            # Allow UCSC style browser and track info in the GFF file
                            pos_info = line.split()[-1]
                            seqid, startend = pos_info.split(":")
                            start, stop = map( int, startend.split("-") )
                            break #use location declared in file
                        elif True not in map( line.startswith, ( '#', 'track', 'browser' ) ):# line.startswith() does not accept iterator in python2.4
                            viewport_feature_count -= 1
                            elems = line.rstrip( '\n\r' ).split( '\t' )
                            if len( elems ) > 3:
                                if not seqid:
                                    # We can only set the viewport for a single chromosome
                                    seqid = elems[0]
                                if seqid == elems[0]:
                                    # Make sure we have not spanned chromosomes
                                    start = min( start, int( elems[3] ) )
                                    stop = max( stop, int( elems[4] ) )
                    except:
                        #most likely start/stop is not an int or not enough fields
                        pass
                    #make sure we are at the next new line
                    readline_count = VIEWPORT_MAX_READS_PER_LINE
                    while line.rstrip( '\n\r' ) == line:
                        # A line longer than the read buffer was only partially consumed;
                        # keep reading until a newline (or EOF) is reached.
                        assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id )
                        line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                        if not line: break #EOF
                        readline_count -= 1
                    max_line_count -= 1
                    if not viewport_feature_count or not max_line_count:
                        #exceeded viewport or total line count to check
                        break
                if seqid is not None:
                    return ( seqid, str( start ), str( stop ) ) #Necessary to return strings?
            except Exception, e:
                #unexpected error
                log.exception( str( e ) )
        return ( None, None, None ) #could not determine viewport
    def ucsc_links( self, dataset, type, app, base_url ):
        """Generate (site_name, link) pairs for displaying this dataset at configured UCSC sites."""
        ret_val = []
        seqid, start, stop = self.get_estimated_display_viewport( dataset )
        if seqid is not None:
            for site_name, site_url in util.get_ucsc_by_build( dataset.dbkey ):
                if site_name in app.config.ucsc_display_sites:
                    redirect_url = urllib.quote_plus(
                        "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s" %
                        ( site_url, dataset.dbkey, seqid, start, stop ) )
                    link = self._get_remote_call_url( redirect_url, site_name, dataset, type, app, base_url )
                    ret_val.append( ( site_name, link ) )
        return ret_val
    def gbrowse_links( self, dataset, type, app, base_url ):
        """Generate (site_name, link) pairs for displaying this dataset at configured GBrowse sites."""
        ret_val = []
        seqid, start, stop = self.get_estimated_display_viewport( dataset )
        if seqid is not None:
            for site_name, site_url in util.get_gbrowse_sites_by_build( dataset.dbkey ):
                if site_name in app.config.gbrowse_display_sites:
                    # GBrowse expects sequence names without the UCSC 'chr' prefix.
                    if seqid.startswith( 'chr' ) and len ( seqid ) > 3:
                        seqid = seqid[3:]
                    redirect_url = urllib.quote_plus( "%s/?q=%s:%s..%s&eurl=%%s" % ( site_url, seqid, start, stop ) )
                    link = self._get_remote_call_url( redirect_url, site_name, dataset, type, app, base_url )
                    ret_val.append( ( site_name, link ) )
        return ret_val
    def sniff( self, filename ):
        """
        Determines whether the file is in gff format

        GFF lines have nine required fields that must be tab-separated.

        For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format3

        >>> fname = get_test_fname( 'gff_version_3.gff' )
        >>> Gff().sniff( fname )
        False
        >>> fname = get_test_fname( 'test.gff' )
        >>> Gff().sniff( fname )
        True
        """
        headers = get_headers( filename, '\t' )
        try:
            if len(headers) < 2:
                return False
            for hdr in headers:
                # A gff-version pragma that does not mention version 2 rules this format out.
                if hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '2' ) < 0:
                    return False
                if hdr and hdr[0] and not hdr[0].startswith( '#' ):
                    if len(hdr) != 9:
                        return False
                    try:
                        # Start and end must be integers.
                        int( hdr[3] )
                        int( hdr[4] )
                    except:
                        return False
                    if hdr[5] != '.':
                        try:
                            # Score is either '.' or numeric.
                            score = float( hdr[5] )
                        except:
                            return False
                    if hdr[6] not in data.valid_strand:
                        return False
            return True
        except:
            return False

    def get_track_type( self ):
        """Describe how this datatype is rendered by the track browser."""
        return "FeatureTrack", {"data": "interval_index", "index": "summary_tree"}
---|
| 800 | |
---|
| 801 | |
---|
class Gff3( Gff ):
    """Tab delimited data in Gff3 format"""
    file_ext = "gff3"
    # Legal values for the strand (column 7) and phase (column 8) fields.
    valid_gff3_strand = ['+', '-', '.', '?']
    valid_gff3_phase = ['.', '0', '1', '2']
    column_names = [ 'Seqid', 'Source', 'Type', 'Start', 'End', 'Score', 'Strand', 'Phase', 'Attributes' ]

    """Add metadata elements"""
    MetadataElement( name="column_types", default=['str','str','str','int','int','float','str','int','list'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )

    def __init__(self, **kwd):
        """Initialize datatype, by adding GBrowse display app"""
        Gff.__init__(self, **kwd)
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Set tabular metadata, skipping past any leading comment/header lines.

        Scans for the first line that validates as GFF3 data (9 columns,
        integer or '.' start/end, known strand and phase values) and passes
        its index to Tabular.set_meta as the number of lines to skip.
        """
        i = 0
        for i, line in enumerate( file ( dataset.file_name ) ):
            line = line.rstrip('\r\n')
            if line and not line.startswith( '#' ):
                elems = line.split( '\t' )
                valid_start = False
                valid_end = False
                if len( elems ) == 9:
                    # Initialize to None so the coordinate comparison below can be
                    # skipped for '.' placeholders; previously a '.' start or end
                    # left these names unbound and raised a NameError on 'start < end'.
                    start = None
                    end = None
                    try:
                        start = int( elems[3] )
                        valid_start = True
                    except:
                        if elems[3] == '.':
                            valid_start = True
                    try:
                        end = int( elems[4] )
                        valid_end = True
                    except:
                        if elems[4] == '.':
                            valid_end = True
                    strand = elems[6]
                    phase = elems[7]
                    # Only require start < end when both coordinates are real integers.
                    if valid_start and valid_end and ( start is None or end is None or start < end ) and strand in self.valid_gff3_strand and phase in self.valid_gff3_phase:
                        break
        Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i )
    def sniff( self, filename ):
        """
        Determines whether the file is in gff version 3 format

        GFF 3 format:

        1) adds a mechanism for representing more than one level
           of hierarchical grouping of features and subfeatures.
        2) separates the ideas of group membership and feature name/id
        3) constrains the feature type field to be taken from a controlled
           vocabulary.
        4) allows a single feature, such as an exon, to belong to more than
           one group at a time.
        5) provides an explicit convention for pairwise alignments
        6) provides an explicit convention for features that occupy disjunct regions

        The format consists of 9 columns, separated by tabs (NOT spaces).

        Undefined fields are replaced with the "." character, as described in the original GFF spec.

        For complete details see http://song.sourceforge.net/gff3.shtml

        >>> fname = get_test_fname( 'test.gff' )
        >>> Gff3().sniff( fname )
        False
        >>> fname = get_test_fname('gff_version_3.gff')
        >>> Gff3().sniff( fname )
        True
        """
        headers = get_headers( filename, '\t' )
        try:
            if len(headers) < 2:
                return False
            for hdr in headers:
                # An explicit gff-version pragma decides immediately.
                if hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '3' ) >= 0:
                    return True
                elif hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '3' ) < 0:
                    return False
                # Header comments may have been stripped, so inspect the data
                if hdr and hdr[0] and not hdr[0].startswith( '#' ):
                    if len(hdr) != 9:
                        return False
                    try:
                        # Start and end may each be an integer or the '.' placeholder.
                        int( hdr[3] )
                    except:
                        if hdr[3] != '.':
                            return False
                    try:
                        int( hdr[4] )
                    except:
                        if hdr[4] != '.':
                            return False
                    if hdr[5] != '.':
                        try:
                            # Score is '.' or numeric.
                            score = float( hdr[5] )
                        except:
                            return False
                    if hdr[6] not in self.valid_gff3_strand:
                        return False
                    if hdr[7] not in self.valid_gff3_phase:
                        return False
            return True
        except:
            return False
---|
| 905 | |
---|
class Gtf( Gff ):
    """Tab delimited data in Gtf format"""
    file_ext = "gtf"
    column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Attributes' ]

    """Add metadata elements"""
    MetadataElement( name="columns", default=9, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=['str','str','str','int','int','float','str','int','list'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )


    def sniff( self, filename ):
        """
        Determines whether the file is in gtf format

        GTF lines have nine required fields that must be tab-separated. The first eight GTF fields are the same as GFF.
        The group field has been expanded into a list of attributes. Each attribute consists of a type/value pair.
        Attributes must end in a semi-colon, and be separated from any following attribute by exactly one space.
        The attribute list must begin with the two mandatory attributes:

            gene_id value - A globally unique identifier for the genomic source of the sequence.
            transcript_id value - A globally unique identifier for the predicted transcript.

        For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format4

        >>> fname = get_test_fname( '1.bed' )
        >>> Gtf().sniff( fname )
        False
        >>> fname = get_test_fname( 'test.gff' )
        >>> Gtf().sniff( fname )
        False
        >>> fname = get_test_fname( 'test.gtf' )
        >>> Gtf().sniff( fname )
        True
        """
        headers = get_headers( filename, '\t' )
        try:
            if len(headers) < 2:
                return False
            for hdr in headers:
                # GTF is based on GFF version 2; a pragma naming another version rules it out.
                if hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '2' ) < 0:
                    return False
                if hdr and hdr[0] and not hdr[0].startswith( '#' ):
                    if len(hdr) != 9:
                        return False
                    try:
                        # Start and end must be integers.
                        int( hdr[3] )
                        int( hdr[4] )
                    except:
                        return False
                    if hdr[5] != '.':
                        try:
                            # Score is '.' or numeric.
                            score = float( hdr[5] )
                        except:
                            return False
                    if hdr[6] not in data.valid_strand:
                        return False

                    # Check attributes for gene_id, transcript_id
                    attributes = hdr[8].split(";")
                    if len( attributes ) >= 2:
                        try:
                            # Imprecise: should check for a single space per the spec.
                            attr_name, attr_value = attributes[0].split(" ")
                            if attr_name != 'gene_id':
                                return False
                        except:
                            return False
                        try:
                            # Imprecise: should check for a single space per the spec.
                            # NOTE(review): the [1:] slice assumes exactly one space after
                            # the preceding semi-colon — confirm against real GTF producers.
                            attr_name, attr_value = attributes[1][1:].split(" ")
                            if attr_name != 'transcript_id':
                                return False
                        except:
                            return False
                    else:
                        return False
            return True
        except:
            return False
---|
| 985 | |
---|
| 986 | |
---|
class Wiggle( Tabular, _RemoteCallMixin ):
    """Tab delimited data in wiggle format"""
    file_ext = "wig"

    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False )

    def __init__( self, **kwd ):
        """Initialize datatype, registering the UCSC and GBrowse display applications."""
        Tabular.__init__( self, **kwd )
        self.add_display_app( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )
        self.add_display_app( 'gbrowse', 'display in Gbrowse', 'as_gbrowse_display_file', 'gbrowse_links' )
    def get_estimated_display_viewport( self, dataset ):
        """Return a chrom, start, stop tuple for viewing a file."""
        viewport_feature_count = 100 # viewport should check at least 100 features; excludes comment lines
        max_line_count = max( viewport_feature_count, 500 ) # maximum number of lines to check; includes comment lines
        if self.displayable( dataset ):
            try:
                chrom = None
                start = sys.maxint
                end = 0
                span = 1
                step = None
                fh = open( dataset.file_name )
                while True:
                    # Bounded readline so a single enormous line cannot stall us.
                    line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                    if not line: break #EOF
                    try:
                        if line.startswith( "browser" ):
                            # A UCSC browser line declares the position outright.
                            chr_info = line.rstrip( '\n\r' ).split()[-1]
                            chrom, coords = chr_info.split( ":" )
                            start, end = map( int, coords.split( "-" ) )
                            break # use the browser line
                        # variableStep chrom=chr20
                        if line and ( line.lower().startswith( "variablestep" ) or line.lower().startswith( "fixedstep" ) ):
                            if chrom is not None: break #different chrom or different section of the chrom
                            chrom = line.rstrip( '\n\r' ).split("chrom=")[1].split()[0]
                            if 'span=' in line:
                                span = int( line.rstrip( '\n\r' ).split("span=")[1].split()[0] )
                            if 'step=' in line:
                                # fixedStep declarations carry both step and start.
                                step = int( line.rstrip( '\n\r' ).split("step=")[1].split()[0] )
                                start = int( line.rstrip( '\n\r' ).split("start=")[1].split()[0] )
                        else:
                            fields = line.rstrip( '\n\r' ).split()
                            if fields:
                                if step is not None:
                                    # fixedStep: positions are implicit, advance by step.
                                    if not end:
                                        end = start + span
                                    else:
                                        end += step
                                else:
                                    # variableStep / bed-style: first field is the position.
                                    start = min( int( fields[0] ), start )
                                    end = max( end, int( fields[0] ) + span )
                                viewport_feature_count -= 1
                    except:
                        pass
                    #make sure we are at the next new line
                    readline_count = VIEWPORT_MAX_READS_PER_LINE
                    while line.rstrip( '\n\r' ) == line:
                        # A line longer than the read buffer was only partially consumed;
                        # keep reading until a newline (or EOF) is reached.
                        assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id )
                        line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                        if not line: break #EOF
                        readline_count -= 1
                    max_line_count -= 1
                    if not viewport_feature_count or not max_line_count:
                        #exceeded viewport or total line count to check
                        break
                if chrom is not None:
                    return ( chrom, str( start ), str( end ) ) #Necessary to return strings?
            except Exception, e:
                #unexpected error
                log.exception( str( e ) )
        return ( None, None, None ) #could not determine viewport
    def gbrowse_links( self, dataset, type, app, base_url ):
        """Generate (site_name, link) pairs for displaying this dataset at configured GBrowse sites."""
        ret_val = []
        chrom, start, stop = self.get_estimated_display_viewport( dataset )
        if chrom is not None:
            for site_name, site_url in util.get_gbrowse_sites_by_build( dataset.dbkey ):
                if site_name in app.config.gbrowse_display_sites:
                    # GBrowse expects sequence names without the UCSC 'chr' prefix.
                    if chrom.startswith( 'chr' ) and len ( chrom ) > 3:
                        chrom = chrom[3:]
                    redirect_url = urllib.quote_plus( "%s/?q=%s:%s..%s&eurl=%%s" % ( site_url, chrom, start, stop ) )
                    link = self._get_remote_call_url( redirect_url, site_name, dataset, type, app, base_url )
                    ret_val.append( ( site_name, link ) )
        return ret_val
    def ucsc_links( self, dataset, type, app, base_url ):
        """Generate (site_name, link) pairs for displaying this dataset at configured UCSC sites."""
        ret_val = []
        chrom, start, stop = self.get_estimated_display_viewport( dataset )
        if chrom is not None:
            for site_name, site_url in util.get_ucsc_by_build( dataset.dbkey ):
                if site_name in app.config.ucsc_display_sites:
                    redirect_url = urllib.quote_plus( "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s" % ( site_url, dataset.dbkey, chrom, start, stop ) )
                    link = self._get_remote_call_url( redirect_url, site_name, dataset, type, app, base_url )
                    ret_val.append( ( site_name, link ) )
        return ret_val
    def make_html_table( self, dataset ):
        """Render the peek as an HTML table, hiding track definition and comment lines."""
        return Tabular.make_html_table( self, dataset, skipchars=['track', '#'] )
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Set tabular metadata, skipping past track/declaration lines to the first data value."""
        max_data_lines = None
        i = 0
        for i, line in enumerate( file ( dataset.file_name ) ):
            line = line.rstrip('\r\n')
            if line and not line.startswith( '#' ):
                elems = line.split( '\t' )
                try:
                    float( elems[0] ) #"Wiggle track data values can be integer or real, positive or negative values"
                    break
                except:
                    do_break = False
                    for col_startswith in data.col1_startswith:
                        if elems[0].lower().startswith( col_startswith ):
                            do_break = True
                            break
                    if do_break:
                        break
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            #we'll arbitrarily only use the first 100 data lines in this wig file to calculate tabular attributes (column types)
            #this should be sufficient, except when we have mixed wig track types (bed, variable, fixed),
            # but those cases are not a single table that would have consistant column definitions
            #optional metadata values set in Tabular class will be 'None'
            max_data_lines = 100
        Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i, max_data_lines = max_data_lines )
    def sniff( self, filename ):
        """
        Determines whether the file is in wiggle format

        The .wig format is line-oriented. Wiggle data is preceded by a track definition line,
        which adds a number of options for controlling the default display of this track.
        Following the track definition line is the track data, which can be entered in several
        different formats.

        The track definition line begins with the word 'track' followed by the track type.
        The track type with version is REQUIRED, and it currently must be wiggle_0. For example,
        track type=wiggle_0...

        For complete details see http://genome.ucsc.edu/goldenPath/help/wiggle.html

        >>> fname = get_test_fname( 'interval1.bed' )
        >>> Wiggle().sniff( fname )
        False
        >>> fname = get_test_fname( 'wiggle.wig' )
        >>> Wiggle().sniff( fname )
        True
        """
        headers = get_headers( filename, None )
        try:
            for hdr in headers:
                # Only the 'track type=wiggle_0' declaration line identifies the format.
                if len(hdr) > 1 and hdr[0] == 'track' and hdr[1].startswith('type=wiggle'):
                    return True
            return False
        except:
            return False
    def get_track_window(self, dataset, data, start, end):
        """
        Assumes we have a numpy file.
        """
        # Maybe if we import here people will still be able to use Galaxy when numpy kills it
        pkg_resources.require("numpy>=1.2.1")
        #from numpy.lib import format
        import numpy

        # NOTE(review): 'range' shadows the builtin within this method.
        range = end - start
        # Determine appropriate resolution to plot ~1000 points
        resolution = ( 10 ** math.ceil( math.log10( range / 1000 ) ) )
        # Restrict to valid range
        resolution = min( resolution, 100000 )
        resolution = max( resolution, 1 )
        # Memory map the array (don't load all the data)
        data = numpy.load( data )
        # Grab just what we need
        # NOTE(review): math.floor/ceil return floats here; using them as slice
        # indices relies on numpy accepting float indices — confirm on the
        # numpy version in use.
        t_start = math.floor( start / resolution )
        t_end = math.ceil( end / resolution )
        x = numpy.arange( t_start, t_end ) * resolution
        y = data[ t_start : t_end ]

        return zip(x.tolist(), y.tolist())
    def get_track_resolution( self, dataset, start, end):
        """Return the sampling resolution (clamped to [1, 100000]) for plotting ~1000 points."""
        range = end - start
        # Determine appropriate resolution to plot ~1000 points
        resolution = math.ceil( 10 ** math.ceil( math.log10( range / 1000 ) ) )
        # Restrict to valid range
        resolution = min( resolution, 100000 )
        resolution = max( resolution, 1 )
        return resolution
    def get_track_type( self ):
        """Describe how this datatype is rendered by the track browser."""
        return "LineTrack", {"data": "array_tree"}
---|
| 1171 | |
---|
| 1172 | class CustomTrack ( Tabular ): |
---|
| 1173 | """UCSC CustomTrack""" |
---|
| 1174 | file_ext = "customtrack" |
---|
| 1175 | |
---|
| 1176 | def __init__(self, **kwd): |
---|
| 1177 | """Initialize interval datatype, by adding UCSC display app""" |
---|
| 1178 | Tabular.__init__(self, **kwd) |
---|
| 1179 | self.add_display_app ( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' ) |
---|
| 1180 | def set_meta( self, dataset, overwrite = True, **kwd ): |
---|
| 1181 | Tabular.set_meta( self, dataset, overwrite = overwrite, skip = 1 ) |
---|
| 1182 | def display_peek( self, dataset ): |
---|
| 1183 | """Returns formated html of peek""" |
---|
| 1184 | return Tabular.make_html_table( self, dataset, skipchars=['track', '#'] ) |
---|
    def get_estimated_display_viewport( self, dataset, chrom_col = None, start_col = None, end_col = None ):
        """Return a chrom, start, stop tuple for viewing a file."""
        #FIXME: only BED and WIG custom tracks are currently supported
        #As per previously existing behavior, viewport will only be over the first intervals
        max_line_count = 100 # maximum number of lines to check; includes comment lines
        variable_step_wig = False
        chrom = None
        span = 1
        if self.displayable( dataset ):
            try:
                fh = open( dataset.file_name )
                while True:
                    # Bounded readline so a single enormous line cannot stall us.
                    line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                    if not line: break #EOF
                    if not line.startswith( '#' ):
                        try:
                            if variable_step_wig:
                                # Inside a variableStep section: 'position value' pairs.
                                fields = line.rstrip().split()
                                if len( fields ) == 2:
                                    start = int( fields[ 0 ] )
                                    return ( chrom, str( start ), str( start + span ) )
                            elif line and ( line.lower().startswith( "variablestep" ) or line.lower().startswith( "fixedstep" ) ):
                                # Wiggle section declaration; fixedStep lines carry a start.
                                chrom = line.rstrip( '\n\r' ).split("chrom=")[1].split()[0]
                                if 'span=' in line:
                                    span = int( line.rstrip( '\n\r' ).split("span=")[1].split()[0] )
                                if 'start=' in line:
                                    start = int( line.rstrip( '\n\r' ).split("start=")[1].split()[0] )
                                    return ( chrom, str( start ), str( start + span ) )
                                else:
                                    # variableStep: the position comes from the next data line.
                                    variable_step_wig = True
                            else:
                                # Otherwise assume bed-style: chrom, start, end columns.
                                fields = line.rstrip().split( '\t' )
                                if len( fields ) >= 3:
                                    chrom = fields[ 0 ]
                                    start = int( fields[ 1 ] )
                                    end = int( fields[ 2 ] )
                                    return ( chrom, str( start ), str( end ) )
                        except Exception:
                            #most likely a non-integer field has been encountered for start / stop
                            continue
                    #make sure we are at the next new line
                    readline_count = VIEWPORT_MAX_READS_PER_LINE
                    while line.rstrip( '\n\r' ) == line:
                        # A line longer than the read buffer was only partially consumed;
                        # keep reading until a newline (or EOF) is reached.
                        assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id )
                        line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                        if not line: break #EOF
                        readline_count -= 1
                    max_line_count -= 1
                    if not max_line_count:
                        #exceeded viewport or total line count to check
                        break
            except Exception, e:
                #unexpected error
                log.exception( str( e ) )
        return ( None, None, None ) #could not determine viewport
---|
    def ucsc_links( self, dataset, type, app, base_url ):
        """Generate (site_name, link) pairs for displaying this dataset at configured UCSC sites.

        NOTE(review): this inlines URL construction similar to
        _RemoteCallMixin._get_remote_call_url but without the
        'display_at_callback' config override — confirm whether the
        difference is intentional.
        """
        ret_val = []
        chrom, start, stop = self.get_estimated_display_viewport(dataset)
        if chrom is not None:
            for site_name, site_url in util.get_ucsc_by_build(dataset.dbkey):
                if site_name in app.config.ucsc_display_sites:
                    # Local galaxy endpoint that serves the dataset to UCSC.
                    internal_url = "%s" % url_for( controller='dataset', dataset_id=dataset.id, action='display_at', filename='ucsc_' + site_name )
                    if base_url.startswith( 'https://' ):
                        # The remote site retrieves the data over plain http.
                        base_url = base_url.replace( 'https', 'http', 1 )
                    display_url = urllib.quote_plus( "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at" % (base_url, url_for( controller='root' ), dataset.id, type) )
                    redirect_url = urllib.quote_plus( "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s" % (site_url, dataset.dbkey, chrom, start, stop ) )
                    link = '%s?redirect_url=%s&display_url=%s' % ( internal_url, redirect_url, display_url )
                    ret_val.append( (site_name, link) )
        return ret_val
---|
| 1254 | def sniff( self, filename ): |
---|
| 1255 | """ |
---|
| 1256 | Determines whether the file is in customtrack format. |
---|
| 1257 | |
---|
| 1258 | CustomTrack files are built within Galaxy and are basically bed or interval files with the first line looking |
---|
| 1259 | something like this. |
---|
| 1260 | |
---|
| 1261 | track name="User Track" description="User Supplied Track (from Galaxy)" color=0,0,0 visibility=1 |
---|
| 1262 | |
---|
| 1263 | >>> fname = get_test_fname( 'complete.bed' ) |
---|
| 1264 | >>> CustomTrack().sniff( fname ) |
---|
| 1265 | False |
---|
| 1266 | >>> fname = get_test_fname( 'ucsc.customtrack' ) |
---|
| 1267 | >>> CustomTrack().sniff( fname ) |
---|
| 1268 | True |
---|
| 1269 | """ |
---|
| 1270 | headers = get_headers( filename, None ) |
---|
| 1271 | first_line = True |
---|
| 1272 | for hdr in headers: |
---|
| 1273 | if first_line: |
---|
| 1274 | first_line = False |
---|
| 1275 | try: |
---|
| 1276 | if hdr[0].startswith('track'): |
---|
| 1277 | color_found = False |
---|
| 1278 | visibility_found = False |
---|
| 1279 | for elem in hdr[1:]: |
---|
| 1280 | if elem.startswith('color'): color_found = True |
---|
| 1281 | if elem.startswith('visibility'): visibility_found = True |
---|
| 1282 | if color_found and visibility_found: break |
---|
| 1283 | if not color_found or not visibility_found: return False |
---|
| 1284 | else: return False |
---|
| 1285 | except: return False |
---|
| 1286 | else: |
---|
| 1287 | try: |
---|
| 1288 | if hdr[0] and not hdr[0].startswith( '#' ): |
---|
| 1289 | if len( hdr ) < 3: |
---|
| 1290 | return False |
---|
| 1291 | try: |
---|
| 1292 | int( hdr[1] ) |
---|
| 1293 | int( hdr[2] ) |
---|
| 1294 | except: |
---|
| 1295 | return False |
---|
| 1296 | except: |
---|
| 1297 | return False |
---|
| 1298 | return True |
---|
| 1299 | |
---|
if __name__ == '__main__':
    # Run the doctests embedded in this module's docstrings.
    import doctest
    import sys
    doctest.testmod( sys.modules[ __name__ ] )
---|