1 | """ |
---|
2 | Interval datatypes |
---|
3 | """ |
---|
4 | |
---|
5 | import pkg_resources |
---|
6 | pkg_resources.require( "bx-python" ) |
---|
7 | |
---|
8 | import logging, os, sys, time, tempfile, shutil |
---|
9 | import data |
---|
10 | from galaxy import util |
---|
11 | from galaxy.datatypes.sniff import * |
---|
12 | from galaxy.web import url_for |
---|
13 | from cgi import escape |
---|
14 | import urllib |
---|
15 | from bx.intervals.io import * |
---|
16 | from galaxy.datatypes import metadata |
---|
17 | from galaxy.datatypes.metadata import MetadataElement |
---|
18 | from galaxy.datatypes.tabular import Tabular |
---|
19 | import math |
---|
20 | |
---|
21 | log = logging.getLogger(__name__) |
---|
22 | |
---|
23 | # Contains the meta columns and the words that map to it; list aliases on the |
---|
24 | # right side of the : in decreasing order of priority |
---|
# Maps each metadata column name to the header words that may denote it;
# aliases on the right-hand side are listed in decreasing order of priority.
alias_spec = {
    'chromCol'  : [ 'chrom' , 'CHROMOSOME' , 'CHROM', 'Chromosome Name' ],
    'startCol'  : [ 'start' , 'START', 'chromStart', 'txStart', 'Start Position (bp)' ],
    'endCol'    : [ 'end' , 'END' , 'STOP', 'chromEnd', 'txEnd', 'End Position (bp)' ],
    'strandCol' : [ 'strand', 'STRAND', 'Strand' ],
    'nameCol'   : [ 'name', 'NAME', 'Name', 'name2', 'NAME2', 'Name2', 'Ensembl Gene ID', 'Ensembl Transcript ID', 'Ensembl Peptide ID' ]
}

# Inverted index over alias_spec: header word -> metadata column name,
# for constant-time lookup when scanning header lines.
alias_helper = dict( ( alias, column )
                     for column, aliases in alias_spec.items()
                     for alias in aliases )

# Constants for configuring viewport generation: if a line is greater than
# VIEWPORT_MAX_READS_PER_LINE * VIEWPORT_READLINE_BUFFER_SIZE bytes in size,
# then we will not generate a viewport for that dataset
VIEWPORT_READLINE_BUFFER_SIZE = 1048576 # 1MB
VIEWPORT_MAX_READS_PER_LINE = 10
---|
44 | |
---|
45 | class Interval( Tabular ): |
---|
46 | """Tab delimited data containing interval information""" |
---|
47 | file_ext = "interval" |
---|
48 | |
---|
49 | """Add metadata elements""" |
---|
50 | MetadataElement( name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter ) |
---|
51 | MetadataElement( name="startCol", default=2, desc="Start column", param=metadata.ColumnParameter ) |
---|
52 | MetadataElement( name="endCol", default=3, desc="End column", param=metadata.ColumnParameter ) |
---|
53 | MetadataElement( name="strandCol", desc="Strand column (click box & select)", param=metadata.ColumnParameter, optional=True, no_value=0 ) |
---|
54 | MetadataElement( name="nameCol", desc="Name/Identifier column (click box & select)", param=metadata.ColumnParameter, optional=True, no_value=0 ) |
---|
55 | MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False ) |
---|
56 | |
---|
57 | def __init__(self, **kwd): |
---|
58 | """Initialize interval datatype, by adding UCSC display apps""" |
---|
59 | Tabular.__init__(self, **kwd) |
---|
60 | self.add_display_app ( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' ) |
---|
61 | def init_meta( self, dataset, copy_from=None ): |
---|
62 | Tabular.init_meta( self, dataset, copy_from=copy_from ) |
---|
63 | def set_peek( self, dataset, line_count=None, is_multi_byte=False ): |
---|
64 | """Set the peek and blurb text""" |
---|
65 | if not dataset.dataset.purged: |
---|
66 | dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte ) |
---|
67 | if line_count is None: |
---|
68 | # See if line_count is stored in the metadata |
---|
69 | if dataset.metadata.data_lines: |
---|
70 | dataset.blurb = "%s regions" % util.commaify( str( dataset.metadata.data_lines ) ) |
---|
71 | else: |
---|
72 | # Number of lines is not known ( this should not happen ), and auto-detect is |
---|
73 | # needed to set metadata |
---|
74 | dataset.blurb = "? regions" |
---|
75 | else: |
---|
76 | dataset.blurb = "%s regions" % util.commaify( str( line_count ) ) |
---|
77 | else: |
---|
78 | dataset.peek = 'file does not exist' |
---|
79 | dataset.blurb = 'file purged from disk' |
---|
80 | def set_meta( self, dataset, overwrite = True, first_line_is_header = False, **kwd ): |
---|
81 | """Tries to guess from the line the location number of the column for the chromosome, region start-end and strand""" |
---|
82 | Tabular.set_meta( self, dataset, overwrite = overwrite, skip = 0 ) |
---|
83 | if dataset.has_data(): |
---|
84 | empty_line_count = 0 |
---|
85 | num_check_lines = 100 # only check up to this many non empty lines |
---|
86 | for i, line in enumerate( file( dataset.file_name ) ): |
---|
87 | line = line.rstrip( '\r\n' ) |
---|
88 | if line: |
---|
89 | if ( first_line_is_header or line[0] == '#' ): |
---|
90 | self.init_meta( dataset ) |
---|
91 | line = line.strip( '#' ) |
---|
92 | elems = line.split( '\t' ) |
---|
93 | for meta_name, header_list in alias_spec.iteritems(): |
---|
94 | for header_val in header_list: |
---|
95 | if header_val in elems: |
---|
96 | #found highest priority header to meta_name |
---|
97 | setattr( dataset.metadata, meta_name, elems.index( header_val ) + 1 ) |
---|
98 | break #next meta_name |
---|
99 | break # Our metadata is set, so break out of the outer loop |
---|
100 | else: |
---|
101 | # Header lines in Interval files are optional. For example, BED is Interval but has no header. |
---|
102 | # We'll make a best guess at the location of the metadata columns. |
---|
103 | metadata_is_set = False |
---|
104 | elems = line.split( '\t' ) |
---|
105 | if len( elems ) > 2: |
---|
106 | for str in data.col1_startswith: |
---|
107 | if line.lower().startswith( str ): |
---|
108 | if overwrite or not dataset.metadata.element_is_set( 'chromCol' ): |
---|
109 | dataset.metadata.chromCol = 1 |
---|
110 | try: |
---|
111 | int( elems[1] ) |
---|
112 | if overwrite or not dataset.metadata.element_is_set( 'startCol' ): |
---|
113 | dataset.metadata.startCol = 2 |
---|
114 | except: |
---|
115 | pass # Metadata default will be used |
---|
116 | try: |
---|
117 | int( elems[2] ) |
---|
118 | if overwrite or not dataset.metadata.element_is_set( 'endCol' ): |
---|
119 | dataset.metadata.endCol = 3 |
---|
120 | except: |
---|
121 | pass # Metadata default will be used |
---|
122 | #we no longer want to guess that this column is the 'name', name must now be set manually for interval files |
---|
123 | #we will still guess at the strand, as we can make a more educated guess |
---|
124 | #if len( elems ) > 3: |
---|
125 | # try: |
---|
126 | # int( elems[3] ) |
---|
127 | # except: |
---|
128 | # if overwrite or not dataset.metadata.element_is_set( 'nameCol' ): |
---|
129 | # dataset.metadata.nameCol = 4 |
---|
130 | if len( elems ) < 6 or elems[5] not in data.valid_strand: |
---|
131 | if overwrite or not dataset.metadata.element_is_set( 'strandCol' ): |
---|
132 | dataset.metadata.strandCol = 0 |
---|
133 | else: |
---|
134 | if overwrite or not dataset.metadata.element_is_set( 'strandCol' ): |
---|
135 | dataset.metadata.strandCol = 6 |
---|
136 | metadata_is_set = True |
---|
137 | break |
---|
138 | if metadata_is_set or ( i - empty_line_count ) > num_check_lines: |
---|
139 | break # Our metadata is set or we examined 100 non-empty lines, so break out of the outer loop |
---|
140 | else: |
---|
141 | empty_line_count += 1 |
---|
142 | def displayable( self, dataset ): |
---|
143 | try: |
---|
144 | return dataset.has_data() \ |
---|
145 | and dataset.state == dataset.states.OK \ |
---|
146 | and dataset.metadata.columns > 0 \ |
---|
147 | and dataset.metadata.data_lines > 0 \ |
---|
148 | and dataset.metadata.chromCol \ |
---|
149 | and dataset.metadata.startCol \ |
---|
150 | and dataset.metadata.endCol |
---|
151 | except: |
---|
152 | return False |
---|
153 | |
---|
154 | def get_estimated_display_viewport( self, dataset, chrom_col = None, start_col = None, end_col = None ): |
---|
155 | """Return a chrom, start, stop tuple for viewing a file.""" |
---|
156 | viewport_feature_count = 100 # viewport should check at least 100 features; excludes comment lines |
---|
157 | max_line_count = max( viewport_feature_count, 500 ) # maximum number of lines to check; includes comment lines |
---|
158 | if not self.displayable( dataset ): |
---|
159 | return ( None, None, None ) |
---|
160 | try: |
---|
161 | # If column indexes were not passwed, determine from metadata |
---|
162 | if chrom_col is None: |
---|
163 | chrom_col = int( dataset.metadata.chromCol ) - 1 |
---|
164 | if start_col is None: |
---|
165 | start_col = int( dataset.metadata.startCol ) - 1 |
---|
166 | if end_col is None: |
---|
167 | end_col = int( dataset.metadata.endCol ) - 1 |
---|
168 | # Scan lines of file to find a reasonable chromosome and range |
---|
169 | chrom = None |
---|
170 | start = sys.maxint |
---|
171 | end = 0 |
---|
172 | max_col = max( chrom_col, start_col, end_col ) |
---|
173 | fh = open( dataset.file_name ) |
---|
174 | while True: |
---|
175 | line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE ) |
---|
176 | # Stop if at end of file |
---|
177 | if not line: |
---|
178 | break |
---|
179 | # Skip comment lines |
---|
180 | if not line.startswith( '#' ): |
---|
181 | try: |
---|
182 | fields = line.rstrip().split( '\t' ) |
---|
183 | if len( fields ) > max_col: |
---|
184 | if chrom is None or chrom == fields[ chrom_col ]: |
---|
185 | start = min( start, int( fields[ start_col ] ) ) |
---|
186 | end = max( end, int( fields[ end_col ] ) ) |
---|
187 | # Set chrom last, in case start and end are not integers |
---|
188 | chrom = fields[ chrom_col ] |
---|
189 | viewport_feature_count -= 1 |
---|
190 | except Exception, e: |
---|
191 | # Most likely a non-integer field has been encountered |
---|
192 | # for start / stop. Just ignore and make sure we finish |
---|
193 | # reading the line and decrementing the counters. |
---|
194 | pass |
---|
195 | # Make sure we are at the next new line |
---|
196 | readline_count = VIEWPORT_MAX_READS_PER_LINE |
---|
197 | while line.rstrip( '\n\r' ) == line: |
---|
198 | assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id ) |
---|
199 | line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE ) |
---|
200 | if not line: break #EOF |
---|
201 | readline_count -= 1 |
---|
202 | max_line_count -= 1 |
---|
203 | if not viewport_feature_count or not max_line_count: |
---|
204 | #exceeded viewport or total line count to check |
---|
205 | break |
---|
206 | if chrom is not None: |
---|
207 | return ( chrom, str( start ), str( end ) ) # Necessary to return strings? |
---|
208 | except Exception, e: |
---|
209 | # Unexpected error, possibly missing metadata |
---|
210 | log.exception( "Exception caught attempting to generate viewport for dataset '%d'", dataset.id ) |
---|
211 | return ( None, None, None ) |
---|
212 | |
---|
213 | def as_ucsc_display_file( self, dataset, **kwd ): |
---|
214 | """Returns file contents with only the bed data""" |
---|
215 | fd, temp_name = tempfile.mkstemp() |
---|
216 | c, s, e, t, n = dataset.metadata.chromCol, dataset.metadata.startCol, dataset.metadata.endCol, dataset.metadata.strandCol or 0, dataset.metadata.nameCol or 0 |
---|
217 | c, s, e, t, n = int(c)-1, int(s)-1, int(e)-1, int(t)-1, int(n)-1 |
---|
218 | if t >= 0: # strand column (should) exists |
---|
219 | for i, elems in enumerate( util.file_iter(dataset.file_name) ): |
---|
220 | strand = "+" |
---|
221 | name = "region_%i" % i |
---|
222 | if n >= 0 and n < len( elems ): name = elems[n] |
---|
223 | if t<len(elems): strand = elems[t] |
---|
224 | tmp = [ elems[c], elems[s], elems[e], name, '0', strand ] |
---|
225 | os.write(fd, '%s\n' % '\t'.join(tmp) ) |
---|
226 | elif n >= 0: # name column (should) exists |
---|
227 | for i, elems in enumerate( util.file_iter(dataset.file_name) ): |
---|
228 | name = "region_%i" % i |
---|
229 | if n >= 0 and n < len( elems ): name = elems[n] |
---|
230 | tmp = [ elems[c], elems[s], elems[e], name ] |
---|
231 | os.write(fd, '%s\n' % '\t'.join(tmp) ) |
---|
232 | else: |
---|
233 | for elems in util.file_iter(dataset.file_name): |
---|
234 | tmp = [ elems[c], elems[s], elems[e] ] |
---|
235 | os.write(fd, '%s\n' % '\t'.join(tmp) ) |
---|
236 | os.close(fd) |
---|
237 | return open(temp_name) |
---|
238 | def make_html_table( self, dataset, skipchars=[] ): |
---|
239 | """Create HTML table, used for displaying peek""" |
---|
240 | out = ['<table cellspacing="0" cellpadding="3">'] |
---|
241 | comments = [] |
---|
242 | try: |
---|
243 | # Generate column header |
---|
244 | out.append('<tr>') |
---|
245 | for i in range( 1, dataset.metadata.columns+1 ): |
---|
246 | if i == dataset.metadata.chromCol: |
---|
247 | out.append( '<th>%s.Chrom</th>' % i ) |
---|
248 | elif i == dataset.metadata.startCol: |
---|
249 | out.append( '<th>%s.Start</th>' % i ) |
---|
250 | elif i == dataset.metadata.endCol: |
---|
251 | out.append( '<th>%s.End</th>' % i ) |
---|
252 | elif dataset.metadata.strandCol and i == dataset.metadata.strandCol: |
---|
253 | out.append( '<th>%s.Strand</th>' % i ) |
---|
254 | elif dataset.metadata.nameCol and i == dataset.metadata.nameCol: |
---|
255 | out.append( '<th>%s.Name</th>' % i ) |
---|
256 | else: |
---|
257 | out.append( '<th>%s</th>' % i ) |
---|
258 | out.append('</tr>') |
---|
259 | out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) ) |
---|
260 | out.append( '</table>' ) |
---|
261 | out = "".join( out ) |
---|
262 | except Exception, exc: |
---|
263 | out = "Can't create peek %s" % str( exc ) |
---|
264 | return out |
---|
265 | def ucsc_links( self, dataset, type, app, base_url ): |
---|
266 | """ |
---|
267 | Generate links to UCSC genome browser sites based on the dbkey |
---|
268 | and content of dataset. |
---|
269 | """ |
---|
270 | # Filter UCSC sites to only those that are supported by this build and |
---|
271 | # enabled. |
---|
272 | valid_sites = [ ( name, url ) |
---|
273 | for name, url in util.get_ucsc_by_build( dataset.dbkey ) |
---|
274 | if name in app.config.ucsc_display_sites ] |
---|
275 | if not valid_sites: |
---|
276 | return [] |
---|
277 | # If there are any valid sites, we need to generate the estimated |
---|
278 | # viewport |
---|
279 | chrom, start, stop = self.get_estimated_display_viewport( dataset ) |
---|
280 | if chrom is None: |
---|
281 | return [] |
---|
282 | # Accumulate links for valid sites |
---|
283 | ret_val = [] |
---|
284 | for site_name, site_url in valid_sites: |
---|
285 | internal_url = url_for( controller='/dataset', dataset_id=dataset.id, |
---|
286 | action='display_at', filename='ucsc_' + site_name ) |
---|
287 | # HACK: UCSC doesn't support https, so force http even if our URL |
---|
288 | # scheme is https. Making this work requires additional |
---|
289 | # hackery in your upstream proxy. If UCSC ever supports |
---|
290 | # https, remove this hack. |
---|
291 | if base_url.startswith( 'https://' ): |
---|
292 | base_url = base_url.replace( 'https', 'http', 1 ) |
---|
293 | display_url = urllib.quote_plus( "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at" |
---|
294 | % (base_url, url_for( controller='root' ), dataset.id, type) ) |
---|
295 | redirect_url = urllib.quote_plus( "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s" |
---|
296 | % (site_url, dataset.dbkey, chrom, start, stop ) ) |
---|
297 | link = '%s?redirect_url=%s&display_url=%s' % ( internal_url, redirect_url, display_url ) |
---|
298 | ret_val.append( ( site_name, link ) ) |
---|
299 | return ret_val |
---|
300 | def validate( self, dataset ): |
---|
301 | """Validate an interval file using the bx GenomicIntervalReader""" |
---|
302 | errors = list() |
---|
303 | c, s, e, t = dataset.metadata.chromCol, dataset.metadata.startCol, dataset.metadata.endCol, dataset.metadata.strandCol |
---|
304 | c, s, e, t = int(c)-1, int(s)-1, int(e)-1, int(t)-1 |
---|
305 | infile = open(dataset.file_name, "r") |
---|
306 | reader = GenomicIntervalReader( |
---|
307 | infile, |
---|
308 | chrom_col = c, |
---|
309 | start_col = s, |
---|
310 | end_col = e, |
---|
311 | strand_col = t) |
---|
312 | |
---|
313 | while True: |
---|
314 | try: |
---|
315 | reader.next() |
---|
316 | except ParseError, e: |
---|
317 | errors.append(e) |
---|
318 | except StopIteration: |
---|
319 | infile.close() |
---|
320 | return errors |
---|
321 | |
---|
322 | def repair_methods( self, dataset ): |
---|
323 | """Return options for removing errors along with a description""" |
---|
324 | return [("lines","Remove erroneous lines")] |
---|
325 | |
---|
326 | def sniff( self, filename ): |
---|
327 | """ |
---|
328 | Checks for 'intervalness' |
---|
329 | |
---|
330 | This format is mostly used by galaxy itself. Valid interval files should include |
---|
331 | a valid header comment, but this seems to be loosely regulated. |
---|
332 | |
---|
333 | >>> fname = get_test_fname( 'test_space.txt' ) |
---|
334 | >>> Interval().sniff( fname ) |
---|
335 | False |
---|
336 | >>> fname = get_test_fname( 'interval.interval' ) |
---|
337 | >>> Interval().sniff( fname ) |
---|
338 | True |
---|
339 | """ |
---|
340 | headers = get_headers( filename, '\t' ) |
---|
341 | try: |
---|
342 | """ |
---|
343 | If we got here, we already know the file is_column_based and is not bed, |
---|
344 | so we'll just look for some valid data. |
---|
345 | """ |
---|
346 | for hdr in headers: |
---|
347 | if hdr and not hdr[0].startswith( '#' ): |
---|
348 | if len(hdr) < 3: |
---|
349 | return False |
---|
350 | try: |
---|
351 | # Assume chrom start and end are in column positions 1 and 2 |
---|
352 | # respectively ( for 0 based columns ) |
---|
353 | check = int( hdr[1] ) |
---|
354 | check = int( hdr[2] ) |
---|
355 | except: |
---|
356 | return False |
---|
357 | return True |
---|
358 | except: |
---|
359 | return False |
---|
360 | |
---|
361 | def get_track_window(self, dataset, data, start, end): |
---|
362 | """ |
---|
363 | Assumes the incoming track data is sorted already. |
---|
364 | """ |
---|
365 | window = list() |
---|
366 | for record in data: |
---|
367 | fields = record.rstrip("\n\r").split("\t") |
---|
368 | record_chrom = fields[dataset.metadata.chromCol-1] |
---|
369 | record_start = int(fields[dataset.metadata.startCol-1]) |
---|
370 | record_end = int(fields[dataset.metadata.endCol-1]) |
---|
371 | if record_start < end and record_end > start: |
---|
372 | window.append( (record_chrom, record_start, record_end) ) #Yes I did want to use a generator here, but it doesn't work downstream |
---|
373 | return window |
---|
374 | |
---|
375 | def get_track_resolution( self, dataset, start, end): |
---|
376 | return None |
---|
377 | |
---|
class BedGraph( Interval ):
    """Tab delimited chrom/start/end/datavalue dataset"""

    file_ext = "bedgraph"

    def get_track_type( self ):
        """Track type and data-source mapping used by the visualization framework."""
        return "LineTrack", {"data": "array_tree"}

    def as_ucsc_display_file( self, dataset, **kwd ):
        """
        Returns file contents as is with no modifications.
        TODO: this is a functional stub and will need to be enhanced moving forward to provide additional support for bedgraph.
        """
        # The raw file is already in a form UCSC accepts.
        return open( dataset.file_name )

    def get_estimated_display_viewport( self, dataset, chrom_col = 0, start_col = 1, end_col = 2 ):
        """
        Set viewport based on dataset's first 100 lines.
        """
        # Bedgraph columns are fixed, so pass explicit 0-based positions
        # instead of relying on metadata as the base class would.
        return Interval.get_estimated_display_viewport( self, dataset,
                                                        chrom_col = chrom_col,
                                                        start_col = start_col,
                                                        end_col = end_col )
---|
398 | |
---|
class Bed( Interval ):
    """Tab delimited data in BED format"""
    file_ext = "bed"

    # Metadata elements
    # NOTE: these largely repeat what is inherited from Interval (minus nameCol);
    # kept for backward compatibility of the declared metadata.
    MetadataElement( name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", param=metadata.ColumnParameter )
    MetadataElement( name="endCol", default=3, desc="End column", param=metadata.ColumnParameter )
    MetadataElement( name="strandCol", desc="Strand column (click box & select)", param=metadata.ColumnParameter, optional=True, no_value=0 )
    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="viz_filter_cols", default=[4], param=metadata.ColumnParameter, multiple=True )

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Sets the metadata information for datasets previously determined to be in bed format."""
        i = 0
        if dataset.has_data():
            # Open explicitly so the handle is reliably closed (the previous
            # file() call leaked the handle until garbage collection).
            fh = open( dataset.file_name )
            try:
                for i, line in enumerate( fh ):
                    metadata_set = False
                    line = line.rstrip('\r\n')
                    if line and not line.startswith('#'):
                        elems = line.split('\t')
                        if len(elems) > 2:
                            for startswith in data.col1_startswith:
                                if line.lower().startswith( startswith ):
                                    if len( elems ) > 3:
                                        if overwrite or not dataset.metadata.element_is_set( 'nameCol' ):
                                            dataset.metadata.nameCol = 4
                                    if len(elems) < 6:
                                        if overwrite or not dataset.metadata.element_is_set( 'strandCol' ):
                                            dataset.metadata.strandCol = 0
                                    else:
                                        if overwrite or not dataset.metadata.element_is_set( 'strandCol' ):
                                            dataset.metadata.strandCol = 6
                                    metadata_set = True
                                    break
                    if metadata_set: break
            finally:
                fh.close()
        # i is the index of the first data line; skip everything before it
        Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i )

    def as_ucsc_display_file( self, dataset, **kwd ):
        """Returns file contents with only the bed data. If bed 6+, treat as interval."""
        for line in open(dataset.file_name):
            line = line.strip()
            if line == "" or line.startswith("#"):
                continue
            fields = line.split('\t')
            # Check to see if this file doesn't conform to strict genome browser accepted bed;
            # if any optional column fails validation, fall back to the Interval conversion.
            try:
                if len(fields) > 12:
                    return Interval.as_ucsc_display_file(self, dataset) #too many fields
                if len(fields) > 6:
                    int(fields[6])
                if len(fields) > 7:
                    int(fields[7])
                if len(fields) > 8:
                    if int(fields[8]) != 0:
                        return Interval.as_ucsc_display_file(self, dataset)
                if len(fields) > 9:
                    int(fields[9])
                if len(fields) > 10:
                    fields2 = fields[10].rstrip(",").split(",") #remove trailing comma and split on comma
                    for field in fields2:
                        int(field)
                if len(fields) > 11:
                    fields2 = fields[11].rstrip(",").split(",") #remove trailing comma and split on comma
                    for field in fields2:
                        int(field)
            except: return Interval.as_ucsc_display_file(self, dataset)
            #only check first line for proper form
            break

        try: return open(dataset.file_name)
        except: return "This item contains no content"

    def sniff( self, filename ):
        """
        Checks for 'bedness'

        BED lines have three required fields and nine additional optional fields.
        The number of fields per line must be consistent throughout any single set of data in
        an annotation track.  The order of the optional fields is binding: lower-numbered
        fields must always be populated if higher-numbered fields are used.  The data type of
        all 12 columns is:
        1-str, 2-int, 3-int, 4-str, 5-int, 6-str, 7-int, 8-int, 9-int or list, 10-int, 11-list, 12-list

        For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format1

        >>> fname = get_test_fname( 'test_tab.bed' )
        >>> Bed().sniff( fname )
        True
        >>> fname = get_test_fname( 'interval1.bed' )
        >>> Bed().sniff( fname )
        True
        >>> fname = get_test_fname( 'complete.bed' )
        >>> Bed().sniff( fname )
        True
        """
        headers = get_headers( filename, '\t' )
        try:
            if not headers: return False
            for hdr in headers:
                if (hdr[0] == '' or hdr[0].startswith( '#' )):
                    continue
                valid_col1 = False
                if len(hdr) < 3 or len(hdr) > 12:
                    return False
                # 'prefix' (was 'str', which shadowed the builtin)
                for prefix in data.col1_startswith:
                    if hdr[0].lower().startswith(prefix):
                        valid_col1 = True
                        break
                if valid_col1:
                    try:
                        int( hdr[1] )
                        int( hdr[2] )
                    except:
                        return False
                    if len( hdr ) > 4:
                        #hdr[3] is a string, 'name', which defines the name of the BED line - difficult to test for this.
                        #hdr[4] is an int, 'score', a score between 0 and 1000.
                        try:
                            if int( hdr[4] ) < 0 or int( hdr[4] ) > 1000: return False
                        except:
                            return False
                    if len( hdr ) > 5:
                        #hdr[5] is strand
                        if hdr[5] not in data.valid_strand: return False
                    if len( hdr ) > 6:
                        #hdr[6] is thickStart, the starting position at which the feature is drawn thickly.
                        try: int( hdr[6] )
                        except: return False
                    if len( hdr ) > 7:
                        #hdr[7] is thickEnd, the ending position at which the feature is drawn thickly
                        try: int( hdr[7] )
                        except: return False
                    if len( hdr ) > 8:
                        #hdr[8] is itemRgb, an RGB value of the form R,G,B (e.g. 255,0,0). However, this could also be an int (e.g., 0)
                        try: int( hdr[8] )
                        except:
                            try: hdr[8].split(',')
                            except: return False
                    if len( hdr ) > 9:
                        #hdr[9] is blockCount, the number of blocks (exons) in the BED line.
                        try: block_count = int( hdr[9] )
                        except: return False
                    if len( hdr ) > 10:
                        #hdr[10] is blockSizes - A comma-separated list of the block sizes.
                        #Sometimes the block_sizes and block_starts lists end in extra commas
                        try: block_sizes = hdr[10].rstrip(',').split(',')
                        except: return False
                    if len( hdr ) > 11:
                        #hdr[11] is blockStarts - A comma-separated list of block starts.
                        try: block_starts = hdr[11].rstrip(',').split(',')
                        except: return False
                        if len(block_sizes) != block_count or len(block_starts) != block_count: return False
                else: return False
            return True
        except: return False

    def get_track_type( self ):
        """Track type and data-source mapping used by the visualization framework."""
        return "FeatureTrack", {"data": "interval_index", "index": "summary_tree"}
---|
559 | |
---|
class BedStrict( Bed ):
    """Tab delimited data in strict BED format - no non-standard columns allowed"""

    file_ext = "bedstrict"

    # Users may not switch a dataset away from this datatype.
    allow_datatype_change = False

    # All metadata elements are read-only: column positions are fixed by the
    # BED specification rather than guessed or user-editable.
    MetadataElement( name="chromCol", default=1, desc="Chrom column", readonly=True, param=metadata.MetadataParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", readonly=True, param=metadata.MetadataParameter ) #TODO: start and end should be able to be set to these or the proper thick[start/end]?
    MetadataElement( name="endCol", default=3, desc="End column", readonly=True, param=metadata.MetadataParameter )
    MetadataElement( name="strandCol", desc="Strand column (click box & select)", readonly=True, param=metadata.MetadataParameter, no_value=0, optional=True )
    MetadataElement( name="nameCol", desc="Name/Identifier column (click box & select)", readonly=True, param=metadata.MetadataParameter, no_value=0, optional=True )
    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False )

    def __init__( self, **kwd ):
        # Deliberately bypass Bed/Interval __init__ so the old-style UCSC
        # display app is never registered, then drop any inherited ones:
        # only new style display applications apply to this datatype.
        Tabular.__init__( self, **kwd )
        self.clear_display_apps()

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Assign fixed name/strand column positions once the column count is known."""
        Tabular.set_meta( self, dataset, overwrite = overwrite, **kwd ) #need column count first
        columns = dataset.metadata.columns
        if columns >= 4:
            dataset.metadata.nameCol = 4
            if columns >= 6:
                dataset.metadata.strandCol = 6

    def sniff( self, filename ):
        """Sniffing is disabled: proving strictness would require aggressively validating the entire file."""
        return False
---|
589 | |
---|
class Bed6( BedStrict ):
    """Tab delimited data in strict BED format - no non-standard columns allowed; column count forced to 6"""

    # Behaves exactly like BedStrict; the distinct extension lets tools
    # declare that they require/produce a six-column BED.
    file_ext = "bed6"
---|
594 | |
---|
class Bed12( BedStrict ):
    """Tab delimited data in strict BED format - no non-standard columns allowed; column count forced to 12"""

    # Behaves exactly like BedStrict; the distinct extension lets tools
    # declare that they require/produce a twelve-column BED.
    file_ext = "bed12"
---|
599 | |
---|
class _RemoteCallMixin:
    """Mixin for datatypes that hand data off to an external display site."""

    def _get_remote_call_url( self, redirect_url, site_name, dataset, type, app, base_url ):
        """Retrieve the URL to call out to an external site and retrieve data.
        This routes our external URL through a local galaxy instance which makes
        the data available, followed by redirecting to the remote site with a
        link back to the available information.
        """
        filename = '%s_%s' % ( type, site_name )
        internal_url = "%s" % url_for( controller='dataset', dataset_id=dataset.id, action='display_at', filename=filename )
        # A configured callback address takes precedence over the request's base URL.
        base_url = app.config.get( "display_at_callback", base_url )
        # Remote browsers generally cannot call back over https; force plain http.
        if base_url.startswith( 'https://' ):
            base_url = base_url.replace( 'https', 'http', 1 )
        display_url = urllib.quote_plus(
            "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at"
            % ( base_url, url_for( controller='root' ), dataset.id, type ) )
        return '%s?redirect_url=%s&display_url=%s' % ( internal_url, redirect_url, display_url )
---|
615 | |
---|
616 | class Gff( Tabular, _RemoteCallMixin ): |
---|
617 | """Tab delimited data in Gff format""" |
---|
618 | file_ext = "gff" |
---|
619 | column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ] |
---|
620 | |
---|
621 | """Add metadata elements""" |
---|
622 | MetadataElement( name="columns", default=9, desc="Number of columns", readonly=True, visible=False ) |
---|
623 | MetadataElement( name="column_types", default=['str','str','str','int','int','int','str','str','str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False ) |
---|
624 | |
---|
    def __init__( self, **kwd ):
        """Initialize datatype, by adding GBrowse display app"""
        Tabular.__init__(self, **kwd)
        # Register both old-style external display applications (UCSC and GBrowse)
        self.add_display_app( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )
        self.add_display_app( 'gbrowse', 'display in Gbrowse', 'as_gbrowse_display_file', 'gbrowse_links' )
---|
630 | def set_meta( self, dataset, overwrite = True, **kwd ): |
---|
631 | i = 0 |
---|
632 | for i, line in enumerate( file ( dataset.file_name ) ): |
---|
633 | line = line.rstrip('\r\n') |
---|
634 | if line and not line.startswith( '#' ): |
---|
635 | elems = line.split( '\t' ) |
---|
636 | if len(elems) == 9: |
---|
637 | try: |
---|
638 | int( elems[3] ) |
---|
639 | int( elems[4] ) |
---|
640 | break |
---|
641 | except: |
---|
642 | pass |
---|
643 | Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i ) |
---|
644 | def make_html_table( self, dataset, skipchars=[] ): |
---|
645 | """Create HTML table, used for displaying peek""" |
---|
646 | out = ['<table cellspacing="0" cellpadding="3">'] |
---|
647 | comments = [] |
---|
648 | try: |
---|
649 | # Generate column header |
---|
650 | out.append( '<tr>' ) |
---|
651 | for i, name in enumerate( self.column_names ): |
---|
652 | out.append( '<th>%s.%s</th>' % ( str( i+1 ), name ) ) |
---|
653 | out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) ) |
---|
654 | out.append( '</table>' ) |
---|
655 | out = "".join( out ) |
---|
656 | except Exception, exc: |
---|
657 | out = "Can't create peek %s" % exc |
---|
658 | return out |
---|
659 | def get_estimated_display_viewport( self, dataset ): |
---|
660 | """ |
---|
661 | Return a chrom, start, stop tuple for viewing a file. There are slight differences between gff 2 and gff 3 |
---|
662 | formats. This function should correctly handle both... |
---|
663 | """ |
---|
664 | viewport_feature_count = 100 # viewport should check at least 100 features; excludes comment lines |
---|
665 | max_line_count = max( viewport_feature_count, 500 ) # maximum number of lines to check; includes comment lines |
---|
666 | if self.displayable( dataset ): |
---|
667 | try: |
---|
668 | seqid = None |
---|
669 | start = sys.maxint |
---|
670 | stop = 0 |
---|
671 | fh = open( dataset.file_name ) |
---|
672 | while True: |
---|
673 | line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE ) |
---|
674 | if not line: break #EOF |
---|
675 | try: |
---|
676 | if line.startswith( '##sequence-region' ): # ##sequence-region IV 6000000 6030000 |
---|
677 | elems = line.rstrip( '\n\r' ).split() |
---|
678 | if len( elems ) > 3: |
---|
679 | # line looks like: |
---|
680 | # ##sequence-region ctg123 1 1497228 |
---|
681 | seqid = elems[1] # IV |
---|
682 | start = int( elems[2] )# 6000000 |
---|
683 | stop = int( elems[3] ) # 6030000 |
---|
684 | break #use location declared in file |
---|
685 | elif len( elems ) == 2 and elems[1].find( '..' ) > 0: |
---|
686 | # line looks like this: |
---|
687 | # ##sequence-region X:120000..140000 |
---|
688 | elems = elems[1].split( ':' ) |
---|
689 | seqid = elems[0] |
---|
690 | start = int( elems[1].split( '..' )[0] ) |
---|
691 | stop = int( elems[1].split( '..' )[1] ) |
---|
692 | break #use location declared in file |
---|
693 | else: |
---|
694 | log.exception( "line (%s) uses an unsupported ##sequence-region definition." % str( line ) ) |
---|
695 | #break #no break, if bad definition, we try another method |
---|
696 | elif line.startswith("browser position"): |
---|
697 | # Allow UCSC style browser and track info in the GFF file |
---|
698 | pos_info = line.split()[-1] |
---|
699 | seqid, startend = pos_info.split(":") |
---|
700 | start, stop = map( int, startend.split("-") ) |
---|
701 | break #use location declared in file |
---|
702 | elif True not in map( line.startswith, ( '#', 'track', 'browser' ) ):# line.startswith() does not accept iterator in python2.4 |
---|
703 | viewport_feature_count -= 1 |
---|
704 | elems = line.rstrip( '\n\r' ).split( '\t' ) |
---|
705 | if len( elems ) > 3: |
---|
706 | if not seqid: |
---|
707 | # We can only set the viewport for a single chromosome |
---|
708 | seqid = elems[0] |
---|
709 | if seqid == elems[0]: |
---|
710 | # Make sure we have not spanned chromosomes |
---|
711 | start = min( start, int( elems[3] ) ) |
---|
712 | stop = max( stop, int( elems[4] ) ) |
---|
713 | except: |
---|
714 | #most likely start/stop is not an int or not enough fields |
---|
715 | pass |
---|
716 | #make sure we are at the next new line |
---|
717 | readline_count = VIEWPORT_MAX_READS_PER_LINE |
---|
718 | while line.rstrip( '\n\r' ) == line: |
---|
719 | assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id ) |
---|
720 | line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE ) |
---|
721 | if not line: break #EOF |
---|
722 | readline_count -= 1 |
---|
723 | max_line_count -= 1 |
---|
724 | if not viewport_feature_count or not max_line_count: |
---|
725 | #exceeded viewport or total line count to check |
---|
726 | break |
---|
727 | if seqid is not None: |
---|
728 | return ( seqid, str( start ), str( stop ) ) #Necessary to return strings? |
---|
729 | except Exception, e: |
---|
730 | #unexpected error |
---|
731 | log.exception( str( e ) ) |
---|
732 | return ( None, None, None ) #could not determine viewport |
---|
733 | def ucsc_links( self, dataset, type, app, base_url ): |
---|
734 | ret_val = [] |
---|
735 | seqid, start, stop = self.get_estimated_display_viewport( dataset ) |
---|
736 | if seqid is not None: |
---|
737 | for site_name, site_url in util.get_ucsc_by_build( dataset.dbkey ): |
---|
738 | if site_name in app.config.ucsc_display_sites: |
---|
739 | redirect_url = urllib.quote_plus( |
---|
740 | "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s" % |
---|
741 | ( site_url, dataset.dbkey, seqid, start, stop ) ) |
---|
742 | link = self._get_remote_call_url( redirect_url, site_name, dataset, type, app, base_url ) |
---|
743 | ret_val.append( ( site_name, link ) ) |
---|
744 | return ret_val |
---|
745 | def gbrowse_links( self, dataset, type, app, base_url ): |
---|
746 | ret_val = [] |
---|
747 | seqid, start, stop = self.get_estimated_display_viewport( dataset ) |
---|
748 | if seqid is not None: |
---|
749 | for site_name, site_url in util.get_gbrowse_sites_by_build( dataset.dbkey ): |
---|
750 | if site_name in app.config.gbrowse_display_sites: |
---|
751 | if seqid.startswith( 'chr' ) and len ( seqid ) > 3: |
---|
752 | seqid = seqid[3:] |
---|
753 | redirect_url = urllib.quote_plus( "%s/?q=%s:%s..%s&eurl=%%s" % ( site_url, seqid, start, stop ) ) |
---|
754 | link = self._get_remote_call_url( redirect_url, site_name, dataset, type, app, base_url ) |
---|
755 | ret_val.append( ( site_name, link ) ) |
---|
756 | return ret_val |
---|
757 | def sniff( self, filename ): |
---|
758 | """ |
---|
759 | Determines whether the file is in gff format |
---|
760 | |
---|
761 | GFF lines have nine required fields that must be tab-separated. |
---|
762 | |
---|
763 | For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format3 |
---|
764 | |
---|
765 | >>> fname = get_test_fname( 'gff_version_3.gff' ) |
---|
766 | >>> Gff().sniff( fname ) |
---|
767 | False |
---|
768 | >>> fname = get_test_fname( 'test.gff' ) |
---|
769 | >>> Gff().sniff( fname ) |
---|
770 | True |
---|
771 | """ |
---|
772 | headers = get_headers( filename, '\t' ) |
---|
773 | try: |
---|
774 | if len(headers) < 2: |
---|
775 | return False |
---|
776 | for hdr in headers: |
---|
777 | if hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '2' ) < 0: |
---|
778 | return False |
---|
779 | if hdr and hdr[0] and not hdr[0].startswith( '#' ): |
---|
780 | if len(hdr) != 9: |
---|
781 | return False |
---|
782 | try: |
---|
783 | int( hdr[3] ) |
---|
784 | int( hdr[4] ) |
---|
785 | except: |
---|
786 | return False |
---|
787 | if hdr[5] != '.': |
---|
788 | try: |
---|
789 | score = float( hdr[5] ) |
---|
790 | except: |
---|
791 | return False |
---|
792 | if hdr[6] not in data.valid_strand: |
---|
793 | return False |
---|
794 | return True |
---|
795 | except: |
---|
796 | return False |
---|
797 | |
---|
798 | def get_track_type( self ): |
---|
799 | return "FeatureTrack", {"data": "interval_index", "index": "summary_tree"} |
---|
800 | |
---|
801 | |
---|
class Gff3( Gff ):
    """Tab delimited data in Gff3 format"""
    file_ext = "gff3"
    valid_gff3_strand = ['+', '-', '.', '?']
    valid_gff3_phase = ['.', '0', '1', '2']
    column_names = [ 'Seqid', 'Source', 'Type', 'Start', 'End', 'Score', 'Strand', 'Phase', 'Attributes' ]

    """Add metadata elements"""
    MetadataElement( name="column_types", default=['str','str','str','int','int','float','str','int','list'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )

    def __init__(self, **kwd):
        """Initialize datatype, by adding GBrowse display app"""
        Gff.__init__(self, **kwd)
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Set tabular metadata, skipping everything before the first line that
        is a plausible GFF3 feature (valid coordinates, strand and phase)."""
        i = 0
        for i, line in enumerate( file ( dataset.file_name ) ):
            line = line.rstrip('\r\n')
            if line and not line.startswith( '#' ):
                elems = line.split( '\t' )
                valid_start = False
                valid_end = False
                if len( elems ) == 9:
                    # start/end stay None when the column is the undefined marker '.';
                    # previously they were left unbound in that case and the
                    # comparison below raised a NameError.
                    start = None
                    end = None
                    try:
                        start = int( elems[3] )
                        valid_start = True
                    except:
                        if elems[3] == '.':
                            valid_start = True
                    try:
                        end = int( elems[4] )
                        valid_end = True
                    except:
                        if elems[4] == '.':
                            valid_end = True
                    strand = elems[6]
                    phase = elems[7]
                    # Only compare coordinates when both are numeric; the GFF3 spec
                    # requires start <= end (start == end is a valid one-base feature,
                    # which the previous strict '<' wrongly rejected).
                    if valid_start and valid_end and ( start is None or end is None or start <= end ) and strand in self.valid_gff3_strand and phase in self.valid_gff3_phase:
                        break
        Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i )
    def sniff( self, filename ):
        """
        Determines whether the file is in gff version 3 format

        GFF 3 format:

        1) adds a mechanism for representing more than one level
        of hierarchical grouping of features and subfeatures.
        2) separates the ideas of group membership and feature name/id
        3) constrains the feature type field to be taken from a controlled
        vocabulary.
        4) allows a single feature, such as an exon, to belong to more than
        one group at a time.
        5) provides an explicit convention for pairwise alignments
        6) provides an explicit convention for features that occupy disjunct regions

        The format consists of 9 columns, separated by tabs (NOT spaces).

        Undefined fields are replaced with the "." character, as described in the original GFF spec.

        For complete details see http://song.sourceforge.net/gff3.shtml

        >>> fname = get_test_fname( 'test.gff' )
        >>> Gff3().sniff( fname )
        False
        >>> fname = get_test_fname('gff_version_3.gff')
        >>> Gff3().sniff( fname )
        True
        """
        headers = get_headers( filename, '\t' )
        try:
            if len(headers) < 2:
                return False
            for hdr in headers:
                # An explicit version directive decides immediately, either way.
                if hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '3' ) >= 0:
                    return True
                elif hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '3' ) < 0:
                    return False
                # Header comments may have been stripped, so inspect the data
                if hdr and hdr[0] and not hdr[0].startswith( '#' ):
                    if len(hdr) != 9:
                        return False
                    try:
                        int( hdr[3] )
                    except:
                        if hdr[3] != '.':
                            return False
                    try:
                        int( hdr[4] )
                    except:
                        if hdr[4] != '.':
                            return False
                    if hdr[5] != '.':
                        try:
                            score = float( hdr[5] )
                        except:
                            return False
                    if hdr[6] not in self.valid_gff3_strand:
                        return False
                    if hdr[7] not in self.valid_gff3_phase:
                        return False
            return True
        except:
            return False
---|
905 | |
---|
class Gtf( Gff ):
    """Tab delimited data in Gtf format"""
    file_ext = "gtf"
    column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Attributes' ]

    """Add metadata elements"""
    MetadataElement( name="columns", default=9, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=['str','str','str','int','int','float','str','int','list'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )


    def sniff( self, filename ):
        """
        Determines whether the file is in gtf format

        GTF lines have nine required fields that must be tab-separated. The first eight GTF fields are the same as GFF.
        The group field has been expanded into a list of attributes. Each attribute consists of a type/value pair.
        Attributes must end in a semi-colon, and be separated from any following attribute by exactly one space.
        The attribute list must begin with the two mandatory attributes:

            gene_id value - A globally unique identifier for the genomic source of the sequence.
            transcript_id value - A globally unique identifier for the predicted transcript.

        For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format4

        >>> fname = get_test_fname( '1.bed' )
        >>> Gtf().sniff( fname )
        False
        >>> fname = get_test_fname( 'test.gff' )
        >>> Gtf().sniff( fname )
        False
        >>> fname = get_test_fname( 'test.gtf' )
        >>> Gtf().sniff( fname )
        True
        """
        headers = get_headers( filename, '\t' )
        try:
            if len( headers ) < 2:
                return False
            for fields in headers:
                # A ##gff-version directive for anything other than version 2 disqualifies the file.
                if fields and fields[0].startswith( '##gff-version' ) and fields[0].find( '2' ) < 0:
                    return False
                # Skip blank and comment lines; every remaining line must validate.
                if not fields or not fields[0] or fields[0].startswith( '#' ):
                    continue
                if len( fields ) != 9:
                    return False
                try:
                    int( fields[3] )
                    int( fields[4] )
                except:
                    return False
                if fields[5] != '.':
                    try:
                        float( fields[5] )
                    except:
                        return False
                if fields[6] not in data.valid_strand:
                    return False

                # Check attributes for gene_id, transcript_id
                attributes = fields[8].split( ";" )
                if len( attributes ) < 2:
                    return False
                try:
                    # Imprecise: should check for a single space per the spec.
                    attr_name, attr_value = attributes[0].split( " " )
                except:
                    return False
                if attr_name != 'gene_id':
                    return False
                try:
                    # Imprecise: should check for a single space per the spec.
                    attr_name, attr_value = attributes[1][1:].split( " " )
                except:
                    return False
                if attr_name != 'transcript_id':
                    return False
            return True
        except:
            return False
---|
985 | |
---|
986 | |
---|
987 | class Wiggle( Tabular, _RemoteCallMixin ): |
---|
988 | """Tab delimited data in wiggle format""" |
---|
989 | file_ext = "wig" |
---|
990 | |
---|
991 | MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False ) |
---|
992 | |
---|
993 | def __init__( self, **kwd ): |
---|
994 | Tabular.__init__( self, **kwd ) |
---|
995 | self.add_display_app( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' ) |
---|
996 | self.add_display_app( 'gbrowse', 'display in Gbrowse', 'as_gbrowse_display_file', 'gbrowse_links' ) |
---|
997 | def get_estimated_display_viewport( self, dataset ): |
---|
998 | """Return a chrom, start, stop tuple for viewing a file.""" |
---|
999 | viewport_feature_count = 100 # viewport should check at least 100 features; excludes comment lines |
---|
1000 | max_line_count = max( viewport_feature_count, 500 ) # maximum number of lines to check; includes comment lines |
---|
1001 | if self.displayable( dataset ): |
---|
1002 | try: |
---|
1003 | chrom = None |
---|
1004 | start = sys.maxint |
---|
1005 | end = 0 |
---|
1006 | span = 1 |
---|
1007 | step = None |
---|
1008 | fh = open( dataset.file_name ) |
---|
1009 | while True: |
---|
1010 | line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE ) |
---|
1011 | if not line: break #EOF |
---|
1012 | try: |
---|
1013 | if line.startswith( "browser" ): |
---|
1014 | chr_info = line.rstrip( '\n\r' ).split()[-1] |
---|
1015 | chrom, coords = chr_info.split( ":" ) |
---|
1016 | start, end = map( int, coords.split( "-" ) ) |
---|
1017 | break # use the browser line |
---|
1018 | # variableStep chrom=chr20 |
---|
1019 | if line and ( line.lower().startswith( "variablestep" ) or line.lower().startswith( "fixedstep" ) ): |
---|
1020 | if chrom is not None: break #different chrom or different section of the chrom |
---|
1021 | chrom = line.rstrip( '\n\r' ).split("chrom=")[1].split()[0] |
---|
1022 | if 'span=' in line: |
---|
1023 | span = int( line.rstrip( '\n\r' ).split("span=")[1].split()[0] ) |
---|
1024 | if 'step=' in line: |
---|
1025 | step = int( line.rstrip( '\n\r' ).split("step=")[1].split()[0] ) |
---|
1026 | start = int( line.rstrip( '\n\r' ).split("start=")[1].split()[0] ) |
---|
1027 | else: |
---|
1028 | fields = line.rstrip( '\n\r' ).split() |
---|
1029 | if fields: |
---|
1030 | if step is not None: |
---|
1031 | if not end: |
---|
1032 | end = start + span |
---|
1033 | else: |
---|
1034 | end += step |
---|
1035 | else: |
---|
1036 | start = min( int( fields[0] ), start ) |
---|
1037 | end = max( end, int( fields[0] ) + span ) |
---|
1038 | viewport_feature_count -= 1 |
---|
1039 | except: |
---|
1040 | pass |
---|
1041 | #make sure we are at the next new line |
---|
1042 | readline_count = VIEWPORT_MAX_READS_PER_LINE |
---|
1043 | while line.rstrip( '\n\r' ) == line: |
---|
1044 | assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id ) |
---|
1045 | line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE ) |
---|
1046 | if not line: break #EOF |
---|
1047 | readline_count -= 1 |
---|
1048 | max_line_count -= 1 |
---|
1049 | if not viewport_feature_count or not max_line_count: |
---|
1050 | #exceeded viewport or total line count to check |
---|
1051 | break |
---|
1052 | if chrom is not None: |
---|
1053 | return ( chrom, str( start ), str( end ) ) #Necessary to return strings? |
---|
1054 | except Exception, e: |
---|
1055 | #unexpected error |
---|
1056 | log.exception( str( e ) ) |
---|
1057 | return ( None, None, None ) #could not determine viewport |
---|
1058 | def gbrowse_links( self, dataset, type, app, base_url ): |
---|
1059 | ret_val = [] |
---|
1060 | chrom, start, stop = self.get_estimated_display_viewport( dataset ) |
---|
1061 | if chrom is not None: |
---|
1062 | for site_name, site_url in util.get_gbrowse_sites_by_build( dataset.dbkey ): |
---|
1063 | if site_name in app.config.gbrowse_display_sites: |
---|
1064 | if chrom.startswith( 'chr' ) and len ( chrom ) > 3: |
---|
1065 | chrom = chrom[3:] |
---|
1066 | redirect_url = urllib.quote_plus( "%s/?q=%s:%s..%s&eurl=%%s" % ( site_url, chrom, start, stop ) ) |
---|
1067 | link = self._get_remote_call_url( redirect_url, site_name, dataset, type, app, base_url ) |
---|
1068 | ret_val.append( ( site_name, link ) ) |
---|
1069 | return ret_val |
---|
1070 | def ucsc_links( self, dataset, type, app, base_url ): |
---|
1071 | ret_val = [] |
---|
1072 | chrom, start, stop = self.get_estimated_display_viewport( dataset ) |
---|
1073 | if chrom is not None: |
---|
1074 | for site_name, site_url in util.get_ucsc_by_build( dataset.dbkey ): |
---|
1075 | if site_name in app.config.ucsc_display_sites: |
---|
1076 | redirect_url = urllib.quote_plus( "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s" % ( site_url, dataset.dbkey, chrom, start, stop ) ) |
---|
1077 | link = self._get_remote_call_url( redirect_url, site_name, dataset, type, app, base_url ) |
---|
1078 | ret_val.append( ( site_name, link ) ) |
---|
1079 | return ret_val |
---|
1080 | def make_html_table( self, dataset ): |
---|
1081 | return Tabular.make_html_table( self, dataset, skipchars=['track', '#'] ) |
---|
1082 | def set_meta( self, dataset, overwrite = True, **kwd ): |
---|
1083 | max_data_lines = None |
---|
1084 | i = 0 |
---|
1085 | for i, line in enumerate( file ( dataset.file_name ) ): |
---|
1086 | line = line.rstrip('\r\n') |
---|
1087 | if line and not line.startswith( '#' ): |
---|
1088 | elems = line.split( '\t' ) |
---|
1089 | try: |
---|
1090 | float( elems[0] ) #"Wiggle track data values can be integer or real, positive or negative values" |
---|
1091 | break |
---|
1092 | except: |
---|
1093 | do_break = False |
---|
1094 | for col_startswith in data.col1_startswith: |
---|
1095 | if elems[0].lower().startswith( col_startswith ): |
---|
1096 | do_break = True |
---|
1097 | break |
---|
1098 | if do_break: |
---|
1099 | break |
---|
1100 | if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize: |
---|
1101 | #we'll arbitrarily only use the first 100 data lines in this wig file to calculate tabular attributes (column types) |
---|
1102 | #this should be sufficient, except when we have mixed wig track types (bed, variable, fixed), |
---|
1103 | # but those cases are not a single table that would have consistant column definitions |
---|
1104 | #optional metadata values set in Tabular class will be 'None' |
---|
1105 | max_data_lines = 100 |
---|
1106 | Tabular.set_meta( self, dataset, overwrite = overwrite, skip = i, max_data_lines = max_data_lines ) |
---|
1107 | def sniff( self, filename ): |
---|
1108 | """ |
---|
1109 | Determines wether the file is in wiggle format |
---|
1110 | |
---|
1111 | The .wig format is line-oriented. Wiggle data is preceeded by a track definition line, |
---|
1112 | which adds a number of options for controlling the default display of this track. |
---|
1113 | Following the track definition line is the track data, which can be entered in several |
---|
1114 | different formats. |
---|
1115 | |
---|
1116 | The track definition line begins with the word 'track' followed by the track type. |
---|
1117 | The track type with version is REQUIRED, and it currently must be wiggle_0. For example, |
---|
1118 | track type=wiggle_0... |
---|
1119 | |
---|
1120 | For complete details see http://genome.ucsc.edu/goldenPath/help/wiggle.html |
---|
1121 | |
---|
1122 | >>> fname = get_test_fname( 'interval1.bed' ) |
---|
1123 | >>> Wiggle().sniff( fname ) |
---|
1124 | False |
---|
1125 | >>> fname = get_test_fname( 'wiggle.wig' ) |
---|
1126 | >>> Wiggle().sniff( fname ) |
---|
1127 | True |
---|
1128 | """ |
---|
1129 | headers = get_headers( filename, None ) |
---|
1130 | try: |
---|
1131 | for hdr in headers: |
---|
1132 | if len(hdr) > 1 and hdr[0] == 'track' and hdr[1].startswith('type=wiggle'): |
---|
1133 | return True |
---|
1134 | return False |
---|
1135 | except: |
---|
1136 | return False |
---|
1137 | def get_track_window(self, dataset, data, start, end): |
---|
1138 | """ |
---|
1139 | Assumes we have a numpy file. |
---|
1140 | """ |
---|
1141 | # Maybe if we import here people will still be able to use Galaxy when numpy kills it |
---|
1142 | pkg_resources.require("numpy>=1.2.1") |
---|
1143 | #from numpy.lib import format |
---|
1144 | import numpy |
---|
1145 | |
---|
1146 | range = end - start |
---|
1147 | # Determine appropriate resolution to plot ~1000 points |
---|
1148 | resolution = ( 10 ** math.ceil( math.log10( range / 1000 ) ) ) |
---|
1149 | # Restrict to valid range |
---|
1150 | resolution = min( resolution, 100000 ) |
---|
1151 | resolution = max( resolution, 1 ) |
---|
1152 | # Memory map the array (don't load all the data) |
---|
1153 | data = numpy.load( data ) |
---|
1154 | # Grab just what we need |
---|
1155 | t_start = math.floor( start / resolution ) |
---|
1156 | t_end = math.ceil( end / resolution ) |
---|
1157 | x = numpy.arange( t_start, t_end ) * resolution |
---|
1158 | y = data[ t_start : t_end ] |
---|
1159 | |
---|
1160 | return zip(x.tolist(), y.tolist()) |
---|
1161 | def get_track_resolution( self, dataset, start, end): |
---|
1162 | range = end - start |
---|
1163 | # Determine appropriate resolution to plot ~1000 points |
---|
1164 | resolution = math.ceil( 10 ** math.ceil( math.log10( range / 1000 ) ) ) |
---|
1165 | # Restrict to valid range |
---|
1166 | resolution = min( resolution, 100000 ) |
---|
1167 | resolution = max( resolution, 1 ) |
---|
1168 | return resolution |
---|
1169 | def get_track_type( self ): |
---|
1170 | return "LineTrack", {"data": "array_tree"} |
---|
1171 | |
---|
class CustomTrack ( Tabular ):
    """UCSC CustomTrack"""
    file_ext = "customtrack"

    def __init__(self, **kwd):
        """Initialize interval datatype, by adding UCSC display app"""
        Tabular.__init__(self, **kwd)
        self.add_display_app ( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )
    def set_meta( self, dataset, overwrite = True, **kwd ):
        # skip=1: the first line of a custom track is always the 'track ...' definition line.
        Tabular.set_meta( self, dataset, overwrite = overwrite, skip = 1 )
    def display_peek( self, dataset ):
        """Returns formated html of peek"""
        return Tabular.make_html_table( self, dataset, skipchars=['track', '#'] )
    def get_estimated_display_viewport( self, dataset, chrom_col = None, start_col = None, end_col = None ):
        """Return a chrom, start, stop tuple for viewing a file.

        Returns ( None, None, None ) when no viewport can be determined.
        chrom_col/start_col/end_col are accepted for interface compatibility
        but are not used by this implementation.
        """
        #FIXME: only BED and WIG custom tracks are currently supported
        #As per previously existing behavior, viewport will only be over the first intervals
        max_line_count = 100 # maximum number of lines to check; includes comment lines
        variable_step_wig = False
        chrom = None
        span = 1
        if self.displayable( dataset ):
            try:
                # NOTE(review): fh is never explicitly closed; relies on garbage collection.
                fh = open( dataset.file_name )
                while True:
                    line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                    if not line: break #EOF
                    if not line.startswith( '#' ):
                        try:
                            if variable_step_wig:
                                # Inside a variableStep section: first field is the position.
                                fields = line.rstrip().split()
                                if len( fields ) == 2:
                                    start = int( fields[ 0 ] )
                                    return ( chrom, str( start ), str( start + span ) )
                            elif line and ( line.lower().startswith( "variablestep" ) or line.lower().startswith( "fixedstep" ) ):
                                # Wiggle declaration line: pull chrom (and span/start if present).
                                chrom = line.rstrip( '\n\r' ).split("chrom=")[1].split()[0]
                                if 'span=' in line:
                                    span = int( line.rstrip( '\n\r' ).split("span=")[1].split()[0] )
                                if 'start=' in line:
                                    # fixedStep declares its start explicitly; viewport is known now.
                                    start = int( line.rstrip( '\n\r' ).split("start=")[1].split()[0] )
                                    return ( chrom, str( start ), str( start + span ) )
                                else:
                                    # variableStep: position comes from the next data line.
                                    variable_step_wig = True
                            else:
                                # Otherwise assume BED-like: chrom, start, end in the first three columns.
                                fields = line.rstrip().split( '\t' )
                                if len( fields ) >= 3:
                                    chrom = fields[ 0 ]
                                    start = int( fields[ 1 ] )
                                    end = int( fields[ 2 ] )
                                    return ( chrom, str( start ), str( end ) )
                        except Exception:
                            #most likely a non-integer field has been encountered for start / stop
                            continue
                    #make sure we are at the next new line: a line longer than the read
                    #buffer comes back without a trailing newline, so keep reading until
                    #the rest of that physical line has been consumed.
                    readline_count = VIEWPORT_MAX_READS_PER_LINE
                    while line.rstrip( '\n\r' ) == line:
                        assert readline_count > 0, Exception( 'Viewport readline count exceeded for dataset %s.' % dataset.id )
                        line = fh.readline( VIEWPORT_READLINE_BUFFER_SIZE )
                        if not line: break #EOF
                        readline_count -= 1
                    max_line_count -= 1
                    if not max_line_count:
                        #exceeded viewport or total line count to check
                        break
            except Exception, e:
                #unexpected error
                log.exception( str( e ) )
        return ( None, None, None ) #could not determine viewport
    def ucsc_links( self, dataset, type, app, base_url ):
        """Return (site_name, link) pairs for configured UCSC sites covering this dataset's build."""
        ret_val = []
        chrom, start, stop = self.get_estimated_display_viewport(dataset)
        if chrom is not None:
            for site_name, site_url in util.get_ucsc_by_build(dataset.dbkey):
                if site_name in app.config.ucsc_display_sites:
                    internal_url = "%s" % url_for( controller='dataset', dataset_id=dataset.id, action='display_at', filename='ucsc_' + site_name )
                    # External display sites generally cannot fetch data from us over https.
                    if base_url.startswith( 'https://' ):
                        base_url = base_url.replace( 'https', 'http', 1 )
                    display_url = urllib.quote_plus( "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at" % (base_url, url_for( controller='root' ), dataset.id, type) )
                    redirect_url = urllib.quote_plus( "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s" % (site_url, dataset.dbkey, chrom, start, stop ) )
                    link = '%s?redirect_url=%s&display_url=%s' % ( internal_url, redirect_url, display_url )
                    ret_val.append( (site_name, link) )
        return ret_val
    def sniff( self, filename ):
        """
        Determines whether the file is in customtrack format.

        CustomTrack files are built within Galaxy and are basically bed or interval files with the first line looking
        something like this.

        track name="User Track" description="User Supplied Track (from Galaxy)" color=0,0,0 visibility=1

        >>> fname = get_test_fname( 'complete.bed' )
        >>> CustomTrack().sniff( fname )
        False
        >>> fname = get_test_fname( 'ucsc.customtrack' )
        >>> CustomTrack().sniff( fname )
        True
        """
        headers = get_headers( filename, None )
        first_line = True
        for hdr in headers:
            if first_line:
                # The first line must be a 'track' definition carrying both
                # color= and visibility= attributes (as written by Galaxy).
                first_line = False
                try:
                    if hdr[0].startswith('track'):
                        color_found = False
                        visibility_found = False
                        for elem in hdr[1:]:
                            if elem.startswith('color'): color_found = True
                            if elem.startswith('visibility'): visibility_found = True
                            if color_found and visibility_found: break
                        if not color_found or not visibility_found: return False
                    else: return False
                except: return False
            else:
                # Remaining non-comment lines must look interval-like:
                # at least 3 fields with integer start/end.
                try:
                    if hdr[0] and not hdr[0].startswith( '#' ):
                        if len( hdr ) < 3:
                            return False
                        try:
                            int( hdr[1] )
                            int( hdr[2] )
                        except:
                            return False
                except:
                    return False
        return True
---|
1299 | |
---|
if __name__ == '__main__':
    # Run the doctests embedded in the sniff methods above.
    import doctest
    import sys
    doctest.testmod( sys.modules[ __name__ ] )
---|