1 | """ |
---|
2 | Data providers for tracks visualizations. |
---|
3 | """ |
---|
4 | |
---|
from math import floor, ceil, log
import pkg_resources
pkg_resources.require( "bx-python" )
pkg_resources.require( "pysam" )
pkg_resources.require( "numpy" )
from bx.interval_index_file import Indexes
from bx.arrays.array_tree import FileArrayTreeDict
from galaxy.util.lrucache import LRUCache
from galaxy.visualization.tracks.summary import *
from galaxy.datatypes.tabular import Vcf
from galaxy.datatypes.interval import Bed, Gff
from pysam import csamtools

MAX_VALS = 5000 # Only display first MAX_VALS features.

class TracksDataProvider( object ):
    """ Base class for tracks data providers. """

    """
    Mapping from column name to payload data; this mapping is used to create
    filters. Key is the column name; value is a dict with the keys 'index'
    (the position of the column's data in the payload) and 'name' (the
    filter's display name). E.g. this defines a 'Score' filter over payload
    index 5 for the column named 4:

        col_name_data_attr_mapping = { 4 : { 'index': 5, 'name': 'Score' } }
    """
    col_name_data_attr_mapping = {}
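
    # A minimal sketch of how a subclass might use this mapping; the class
    # name and column details below are illustrative, not part of Galaxy:
    #
    #   class ScoreFilterProvider( TracksDataProvider ):
    #       col_name_data_attr_mapping = { 4 : { 'index': 5, 'name': 'Score' } }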

    def __init__( self, converted_dataset=None, original_dataset=None ):
        """ Create basic data provider. """
        self.converted_dataset = converted_dataset
        self.original_dataset = original_dataset

    def get_data( self, chrom, start, end, **kwargs ):
        """ Returns data in region defined by chrom, start, and end. """
        # Override.
        pass

    def get_filters( self ):
        """
        Returns filters for provider's data. Return value is a list of
        filters; each filter is a dictionary with the keys 'name', 'index',
        and 'type'. NOTE: this method uses the original dataset's datatype
        and metadata to create the filters.
        """
        # Get column names.
        try:
            column_names = self.original_dataset.datatype.column_names
        except AttributeError:
            try:
                column_names = range( self.original_dataset.metadata.columns )
            except: # Give up.
                return []

        # Dataset must have column types; if not, cannot create filters.
        try:
            column_types = self.original_dataset.metadata.column_types
        except AttributeError:
            return []

        # Create and return filters.
        filters = []
        if self.original_dataset.metadata.viz_filter_cols:
            for viz_col_index in self.original_dataset.metadata.viz_filter_cols:
                col_name = column_names[ viz_col_index ]
                # Make sure that column has a mapped index. If not, do not add filter.
                try:
                    attrs = self.col_name_data_attr_mapping[ col_name ]
                except KeyError:
                    continue
                filters.append(
                    { 'name' : attrs[ 'name' ], 'type' : column_types[viz_col_index],
                      'index' : attrs[ 'index' ] } )
        return filters
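
    # For example, with the mapping sketched above, a dataset whose metadata
    # sets viz_filter_cols = [4] and whose column_types[4] == 'int' would
    # yield (illustrative values only):
    #
    #   [ { 'name': 'Score', 'type': 'int', 'index': 5 } ]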

class SummaryTreeDataProvider( TracksDataProvider ):
    """
    Summary tree data provider for the Galaxy track browser.
    """

    CACHE = LRUCache(20) # Store 20 recently accessed indices for performance.

    def get_summary( self, chrom, start, end, **kwargs ):
        """
        Returns summary data for the region. Returns None if the region
        should be displayed at full detail, and "no_detail" if it should be
        drawn without detail.
        """
        filename = self.converted_dataset.file_name
        st = self.CACHE[filename]
        if st is None:
            st = summary_tree_from_file( self.converted_dataset.file_name )
            self.CACHE[filename] = st

        # If chrom is not found in blocks, try removing the first three
        # characters (e.g. 'chr') and see if that works. This enables the
        # provider to handle chromosome names defined as chrXXX and as XXX.
        if chrom in st.chrom_blocks:
            pass
        elif chrom[3:] in st.chrom_blocks:
            chrom = chrom[3:]
        else:
            return None

        resolution = max(1, ceil(float(kwargs['resolution'])))

        level = ceil( log( resolution, st.block_size ) ) - 1
        level = int(max( level, 0 ))
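        # E.g. (illustrative numbers) with block_size 1000 and a requested
        # resolution of 100000:
        #   level = ceil( log( 100000, 1000 ) ) - 1 = ceil( 1.67 ) - 1 = 1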
        if level <= 0:
            return None

        stats = st.chrom_stats[chrom]
        results = st.query(chrom, int(start), int(end), level)
        if results == "detail":
            return None
        elif results == "draw" or level <= 1:
            return "no_detail"
        else:
            return results, stats[level]["max"], stats[level]["avg"], stats[level]["delta"]

class VcfDataProvider( TracksDataProvider ):
    """
    VCF data provider for the Galaxy track browser.

    Payload format:
    [ uid (offset), start, end, ID, reference base(s), alternate base(s), quality score ]
    """

    col_name_data_attr_mapping = { 'Qual' : { 'index': 6, 'name': 'Qual' } }
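
    # For example, an (illustrative) VCF record such as
    #
    #   20  14370  rs6054257  G  A  29  ...
    #
    # would produce a payload like [ <offset>, <start>, <end>, 'rs6054257', 'G', 'A', 29 ],
    # where start and end come from the interval index, not the VCF line itself.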

    def get_data( self, chrom, start, end, **kwargs ):
        """ Returns data in region defined by chrom, start, and end. """
        start, end = int(start), int(end)
        source = open( self.original_dataset.file_name )
        index = Indexes( self.converted_dataset.file_name )
        results = []
        count = 0
        message = None

        # If chrom is not found in indexes, try removing the first three
        # characters (e.g. 'chr') and see if that works. This enables the
        # provider to handle chromosome names defined as chrXXX and as XXX.
        chrom = str(chrom)
        if chrom not in index.indexes and chrom[3:] in index.indexes:
            chrom = chrom[3:]

        for start, end, offset in index.find(chrom, start, end):
            if count >= MAX_VALS:
                message = "Only the first %s features are being displayed." % MAX_VALS
                break
            count += 1
            source.seek(offset)
            feature = source.readline().split()

            payload = [ offset, start, end,
                        # ID:
                        feature[2],
                        # Reference base(s):
                        feature[3],
                        # Alternate base(s):
                        feature[4],
                        # Phred quality score:
                        int( feature[5] ) ]
            results.append(payload)

        return { 'data_type' : 'vcf', 'data': results, 'message': message }

class BamDataProvider( TracksDataProvider ):
    """
    Provides access to intervals from a sorted, indexed BAM file.
    """
    def get_data( self, chrom, start, end, **kwargs ):
        """
        Fetch intervals in the region.
        """
        start, end = int(start), int(end)
        no_detail = "no_detail" in kwargs
        # Attempt to open the BAM file with its index.
        bamfile = csamtools.Samfile( filename=self.original_dataset.file_name, mode='rb', index_filename=self.converted_dataset.file_name )
        message = None
        try:
            data = bamfile.fetch(start=start, end=end, reference=chrom)
        except ValueError:
            # Some BAM files do not prefix chromosome names with 'chr'; try without it.
            if chrom.startswith( 'chr' ):
                try:
                    data = bamfile.fetch( start=start, end=end, reference=chrom[3:] )
                except ValueError:
                    return None
            else:
                return None
        # Encode reads as list of dictionaries.
        results = []
        paired_pending = {}
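        # Each entry appended to results below is either a single read,
        #   [ qname, start, end, seq, cigar ],
        # or a read pair,
        #   [ qname, pair_start, pair_end, seq, cigar, r1, r2 ],
        # where r1 and r2 hold each mate's [ start, end(, seq) ].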
        for read in data:
            if len(results) >= MAX_VALS:
                message = "Only the first %s pairs are being displayed." % MAX_VALS
                break
            qname = read.qname
            seq = read.seq
            read_len = sum( [cig[1] for cig in read.cigar] ) # Use cigar to determine length.
            if read.is_proper_pair:
                if qname in paired_pending: # The read already in the dict is always the first mate.
                    pair = paired_pending[qname]
                    results.append( [ qname, pair['start'], read.pos + read_len, seq, read.cigar, [pair['start'], pair['end'], pair['seq']], [read.pos, read.pos + read_len, seq] ] )
                    del paired_pending[qname]
                else:
                    paired_pending[qname] = { 'start': read.pos, 'end': read.pos + read_len, 'seq': seq, 'mate_start': read.mpos, 'rlen': read_len, 'cigar': read.cigar }
            else:
                results.append( [qname, read.pos, read.pos + read_len, seq, read.cigar] )
        # Take care of reads whose mates are out of range.
        for qname, read in paired_pending.iteritems():
            if read['mate_start'] < read['start']:
                # The mate comes first; assume it has the same length as this read.
                start = read['mate_start']
                end = read['end']
                r1 = [read['mate_start'], read['mate_start'] + read['rlen']]
                r2 = [read['start'], read['end'], read['seq']]
            else:
                start = read['start']
                end = read['mate_start'] + read['rlen']
                r1 = [read['start'], read['end'], read['seq']]
                r2 = [read['mate_start'], read['mate_start'] + read['rlen']]

            results.append( [ qname, start, end, read['seq'], read['cigar'], r1, r2 ] )

        bamfile.close()
        return { 'data': results, 'message': message }

class ArrayTreeDataProvider( TracksDataProvider ):
    """
    Array tree data provider for the Galaxy track browser.
    """
    def get_stats( self, chrom ):
        f = open( self.converted_dataset.file_name )
        d = FileArrayTreeDict( f )
        try:
            chrom_array_tree = d[chrom]
        except KeyError:
            f.close()
            return None

        root_summary = chrom_array_tree.get_summary( 0, chrom_array_tree.levels )

        level = chrom_array_tree.levels - 1
        desired_summary = chrom_array_tree.get_summary( 0, level )
        bs = chrom_array_tree.block_size ** level

        frequencies = map(int, desired_summary.frequencies)
        out = [ (i * bs, freq) for i, freq in enumerate(frequencies) ]

        f.close()
        return { 'max': float( max(root_summary.maxs) ),
                 'min': float( min(root_summary.mins) ),
                 'frequencies': out,
                 'total_frequency': sum(root_summary.frequencies) }

    # Return None instead of NaN to pass jQuery 1.4's strict JSON.
    def float_nan(self, n):
        if n != n: # NaN != NaN
            return None
        else:
            return float(n)

    def get_data( self, chrom, start, end, **kwargs ):
        if 'stats' in kwargs:
            return self.get_stats(chrom)

        f = open( self.converted_dataset.file_name )
        d = FileArrayTreeDict( f )

        # Get the right chromosome.
        try:
            chrom_array_tree = d[chrom]
        except KeyError:
            f.close()
            return None

        block_size = chrom_array_tree.block_size
        start = int( start )
        end = int( end )
        resolution = max(1, ceil(float(kwargs['resolution'])))

        level = int( floor( log( resolution, block_size ) ) )
        level = max( level, 0 )
        stepsize = block_size ** level
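        # E.g. (illustrative numbers) with block_size 1000 and resolution 50000,
        # level = floor( log( 50000, 1000 ) ) = 1 and stepsize = 1000, so each
        # returned point summarizes a run of 1000 base positions.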

        # Is the requested level valid?
        assert 0 <= level <= chrom_array_tree.levels

        results = []
        for block_start in range( start, end, stepsize * block_size ):
            # Return either data points or a summary, depending on the level.
            indexes = range( block_start, block_start + stepsize * block_size, stepsize )
            if level > 0:
                s = chrom_array_tree.get_summary( block_start, level )
                if s is not None:
                    results.extend( zip( indexes, map( self.float_nan, s.sums / s.counts ) ) )
            else:
                l = chrom_array_tree.get_leaf( block_start )
                if l is not None:
                    results.extend( zip( indexes, map( self.float_nan, l ) ) )

        f.close()
        return results

class IntervalIndexDataProvider( TracksDataProvider ):
    """
    Interval index data provider for the Galaxy track browser.

    Payload format: [ uid (offset), start, end, name, strand, thick_start, thick_end, blocks, score ]
    """

    col_name_data_attr_mapping = { 4 : { 'index': 8, 'name': 'Score' } }

    def get_data( self, chrom, start, end, **kwargs ):
        start, end = int(start), int(end)
        source = open( self.original_dataset.file_name )
        index = Indexes( self.converted_dataset.file_name )
        results = []
        count = 0
        message = None

        # If chrom is not found in indexes, try removing the first three
        # characters (e.g. 'chr') and see if that works. This enables the
        # provider to handle chromosome names defined as chrXXX and as XXX.
        chrom = str(chrom)
        if chrom not in index.indexes and chrom[3:] in index.indexes:
            chrom = chrom[3:]

        for start, end, offset in index.find(chrom, start, end):
            if count >= MAX_VALS:
                message = "Only the first %s features are being displayed." % MAX_VALS
                break
            count += 1
            source.seek(offset)
            feature = source.readline().split()
            payload = [ offset, start, end ]
            # TODO: can we use column metadata to fill out the payload?
            # TODO: use a function to set payload data.
            if "no_detail" not in kwargs:
                length = len(feature)
                if isinstance( self.original_dataset.datatype, Gff ):
                    # GFF dataset.
                    if length >= 3:
                        payload.append( feature[2] ) # Name.
                    if length >= 7:
                        payload.append( feature[6] ) # Strand.
                elif isinstance( self.original_dataset.datatype, Bed ):
                    # BED dataset.
                    if length >= 4:
                        payload.append( feature[3] ) # Name.
                    if length >= 6:
                        payload.append( feature[5] ) # Strand.

                    if length >= 8:
                        payload.append( int(feature[6]) ) # Thick start.
                        payload.append( int(feature[7]) ) # Thick end.

                    if length >= 12:
                        block_sizes = [ int(n) for n in feature[10].split(',') if n != '' ]
                        block_starts = [ int(n) for n in feature[11].split(',') if n != '' ]
                        blocks = zip( block_sizes, block_starts )
                        payload.append( [ ( start + block[1], start + block[1] + block[0] ) for block in blocks ] )
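                        # E.g. (illustrative BED12 values) block sizes "100,200"
                        # and block starts "0,500" on a feature starting at 1000
                        # yield blocks [ (1000, 1100), (1500, 1700) ].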

                    if length >= 5:
                        payload.append( int(feature[4]) ) # Score.

            results.append(payload)

        return { 'data': results, 'message': message }

#
# Helper methods.
#

# Mapping from dataset type name to a class that can fetch data from a file of
# that type. First key is the converted dataset type; if the result is another
# dict, the second key is the original dataset type. TODO: this needs to be
# more flexible.
dataset_type_name_to_data_provider = {
    "array_tree": ArrayTreeDataProvider,
    "interval_index": { "vcf": VcfDataProvider, "default" : IntervalIndexDataProvider },
    "bai": BamDataProvider,
    "summary_tree": SummaryTreeDataProvider
}

dataset_type_to_data_provider = {
    Vcf : VcfDataProvider,
}

def get_data_provider( name=None, original_dataset=None ):
    """
    Returns a data provider class by name and/or by original dataset.
    """
    data_provider = None
    if name:
        value = dataset_type_name_to_data_provider[ name ]
        if isinstance( value, dict ):
            # Get provider by dataset extension; if there is no provider for
            # the extension, get the default.
            data_provider = value.get( original_dataset.ext, value.get( "default" ) )
        else:
            data_provider = value
    elif original_dataset:
        # Look for data provider in mapping.
        data_provider = dataset_type_to_data_provider.get( original_dataset.datatype.__class__, None )
        if not data_provider:
            # Look up data provider from datatype's information.
            try:
                # Get data provider mapping and data provider for 'data'. If
                # a provider is available, use it; otherwise use the generic provider.
                _ , data_provider_mapping = original_dataset.datatype.get_track_type()
                data_provider_name = data_provider_mapping[ 'data' ]
                if data_provider_name:
                    data_provider = get_data_provider( name=data_provider_name, original_dataset=original_dataset )
                else:
                    data_provider = TracksDataProvider
            except:
                pass
    return data_provider
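
# A usage sketch (the dataset objects here are illustrative): for a VCF
# dataset converted to an interval index,
#
#   provider_class = get_data_provider( name="interval_index", original_dataset=dataset )
#   # => VcfDataProvider if dataset.ext == "vcf", else IntervalIndexDataProvider
#   provider = provider_class( converted_dataset=index_dataset, original_dataset=dataset )
#   chrom_data = provider.get_data( "chr1", 0, 100000 )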
---|