"""
Data providers for tracks visualizations.
"""

from math import floor, ceil, log, pow
# Resolve required egg distributions before importing from them.
import pkg_resources
pkg_resources.require( "bx-python" ); pkg_resources.require( "pysam" ); pkg_resources.require( "numpy" )
from bx.interval_index_file import Indexes
from bx.arrays.array_tree import FileArrayTreeDict
from galaxy.util.lrucache import LRUCache
from galaxy.visualization.tracks.summary import *
from galaxy.datatypes.tabular import Vcf
from galaxy.datatypes.interval import Bed, Gff
from pysam import csamtools

# Cap on the number of features a provider returns for one request.
MAX_VALS = 5000 # only display first MAX_VALS features
class TracksDataProvider( object ):
    """ Base class for tracks data providers. """

    # Mapping from column name to payload data; this mapping is used to create
    # filters. Key is column name, value is a dict with mandatory key 'index'
    # and optional key 'name'. E.g. this defines column 4:
    #
    #   col_name_data_attr_mapping = { 4 : { 'index': 5, 'name': 'Score' } }
    col_name_data_attr_mapping = {}

    def __init__( self, converted_dataset=None, original_dataset=None ):
        """ Create basic data provider. """
        self.converted_dataset = converted_dataset
        self.original_dataset = original_dataset

    def get_data( self, chrom, start, end, **kwargs ):
        """ Returns data in region defined by chrom, start, and end. """
        # Override in subclasses; the base provider returns no data.
        pass

    def get_filters( self ):
        """
        Returns filters for provider's data. Return value is a list of
        filters; each filter is a dictionary with the keys 'name', 'index', 'type'.
        NOTE: This method uses the original dataset's datatype and metadata to
        create the filters.
        """
        # Get column names; fall back to numeric indices when the datatype
        # does not declare names.
        try:
            column_names = self.original_dataset.datatype.column_names
        except AttributeError:
            try:
                column_names = range( self.original_dataset.metadata.columns )
            except Exception: # Give up -- no usable column information.
                return []

        # Dataset must have column types; if not, cannot create filters.
        try:
            column_types = self.original_dataset.metadata.column_types
        except AttributeError:
            return []

        # Create and return filters.
        filters = []
        if self.original_dataset.metadata.viz_filter_cols:
            for viz_col_index in self.original_dataset.metadata.viz_filter_cols:
                col_name = column_names[ viz_col_index ]
                # Make sure that column has a mapped index. If not, do not add filter.
                try:
                    attrs = self.col_name_data_attr_mapping[ col_name ]
                except KeyError:
                    continue
                # 'name' is optional in the mapping (see class comment);
                # fall back to the column name instead of raising KeyError.
                filters.append(
                    { 'name' : attrs.get( 'name', col_name ),
                      'type' : column_types[ viz_col_index ],
                      'index' : attrs[ 'index' ] } )
        return filters
| 76 | |
---|
class SummaryTreeDataProvider( TracksDataProvider ):
    """
    Summary tree data provider for the Galaxy track browser.
    """

    # Keep the 20 most recently accessed summary-tree indices for performance.
    CACHE = LRUCache( 20 )

    def get_summary( self, chrom, start, end, **kwargs ):
        """
        Return summarized data for the region at the level implied by
        kwargs['resolution']: None when detail data should be shown instead,
        the string "no_detail", or a (results, max, avg, delta) tuple.
        """
        path = self.converted_dataset.file_name
        tree = self.CACHE[path]
        if tree is None:
            tree = summary_tree_from_file( path )
            self.CACHE[path] = tree

        # If chrom is not found in the tree's blocks, try removing the first
        # three characters (e.g. 'chr') and see if that works. This enables
        # the provider to handle chrom names defined as chrXXX and as XXX.
        if chrom not in tree.chrom_blocks:
            stripped = chrom[3:]
            if stripped not in tree.chrom_blocks:
                return None
            chrom = stripped

        resolution = max( 1, ceil( float( kwargs['resolution'] ) ) )

        # Pick the summary level for this resolution; level 0 means the
        # caller should fetch detail data instead.
        level = int( max( ceil( log( resolution, tree.block_size ) ) - 1, 0 ) )
        if level <= 0:
            return None

        chrom_stats = tree.chrom_stats[chrom]
        results = tree.query( chrom, int( start ), int( end ), level )
        if results == "detail":
            return None
        if results == "draw" or level <= 1:
            return "no_detail"
        return ( results,
                 chrom_stats[level]["max"],
                 chrom_stats[level]["avg"],
                 chrom_stats[level]["delta"] )
| 116 | |
---|
class VcfDataProvider( TracksDataProvider ):
    """
    VCF data provider for the Galaxy track browser.

    Payload format:
    [ uid (offset), start, end, ID, reference base(s), alternate base(s), quality score]
    """

    col_name_data_attr_mapping = { 'Qual' : { 'index': 6 , 'name' : 'Qual' } }

    def get_data( self, chrom, start, end, **kwargs ):
        """
        Returns data in region defined by chrom, start, and end.

        Return value is a dict with keys 'data_type', 'data' (the payload
        list) and 'message' (set when results were truncated at MAX_VALS).
        """
        start, end = int(start), int(end)
        source = open( self.original_dataset.file_name )
        index = Indexes( self.converted_dataset.file_name )
        results = []
        count = 0
        message = None

        # If chrom is not found in indexes, try removing the first three
        # characters (e.g. 'chr') and see if that works. This enables the
        # provider to handle chrom names defined as chrXXX and as XXX.
        chrom = str(chrom)
        if chrom not in index.indexes and chrom[3:] in index.indexes:
            chrom = chrom[3:]

        try:
            for start, end, offset in index.find(chrom, start, end):
                if count >= MAX_VALS:
                    message = "Only the first %s features are being displayed." % MAX_VALS
                    break
                count += 1
                # The index stores the byte offset of each feature line.
                source.seek(offset)
                feature = source.readline().split()

                payload = [ offset, start, end,
                            # ID:
                            feature[2],
                            # reference base(s):
                            feature[3],
                            # alternative base(s)
                            feature[4],
                            # phred quality score
                            int( feature[5] ) ]
                results.append(payload)
        finally:
            # BUG FIX: the source file handle was previously never closed.
            source.close()

        return { 'data_type' : 'vcf', 'data': results, 'message': message }
| 163 | |
---|
class BamDataProvider( TracksDataProvider ):
    """
    Provides access to intervals from a sorted indexed BAM file.
    """
    def get_data( self, chrom, start, end, **kwargs ):
        """
        Fetch reads in the region defined by chrom, start, and end.

        Returns None if the chromosome cannot be found in the BAM file;
        otherwise a dict with keys 'data' and 'message'. Single reads are
        encoded as [ qname, start, end, seq, cigar ]; proper pairs as
        [ qname, start, end, seq, cigar, read1_info, read2_info ].
        """
        start, end = int(start), int(end)
        no_detail = "no_detail" in kwargs  # NOTE(review): computed but unused here; kept for parity with other providers.
        # Attempt to open the BAM file with index.
        bamfile = csamtools.Samfile( filename=self.original_dataset.file_name, mode='rb', index_filename=self.converted_dataset.file_name )
        message = None
        try:
            try:
                data = bamfile.fetch(start=start, end=end, reference=chrom)
            except ValueError:
                # Some BAM files do not prefix chromosome names with chr, try without.
                if chrom.startswith( 'chr' ):
                    try:
                        data = bamfile.fetch( start=start, end=end, reference=chrom[3:] )
                    except ValueError:
                        return None
                else:
                    return None
            # Encode reads as list of lists.
            results = []
            paired_pending = {}
            for read in data:
                # BUG FIX: was 'len(results) > MAX_VALS', which returned one
                # extra entry; '>=' matches the other providers' cap.
                if len(results) >= MAX_VALS:
                    message = "Only the first %s pairs are being displayed." % MAX_VALS
                    break
                qname = read.qname
                seq = read.seq
                read_len = sum( [cig[1] for cig in read.cigar] ) # Use cigar to determine length
                if read.is_proper_pair:
                    if qname in paired_pending: # one in dict is always first
                        pair = paired_pending[qname]
                        results.append( [ qname, pair['start'], read.pos + read_len, seq, read.cigar, [pair['start'], pair['end'], pair['seq']], [read.pos, read.pos + read_len, seq] ] )
                        del paired_pending[qname]
                    else:
                        paired_pending[qname] = { 'start': read.pos, 'end': read.pos + read_len, 'seq': seq, 'mate_start': read.mpos, 'rlen': read_len, 'cigar': read.cigar }
                else:
                    results.append( [qname, read.pos, read.pos + read_len, seq, read.cigar] )
            # Take care of reads whose mates are out of range.
            for qname, read in paired_pending.items():
                if read['mate_start'] < read['start']:
                    # Mate is the leftmost read of the pair.
                    start = read['mate_start']
                    end = read['end']
                    r1 = [read['mate_start'], read['mate_start'] + read['rlen']]
                    r2 = [read['start'], read['end'], read['seq']]
                else:
                    start = read['start']
                    end = read['mate_start'] + read['rlen']
                    r1 = [read['start'], read['end'], read['seq']]
                    r2 = [read['mate_start'], read['mate_start'] + read['rlen']]

                results.append( [ qname, start, end, read['seq'], read['cigar'], r1, r2 ] )
        finally:
            # BUG FIX: previously the file was not closed on the early
            # 'return None' paths; close it on every exit.
            bamfile.close()
        return { 'data': results, 'message': message }
| 224 | |
---|
| 225 | class ArrayTreeDataProvider( TracksDataProvider ): |
---|
| 226 | """ |
---|
| 227 | Array tree data provider for the Galaxy track browser. |
---|
| 228 | """ |
---|
| 229 | def get_stats( self, chrom ): |
---|
| 230 | f = open( self.converted_dataset.file_name ) |
---|
| 231 | d = FileArrayTreeDict( f ) |
---|
| 232 | try: |
---|
| 233 | chrom_array_tree = d[chrom] |
---|
| 234 | except KeyError: |
---|
| 235 | f.close() |
---|
| 236 | return None |
---|
| 237 | |
---|
| 238 | root_summary = chrom_array_tree.get_summary( 0, chrom_array_tree.levels ) |
---|
| 239 | |
---|
| 240 | level = chrom_array_tree.levels - 1 |
---|
| 241 | desired_summary = chrom_array_tree.get_summary( 0, level ) |
---|
| 242 | bs = chrom_array_tree.block_size ** level |
---|
| 243 | |
---|
| 244 | frequencies = map(int, desired_summary.frequencies) |
---|
| 245 | out = [ (i * bs, freq) for i, freq in enumerate(frequencies) ] |
---|
| 246 | |
---|
| 247 | f.close() |
---|
| 248 | return { 'max': float( max(root_summary.maxs) ), \ |
---|
| 249 | 'min': float( min(root_summary.mins) ), \ |
---|
| 250 | 'frequencies': out, \ |
---|
| 251 | 'total_frequency': sum(root_summary.frequencies) } |
---|
| 252 | |
---|
| 253 | # Return None instead of NaN to pass jQuery 1.4's strict JSON |
---|
| 254 | def float_nan(self, n): |
---|
| 255 | if n != n: # NaN != NaN |
---|
| 256 | return None |
---|
| 257 | else: |
---|
| 258 | return float(n) |
---|
| 259 | |
---|
| 260 | def get_data( self, chrom, start, end, **kwargs ): |
---|
| 261 | if 'stats' in kwargs: |
---|
| 262 | return self.get_stats(chrom) |
---|
| 263 | |
---|
| 264 | f = open( self.converted_dataset.file_name ) |
---|
| 265 | d = FileArrayTreeDict( f ) |
---|
| 266 | |
---|
| 267 | # Get the right chromosome |
---|
| 268 | try: |
---|
| 269 | chrom_array_tree = d[chrom] |
---|
| 270 | except: |
---|
| 271 | f.close() |
---|
| 272 | return None |
---|
| 273 | |
---|
| 274 | block_size = chrom_array_tree.block_size |
---|
| 275 | start = int( start ) |
---|
| 276 | end = int( end ) |
---|
| 277 | resolution = max(1, ceil(float(kwargs['resolution']))) |
---|
| 278 | |
---|
| 279 | level = int( floor( log( resolution, block_size ) ) ) |
---|
| 280 | level = max( level, 0 ) |
---|
| 281 | stepsize = block_size ** level |
---|
| 282 | |
---|
| 283 | # Is the requested level valid? |
---|
| 284 | assert 0 <= level <= chrom_array_tree.levels |
---|
| 285 | |
---|
| 286 | results = [] |
---|
| 287 | for block_start in range( start, end, stepsize * block_size ): |
---|
| 288 | # print block_start |
---|
| 289 | # Return either data point or a summary depending on the level |
---|
| 290 | indexes = range( block_start, block_start + stepsize * block_size, stepsize ) |
---|
| 291 | if level > 0: |
---|
| 292 | s = chrom_array_tree.get_summary( block_start, level ) |
---|
| 293 | if s is not None: |
---|
| 294 | results.extend( zip( indexes, map( self.float_nan, s.sums / s.counts ) ) ) |
---|
| 295 | else: |
---|
| 296 | l = chrom_array_tree.get_leaf( block_start ) |
---|
| 297 | if l is not None: |
---|
| 298 | results.extend( zip( indexes, map( self.float_nan, l ) ) ) |
---|
| 299 | |
---|
| 300 | f.close() |
---|
| 301 | return results |
---|
| 302 | |
---|
class IntervalIndexDataProvider( TracksDataProvider ):
    """
    Interval index data provider for the Galaxy track browser.

    Payload format: [ uid (offset), start, end, name, strand, thick_start, thick_end, blocks ]
    """

    col_name_data_attr_mapping = { 4 : { 'index': 8 , 'name' : 'Score' } }

    def get_data( self, chrom, start, end, **kwargs ):
        """
        Returns features in the region defined by chrom, start, and end as a
        dict with keys 'data' and 'message' (set when results are truncated
        at MAX_VALS). With 'no_detail' in kwargs only offsets/coords are
        returned.
        """
        start, end = int(start), int(end)
        source = open( self.original_dataset.file_name )
        index = Indexes( self.converted_dataset.file_name )
        results = []
        count = 0
        message = None

        # If chrom is not found in indexes, try removing the first three
        # characters (e.g. 'chr') and see if that works. This enables the
        # provider to handle chrom names defined as chrXXX and as XXX.
        chrom = str(chrom)
        if chrom not in index.indexes and chrom[3:] in index.indexes:
            chrom = chrom[3:]

        try:
            for start, end, offset in index.find(chrom, start, end):
                if count >= MAX_VALS:
                    message = "Only the first %s features are being displayed." % MAX_VALS
                    break
                count += 1
                # The index stores the byte offset of each feature line.
                source.seek(offset)
                feature = source.readline().split()
                payload = [ offset, start, end ]
                # TODO: can we use column metadata to fill out payload?
                # TODO: use function to set payload data
                if "no_detail" not in kwargs:
                    length = len(feature)
                    if isinstance( self.original_dataset.datatype, Gff ):
                        # GFF dataset.
                        if length >= 3:
                            payload.append( feature[2] ) # name
                        if length >= 7:
                            payload.append( feature[6] ) # strand
                    elif isinstance( self.original_dataset.datatype, Bed ):
                        # BED dataset.
                        if length >= 4:
                            payload.append(feature[3]) # name
                        if length >= 6: # strand
                            payload.append(feature[5])

                        if length >= 8:
                            # thickStart / thickEnd columns.
                            payload.append(int(feature[6]))
                            payload.append(int(feature[7]))

                        if length >= 12:
                            # blockSizes/blockStarts are comma-separated and
                            # may have a trailing comma, hence the filter.
                            block_sizes = [ int(n) for n in feature[10].split(',') if n != '']
                            block_starts = [ int(n) for n in feature[11].split(',') if n != '' ]
                            blocks = zip(block_sizes, block_starts)
                            payload.append( [ (start + block[1], start + block[1] + block[0]) for block in blocks] )

                        if length >= 5:
                            payload.append( int(feature[4]) ) # score

                results.append(payload)
        finally:
            # BUG FIX: the source file handle was previously never closed.
            source.close()

        return { 'data': results, 'message': message }
| 368 | |
---|
#
# Helper methods.
#

# Mapping from dataset type name to a class that can fetch data from a file of that
# type. First key is converted dataset type; if result is another dict, second key
# is original dataset type. TODO: This needs to be more flexible.
dataset_type_name_to_data_provider = {
    "array_tree": ArrayTreeDataProvider,
    "interval_index": { "vcf": VcfDataProvider, "default" : IntervalIndexDataProvider },
    "bai": BamDataProvider,
    "summary_tree": SummaryTreeDataProvider
}

# Mapping from the original dataset's datatype class to its data provider;
# consulted by get_data_provider when no converted type name is given.
dataset_type_to_data_provider = {
    Vcf : VcfDataProvider,
}
| 386 | |
---|
def get_data_provider( name=None, original_dataset=None ):
    """
    Returns data provider class by name and/or original dataset.

    Lookup order: (1) the converted dataset type `name`, (2) the original
    dataset's datatype class, (3) the datatype's track-type data provider
    mapping. Returns None when no provider can be determined.
    """
    data_provider = None
    if name:
        value = dataset_type_name_to_data_provider[ name ]
        if isinstance( value, dict ):
            # Get converter by dataset extension; if there is no data provider,
            # get the default.
            data_provider = value.get( original_dataset.ext, value.get( "default" ) )
        else:
            data_provider = value
    elif original_dataset:
        # Look for data provider in mapping.
        data_provider = dataset_type_to_data_provider.get( original_dataset.datatype.__class__, None )
        if not data_provider:
            # Look up data provider from datatype's information.
            try:
                # Get data provider mapping and data provider for 'data'. If
                # provider available, use it; otherwise use generic provider.
                _ , data_provider_mapping = original_dataset.datatype.get_track_type()
                data_provider_name = data_provider_mapping[ 'data' ]
                if data_provider_name:
                    data_provider = get_data_provider( name=data_provider_name, original_dataset=original_dataset )
                else:
                    data_provider = TracksDataProvider
            except Exception:
                # BUG FIX: was a bare 'except:' which also swallowed
                # SystemExit/KeyboardInterrupt. Keep the best-effort
                # "no provider found" semantics for ordinary errors only.
                pass
    return data_provider
| 417 | |
---|