root/galaxy-central/lib/galaxy/visualization/tracks/data_providers.py

Revision 2, 16.3 KB (committer: hatakeyama, 14 years ago)

import galaxy-central

1"""
2Data providers for tracks visualizations.
3"""
4
5from math import floor, ceil, log, pow
6import pkg_resources
7pkg_resources.require( "bx-python" ); pkg_resources.require( "pysam" ); pkg_resources.require( "numpy" )
8from bx.interval_index_file import Indexes
9from bx.arrays.array_tree import FileArrayTreeDict
10from galaxy.util.lrucache import LRUCache
11from galaxy.visualization.tracks.summary import *
12from galaxy.datatypes.tabular import Vcf
13from galaxy.datatypes.interval import Bed, Gff
14from pysam import csamtools
15
16MAX_VALS = 5000 # only display first MAX_VALS features
17
class TracksDataProvider( object ):
    """
    Base class for tracks data providers.

    col_name_data_attr_mapping maps a column name to payload data; this
    mapping is used to create filters. Key is the column name, value is a
    dict with mandatory key 'index' and optional key 'name'. E.g. this
    defines column 4:

        col_name_data_attr_mapping = { 4 : { 'index': 5, 'name': 'Score' } }
    """
    col_name_data_attr_mapping = {}

    def __init__( self, converted_dataset=None, original_dataset=None ):
        """ Create basic data provider. """
        self.converted_dataset = converted_dataset
        self.original_dataset = original_dataset

    def get_data( self, chrom, start, end, **kwargs ):
        """ Returns data in region defined by chrom, start, and end. """
        # Override in subclasses.
        pass

    def get_filters( self ):
        """
        Returns filters for the provider's data. The return value is a list
        of filters; each filter is a dictionary with the keys 'name',
        'index', and 'type'. NOTE: This method uses the original dataset's
        datatype and metadata to create the filters.
        """
        # Get column names.
        try:
            column_names = self.original_dataset.datatype.column_names
        except AttributeError:
            try:
                column_names = range( self.original_dataset.metadata.columns )
            except: # Give up.
                return []

        # Dataset must have column types; if not, cannot create filters.
        try:
            column_types = self.original_dataset.metadata.column_types
        except AttributeError:
            return []

        # Create and return filters.
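        # Each filter is a dict; e.g. for a VCF dataset with a 'Qual' column,
        # the return value might look like (illustrative):
        #     [ { 'name' : 'Qual', 'index' : 6, 'type' : 'int' } ]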
        filters = []
        if self.original_dataset.metadata.viz_filter_cols:
            for viz_col_index in self.original_dataset.metadata.viz_filter_cols:
                col_name = column_names[ viz_col_index ]
                # Make sure that the column has a mapped index. If not, do not add a filter.
                try:
                    attrs = self.col_name_data_attr_mapping[ col_name ]
                except KeyError:
                    continue
                filters.append( { 'name' : attrs[ 'name' ],
                                  'type' : column_types[ viz_col_index ],
                                  'index' : attrs[ 'index' ] } )
        return filters

class SummaryTreeDataProvider( TracksDataProvider ):
    """
    Summary tree data provider for the Galaxy track browser.
    """

    CACHE = LRUCache(20) # Store 20 recently accessed indices for performance.

    def get_summary( self, chrom, start, end, **kwargs ):
        filename = self.converted_dataset.file_name
        st = self.CACHE[filename]
        if st is None:
            st = summary_tree_from_file( self.converted_dataset.file_name )
            self.CACHE[filename] = st

        # If chrom is not found in blocks, try removing the first three
        # characters (e.g. 'chr') and see if that works. This enables the
        # provider to handle chromosome names defined as chrXXX and as XXX.
        if chrom in st.chrom_blocks:
            pass
        elif chrom[3:] in st.chrom_blocks:
            chrom = chrom[3:]
        else:
            return None

        resolution = max(1, ceil(float(kwargs['resolution'])))
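
        # Pick a summary level from the requested resolution; level 0 holds
        # leaf data, so if the finest level is selected return None and the
        # caller will fetch the raw data instead.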
        level = ceil( log( resolution, st.block_size ) ) - 1
        level = int( max( level, 0 ) )
        if level <= 0:
            return None
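
        # Query the summary tree; callers get None (fetch detail data),
        # "no_detail" (draw without per-feature detail), or a tuple of
        # (summary values, max, avg, delta) for the chosen level.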
        stats = st.chrom_stats[chrom]
        results = st.query(chrom, int(start), int(end), level)
        if results == "detail":
            return None
        elif results == "draw" or level <= 1:
            return "no_detail"
        else:
            return results, stats[level]["max"], stats[level]["avg"], stats[level]["delta"]

class VcfDataProvider( TracksDataProvider ):
    """
    VCF data provider for the Galaxy track browser.

    Payload format:
    [ uid (offset), start, end, ID, reference base(s), alternate base(s), quality score ]
    """

    col_name_data_attr_mapping = { 'Qual' : { 'index': 6, 'name': 'Qual' } }
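
    # A typical payload row might look like (illustrative values):
    #     [ 1024, 10144, 10145, 'rs12345', 'A', 'G', 29 ]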

    def get_data( self, chrom, start, end, **kwargs ):
        """ Returns data in region defined by chrom, start, and end. """
        start, end = int(start), int(end)
        source = open( self.original_dataset.file_name )
        index = Indexes( self.converted_dataset.file_name )
        results = []
        count = 0
        message = None

        # If chrom is not found in the index, try removing the first three
        # characters (e.g. 'chr') and see if that works. This enables the
        # provider to handle chromosome names defined as chrXXX and as XXX.
        chrom = str(chrom)
        if chrom not in index.indexes and chrom[3:] in index.indexes:
            chrom = chrom[3:]
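
        # Each hit from index.find() yields the byte offset of the matching
        # line in the original VCF file; seek to it and parse that line.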

        for start, end, offset in index.find(chrom, start, end):
            if count >= MAX_VALS:
                message = "Only the first %s features are being displayed." % MAX_VALS
                break
            count += 1
            source.seek(offset)
            feature = source.readline().split()

            payload = [ offset, start, end,
                        # ID:
                        feature[2],
                        # Reference base(s):
                        feature[3],
                        # Alternate base(s):
                        feature[4],
                        # Phred quality score:
                        int( feature[5] ) ]
            results.append(payload)

        source.close()
        return { 'data_type' : 'vcf', 'data': results, 'message': message }

class BamDataProvider( TracksDataProvider ):
    """
    Provides access to intervals from a sorted, indexed BAM file.
    """
    def get_data( self, chrom, start, end, **kwargs ):
        """
        Fetch intervals in the region.
        """
        start, end = int(start), int(end)
        no_detail = "no_detail" in kwargs
        # Attempt to open the BAM file with its index.
        bamfile = csamtools.Samfile( filename=self.original_dataset.file_name, mode='rb',
                                     index_filename=self.converted_dataset.file_name )
        message = None
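        # Fetch reads overlapping the requested region; pysam raises
        # ValueError when the reference name is unknown to the BAM header.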
        try:
            data = bamfile.fetch(start=start, end=end, reference=chrom)
        except ValueError:
            # Some BAM files do not prefix chromosome names with 'chr'; try without it.
            if chrom.startswith( 'chr' ):
                try:
                    data = bamfile.fetch( start=start, end=end, reference=chrom[3:] )
                except ValueError:
                    return None
            else:
                return None
        # Encode reads as a list of lists.
        results = []
        paired_pending = {}
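        # paired_pending maps read name -> info for the first mate seen; when
        # the second mate arrives, a single paired-end record is emitted and
        # the pending entry is dropped.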
        for read in data:
            if len(results) > MAX_VALS:
                message = "Only the first %s pairs are being displayed." % MAX_VALS
                break
            qname = read.qname
            seq = read.seq
            read_len = sum( [cig[1] for cig in read.cigar] ) # Use the cigar to determine read length.
            if read.is_proper_pair:
                if qname in paired_pending: # The mate already in the dict is always the first of the pair.
                    pair = paired_pending[qname]
                    results.append( [ qname, pair['start'], read.pos + read_len, seq, read.cigar,
                                      [pair['start'], pair['end'], pair['seq']],
                                      [read.pos, read.pos + read_len, seq] ] )
                    del paired_pending[qname]
                else:
                    paired_pending[qname] = { 'start': read.pos, 'end': read.pos + read_len, 'seq': seq,
                                              'mate_start': read.mpos, 'rlen': read_len, 'cigar': read.cigar }
            else:
                results.append( [qname, read.pos, read.pos + read_len, seq, read.cigar] )
        # Take care of reads whose mates are out of range.
        for qname, read in paired_pending.iteritems():
            if read['mate_start'] < read['start']:
                start = read['mate_start']
                end = read['end']
                r1 = [read['mate_start'], read['mate_start'] + read['rlen']]
                r2 = [read['start'], read['end'], read['seq']]
            else:
                start = read['start']
                end = read['mate_start'] + read['rlen']
                r1 = [read['start'], read['end'], read['seq']]
                r2 = [read['mate_start'], read['mate_start'] + read['rlen']]

            results.append( [ qname, start, end, read['seq'], read['cigar'], r1, r2 ] )

        bamfile.close()
        return { 'data': results, 'message': message }

class ArrayTreeDataProvider( TracksDataProvider ):
    """
    Array tree data provider for the Galaxy track browser.
    """
    def get_stats( self, chrom ):
        f = open( self.converted_dataset.file_name )
        d = FileArrayTreeDict( f )
        try:
            chrom_array_tree = d[chrom]
        except KeyError:
            f.close()
            return None

        root_summary = chrom_array_tree.get_summary( 0, chrom_array_tree.levels )

        level = chrom_array_tree.levels - 1
        desired_summary = chrom_array_tree.get_summary( 0, level )
        bs = chrom_array_tree.block_size ** level
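
        # Pair each summary bin with its genomic start position (bin index
        # times the span covered by one block at this level).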
        frequencies = map(int, desired_summary.frequencies)
        out = [ (i * bs, freq) for i, freq in enumerate(frequencies) ]

        f.close()
        return { 'max': float( max(root_summary.maxs) ),
                 'min': float( min(root_summary.mins) ),
                 'frequencies': out,
                 'total_frequency': sum(root_summary.frequencies) }

    # Return None instead of NaN to pass jQuery 1.4's strict JSON parsing.
    def float_nan( self, n ):
        if n != n: # NaN != NaN
            return None
        else:
            return float(n)

    def get_data( self, chrom, start, end, **kwargs ):
        if 'stats' in kwargs:
            return self.get_stats(chrom)

        f = open( self.converted_dataset.file_name )
        d = FileArrayTreeDict( f )

        # Get the right chromosome.
        try:
            chrom_array_tree = d[chrom]
        except KeyError:
            f.close()
            return None

        block_size = chrom_array_tree.block_size
        start = int( start )
        end = int( end )
        resolution = max(1, ceil(float(kwargs['resolution'])))

        level = int( floor( log( resolution, block_size ) ) )
        level = max( level, 0 )
        stepsize = block_size ** level
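        # stepsize is the genomic span covered by one value at this level;
        # each loop iteration below covers one full block of block_size values.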

        # Is the requested level valid?
        assert 0 <= level <= chrom_array_tree.levels

        results = []
        for block_start in range( start, end, stepsize * block_size ):
            # Return either data points or a summary, depending on the level.
            indexes = range( block_start, block_start + stepsize * block_size, stepsize )
            if level > 0:
                s = chrom_array_tree.get_summary( block_start, level )
                if s is not None:
                    results.extend( zip( indexes, map( self.float_nan, s.sums / s.counts ) ) )
            else:
                l = chrom_array_tree.get_leaf( block_start )
                if l is not None:
                    results.extend( zip( indexes, map( self.float_nan, l ) ) )

        f.close()
        return results

class IntervalIndexDataProvider( TracksDataProvider ):
    """
    Interval index data provider for the Galaxy track browser.

    Payload format:
    [ uid (offset), start, end, name, strand, thick_start, thick_end, blocks, score ]
    """

    col_name_data_attr_mapping = { 4 : { 'index': 8, 'name': 'Score' } }
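
    # For a full 12-column BED feature the payload might look like
    # (illustrative values):
    #     [ 2048, 1000, 5000, 'uc001aaa.3', '+', 1200, 4800, [(1000, 1500), (4500, 5000)], 0 ]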

    def get_data( self, chrom, start, end, **kwargs ):
        start, end = int(start), int(end)
        source = open( self.original_dataset.file_name )
        index = Indexes( self.converted_dataset.file_name )
        results = []
        count = 0
        message = None

        # If chrom is not found in the index, try removing the first three
        # characters (e.g. 'chr') and see if that works. This enables the
        # provider to handle chromosome names defined as chrXXX and as XXX.
        chrom = str(chrom)
        if chrom not in index.indexes and chrom[3:] in index.indexes:
            chrom = chrom[3:]

        for start, end, offset in index.find(chrom, start, end):
            if count >= MAX_VALS:
                message = "Only the first %s features are being displayed." % MAX_VALS
                break
            count += 1
            source.seek(offset)
            feature = source.readline().split()
            payload = [ offset, start, end ]
            # TODO: can we use column metadata to fill out the payload?
            # TODO: use a function to set payload data.
            if "no_detail" not in kwargs:
                length = len(feature)
                if isinstance( self.original_dataset.datatype, Gff ):
                    # GFF dataset.
                    if length >= 3:
                        payload.append( feature[2] ) # name
                    if length >= 7:
                        payload.append( feature[6] ) # strand
                elif isinstance( self.original_dataset.datatype, Bed ):
                    # BED dataset.
                    if length >= 4:
                        payload.append( feature[3] ) # name
                    if length >= 6:
                        payload.append( feature[5] ) # strand

                    if length >= 8:
                        payload.append( int(feature[6]) ) # thick_start
                        payload.append( int(feature[7]) ) # thick_end

                    if length >= 12:
                        block_sizes = [ int(n) for n in feature[10].split(',') if n != '' ]
                        block_starts = [ int(n) for n in feature[11].split(',') if n != '' ]
                        blocks = zip(block_sizes, block_starts)
                        payload.append( [ (start + block[1], start + block[1] + block[0]) for block in blocks ] )

                    if length >= 5:
                        payload.append( int(feature[4]) ) # score

            results.append(payload)

        source.close()
        return { 'data': results, 'message': message }

#
# Helper methods.
#

# Mapping from dataset type name to a class that can fetch data from a file of
# that type. First key is the converted dataset type; if the result is another
# dict, the second key is the original dataset type. TODO: This needs to be
# more flexible.
dataset_type_name_to_data_provider = {
    "array_tree": ArrayTreeDataProvider,
    "interval_index": { "vcf": VcfDataProvider, "default": IntervalIndexDataProvider },
    "bai": BamDataProvider,
    "summary_tree": SummaryTreeDataProvider
}

dataset_type_to_data_provider = {
    Vcf : VcfDataProvider,
}

def get_data_provider( name=None, original_dataset=None ):
    """
    Returns a data provider class by name and/or by original dataset.
    """
    data_provider = None
    if name:
        value = dataset_type_name_to_data_provider[ name ]
        if isinstance( value, dict ):
            # Get the provider for the dataset's extension; if there is no
            # provider for the extension, use the default.
            data_provider = value.get( original_dataset.ext, value.get( "default" ) )
        else:
            data_provider = value
    elif original_dataset:
        # Look for a data provider in the datatype mapping.
        data_provider = dataset_type_to_data_provider.get( original_dataset.datatype.__class__, None )
        if not data_provider:
            # Look up the data provider from the datatype's information.
            try:
                # Get the data provider mapping and the provider for 'data'. If a
                # provider is available, use it; otherwise use the generic provider.
                _, data_provider_mapping = original_dataset.datatype.get_track_type()
                data_provider_name = data_provider_mapping[ 'data' ]
                if data_provider_name:
                    data_provider = get_data_provider( name=data_provider_name, original_dataset=original_dataset )
                else:
                    data_provider = TracksDataProvider
            except:
                pass
    return data_provider
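
# Example usage (illustrative; assumes `dataset` is a VCF dataset and
# `converted` is its interval_index-converted form). An interval_index
# conversion of a VCF dataset resolves to VcfDataProvider, while other
# interval_index datasets fall back to IntervalIndexDataProvider:
#
#     provider_class = get_data_provider( name="interval_index", original_dataset=dataset )
#     provider = provider_class( converted_dataset=converted, original_dataset=dataset )
#     chrom_data = provider.get_data( "chr1", 0, 100000 )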