root/galaxy-central/lib/galaxy/datatypes/tabular.py

リビジョン 2, 22.6 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1"""
2Tabular datatype
3
4"""
5import pkg_resources
6pkg_resources.require( "bx-python" )
7
8import logging
9import data
10from galaxy import util
11from cgi import escape
12from galaxy.datatypes import metadata
13from galaxy.datatypes.metadata import MetadataElement
14import galaxy_utils.sequence.vcf
15from sniff import *
16
17log = logging.getLogger(__name__)
18
class Tabular( data.Text ):
    """
    Tab delimited data.

    Detects column count / column type metadata and renders the HTML "peek"
    table; the row renderer (make_html_peek_rows) is reused by the
    subclasses in this module.
    """

    # Metadata elements
    MetadataElement( name="comment_lines", default=0, desc="Number of comment lines", readonly=False, optional=True, no_value=0 )
    MetadataElement( name="columns", default=0, desc="Number of columns", readonly=True, visible=False, no_value=0 )
    MetadataElement( name="column_types", default=[], desc="Column types", param=metadata.ColumnTypesParameter, readonly=True, visible=False, no_value=[] )

    def init_meta( self, dataset, copy_from=None ):
        """Initialize metadata by delegating to data.Text.init_meta."""
        data.Text.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite = True, skip = None, max_data_lines = None, **kwd ):
        """
        Tries to determine the number of columns as well as those columns
        that contain numerical values in the dataset.  A skip parameter is
        used because various tabular data types reuse this function, and
        their data type classes are responsible to determine how many invalid
        comment lines should be skipped. Using None for skip will cause skip
        to be zero, but the first line will be processed as a header. A
        max_data_lines parameter is used because various tabular data types
        reuse this function, and their data type classes are responsible to
        determine how many data lines should be processed to ensure that the
        non-optional metadata parameters are properly set; if used, optional
        metadata parameters will be set to None, unless the entire file has
        already been read. Using None (default) for max_data_lines will
        process all data lines.

        Items of interest:
        1. We treat 'overwrite' as always True (we always want to set tabular metadata when called).
        2. If a tabular file has no data, it will have one column of type 'str'.
        3. We used to check only the first 100 lines when setting metadata and this class's
           set_peek() method read the entire file to determine the number of lines in the file.
           Since metadata can now be processed on cluster nodes, we've merged the line count portion
           of the set_peek() processing here, and we now check the entire contents of the file.

        Sets dataset.metadata.data_lines, comment_lines, column_types and columns.
        """
        # Remember whether the caller asked for an explicit skip; None means
        # "treat the first line as a probable header".
        requested_skip = skip
        if skip is None:
            skip = 0
        # Order in which column types are tried: most specific first
        column_type_set_order = [ 'int', 'float', 'list', 'str' ]
        # Default column type is the most general one
        default_column_type = column_type_set_order[-1]
        # Order used to compare two types: most general first
        column_type_compare_order = list( column_type_set_order )
        column_type_compare_order.reverse()

        def type_overrules_type( column_type1, column_type2 ):
            """Return True if column_type1 is more general than column_type2 and should replace it."""
            if column_type1 is None or column_type1 == column_type2:
                return False
            if column_type2 is None:
                return True
            for column_type in column_type_compare_order:
                if column_type1 == column_type:
                    return True
                if column_type2 == column_type:
                    return False
            # Neither column type was found in our ordered list; this cannot
            # happen.  A real exception is raised because raising a plain
            # string is itself a TypeError on Python 2.6+.
            raise ValueError( "Tried to compare unknown column types" )

        def is_int( column_text ):
            try:
                int( column_text )
                return True
            except ( TypeError, ValueError ):
                return False

        def is_float( column_text ):
            try:
                float( column_text )
                return True
            except ( TypeError, ValueError ):
                # 'na' is special cased to be a float
                return column_text.strip().lower() == 'na'

        def is_list( column_text ):
            return "," in column_text

        def is_str( column_text ):
            # Anything, except an empty string, is a str
            return column_text != ""

        # Maps a column type name to the function that recognizes it
        is_column_type = { 'int': is_int, 'float': is_float, 'list': is_list, 'str': is_str }

        def guess_column_type( column_text ):
            """Return the first (most specific) type matching column_text, or None for an empty field."""
            for column_type in column_type_set_order:
                if is_column_type[column_type]( column_text ):
                    return column_type
            return None

        data_lines = 0
        comment_lines = 0
        column_types = []
        first_line_column_types = [default_column_type] # default value is one column of type str
        if dataset.has_data():
            # The with-block guarantees the handle is closed even if a line
            # fails to process.
            with open( dataset.file_name ) as dataset_fh:
                i = 0
                # readline() is used instead of iterating the file object so
                # that tell() below reports the real read position.
                while True:
                    line = dataset_fh.readline()
                    if not line:
                        break
                    line = line.rstrip( '\r\n' )
                    if i < skip or not line or line.startswith( '#' ):
                        # We'll call blank lines comments
                        comment_lines += 1
                    else:
                        data_lines += 1
                        fields = line.split( '\t' )
                        for field_count, field in enumerate( fields ):
                            if field_count >= len( column_types ):
                                # Found a previously unknown column, we append None
                                column_types.append( None )
                            column_type = guess_column_type( field )
                            if type_overrules_type( column_type, column_types[field_count] ):
                                column_types[field_count] = column_type
                        if i == 0 and requested_skip is None:
                            # This is our first line, people seem to like to upload files that have a header line, but do not
                            # start with '#' (i.e. all column types would then most likely be detected as str).  We will assume
                            # that the first line is always a header (this was previous behavior - it was always skipped).  When
                            # the requested skip is None, we only use the data from the first line if we have no other data for
                            # a column.  This is far from perfect, as
                            # 1,2,3 1.1     2.2     qwerty
                            # 0     0               1,2,3
                            # will be detected as
                            # "column_types": ["int", "int", "float", "list"]
                            # instead of
                            # "column_types": ["list", "float", "float", "str"]  *** would seem to be the 'Truth' by manual
                            # observation that the first line should be included as data.  The old method would have detected as
                            # "column_types": ["int", "int", "str", "list"]
                            first_line_column_types = column_types
                            column_types = [ None for col in first_line_column_types ]
                    if max_data_lines is not None and data_lines >= max_data_lines:
                        if dataset_fh.tell() != dataset.get_size():
                            # We stopped before the end of the file, so the
                            # optional counts are unknown; additional comment
                            # lines could appear below this point.
                            data_lines = None
                            comment_lines = None
                        break
                    i += 1

        # We err on the larger number of columns:
        # first we pad our column_types by using data from the first line
        if len( first_line_column_types ) > len( column_types ):
            column_types.extend( first_line_column_types[ len( column_types ): ] )
        # Now we fill any unknown (None) column_types with data from the first line
        for i, column_type in enumerate( column_types ):
            if column_type is None:
                if len( first_line_column_types ) <= i or first_line_column_types[i] is None:
                    column_types[i] = default_column_type
                else:
                    column_types[i] = first_line_column_types[i]
        # Set the discovered metadata values for the dataset
        dataset.metadata.data_lines = data_lines
        dataset.metadata.comment_lines = comment_lines
        dataset.metadata.column_types = column_types
        dataset.metadata.columns = len( column_types )

    def make_html_table( self, dataset, skipchars=[] ):
        """
        Create HTML table, used for displaying peek.

        The header row numbers the columns 1..dataset.metadata.columns.  Any
        failure is reported as a "Can't create peek" string instead of being
        raised.
        """
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            out.append( '<tr>' )
            # Generate column header
            for i in range( 1, dataset.metadata.columns+1 ):
                out.append( '<th>%s</th>' % str( i ) )
            out.append( '</tr>' )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % str( exc )
        return out

    def make_html_peek_rows( self, dataset, skipchars=[] ):
        """
        Return the <tr> rows of the peek table as a single string.

        Lines starting with any prefix in skipchars, and lines whose field
        count differs from dataset.metadata.columns, are rendered as
        full-width comment rows placed before the next valid data row (or at
        the end if no data row follows).  All cell text is HTML-escaped.
        """
        out = []
        comments = []

        def flush_comments():
            # Emit every pending comment as a full-width row, oldest first
            for comment_line in comments:
                out.append( '<tr><td colspan="100%">' )
                out.append( '%s</td></tr>' % escape( comment_line ) )
            del comments[:]

        if not dataset.peek:
            dataset.set_peek()
        # str.startswith accepts a tuple of prefixes
        skip_prefixes = tuple( skipchars )
        for line in dataset.peek.splitlines():
            if not line:
                continue
            if line.startswith( skip_prefixes ):
                comments.append( line )
                continue
            elems = line.split( '\t' )
            if len( elems ) != dataset.metadata.columns:
                # We may have an invalid comment line or invalid data
                comments.append( line )
                continue
            flush_comments()
            out.append( '<tr>' )
            for elem in elems: # valid data
                out.append( '<td>%s</td>' % escape( elem ) )
            out.append( '</tr>' )
        # Peek may consist only of comments
        flush_comments()
        return "".join( out )

    def set_peek( self, dataset, line_count=None, is_multi_byte=False ):
        """Set the peek via data.Text.set_peek and append the comment-line count to the blurb when it is non-zero."""
        data.Text.set_peek( self, dataset, line_count=line_count, is_multi_byte=is_multi_byte )
        if dataset.metadata.comment_lines:
            dataset.blurb = "%s, %s comments" % ( dataset.blurb, util.commaify( str( dataset.metadata.comment_lines ) ) )

    def display_peek( self, dataset ):
        """Returns formatted html of peek"""
        return self.make_html_table( dataset )

    def displayable( self, dataset ):
        """Return a truthy value only if the dataset has data, is in the OK state and has at least one column and one data line."""
        try:
            return dataset.has_data() \
                and dataset.state == dataset.states.OK \
                and dataset.metadata.columns > 0 \
                and dataset.metadata.data_lines > 0
        except Exception:
            # Missing or incomparable metadata means the dataset cannot be displayed
            return False

    def as_gbrowse_display_file( self, dataset, **kwd ):
        """Return an open file handle on the dataset's file; this method does not close it."""
        return open( dataset.file_name )

    def as_ucsc_display_file( self, dataset, **kwd ):
        """Return an open file handle on the dataset's file; this method does not close it."""
        return open( dataset.file_name )
245
class Taxonomy( Tabular ):
    """Tabular data whose leading columns are the named taxonomy ranks listed in column_names."""

    def __init__(self, **kwd):
        """Initialize taxonomy datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['Name', 'TaxId', 'Root', 'Superkingdom', 'Kingdom', 'Subkingdom',
                             'Superphylum', 'Phylum', 'Subphylum', 'Superclass', 'Class', 'Subclass',
                             'Superorder', 'Order', 'Suborder', 'Superfamily', 'Family', 'Subfamily',
                             'Tribe', 'Subtribe', 'Genus', 'Subgenus', 'Species', 'Subspecies'
                             ]

    def make_html_table( self, dataset, skipchars=[] ):
        """
        Create HTML table, used for displaying peek.

        The header row labels the known columns as "<n>.<name>" and numbers
        any additional columns found in the dataset.  Any failure is reported
        as a "Can't create peek" string instead of being raised.
        """
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # Generate column header
            out.append( '<tr>' )
            for i, name in enumerate( self.column_names ):
                out.append( '<th>%s.%s</th>' % ( str( i+1 ), name ) )
            # This data type requires at least 24 columns in the data; number
            # any columns beyond the named ones (the range is empty when
            # there are none).
            for i in range( len( self.column_names ), dataset.metadata.columns ):
                out.append( '<th>%s</th>' % str( i+1 ) )
            # Always close the header row, not only when extra columns exist
            out.append( '</tr>' )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % exc
        return out
275
class Sam( Tabular ):
    """Tab delimited data in SAM format."""
    file_ext = 'sam'

    def __init__(self, **kwd):
        """Initialize sam datatype"""
        Tabular.__init__( self, **kwd )
        self.column_names = ['QNAME', 'FLAG', 'RNAME', 'POS', 'MAPQ', 'CIGAR',
                             'MRNM', 'MPOS', 'ISIZE', 'SEQ', 'QUAL', 'OPT'
                             ]

    def make_html_table( self, dataset, skipchars=[] ):
        """
        Create HTML table, used for displaying peek.

        The header row labels the known columns as "<n>.<name>" and numbers
        any additional columns found in the dataset.  Any failure is reported
        as a "Can't create peek" string instead of being raised.
        """
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # Generate column header
            out.append( '<tr>' )
            for i, name in enumerate( self.column_names ):
                out.append( '<th>%s.%s</th>' % ( str( i+1 ), name ) )
            # This data type requires at least 11 columns in the data; number
            # any columns beyond the named ones (the range is empty when
            # there are none).
            for i in range( len( self.column_names ), dataset.metadata.columns ):
                out.append( '<th>%s</th>' % str( i+1 ) )
            # Always close the header row, not only when extra columns exist
            out.append( '</tr>' )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % exc
        return out

    def sniff( self, filename ):
        """
        Determines whether the file is in SAM format

        A file in SAM format consists of lines of tab-separated data.
        The following header line may be the first line:
        @QNAME  FLAG    RNAME   POS     MAPQ    CIGAR   MRNM    MPOS    ISIZE   SEQ     QUAL
        or
        @QNAME  FLAG    RNAME   POS     MAPQ    CIGAR   MRNM    MPOS    ISIZE   SEQ     QUAL    OPT
        Data in the OPT column is optional and can consist of tab-separated data

        For complete details see http://samtools.sourceforge.net/SAM1.pdf

        Rules for sniffing as True:
            There must be 11 or more columns of data on each line
            Columns 2 (FLAG), 4(POS), 5 (MAPQ), 8 (MPOS), and 9 (ISIZE) must be numbers (9 can be negative)
            We will only check that up to the first 5 alignments are correctly formatted.

        Reading stops at the first blank line, which is treated like end of
        file.  Any error while opening or reading the file yields False.

        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Sam().sniff( fname )
        False
        >>> fname = get_test_fname( '1.sam' )
        >>> Sam().sniff( fname )
        True
        """
        # 0-based indexes of FLAG, POS, MAPQ, MPOS and ISIZE
        numeric_columns = ( 1, 3, 4, 7, 8 )
        try:
            # The with-block closes the file on every exit path, including
            # the early returns below.
            with open( filename ) as fh:
                count = 0
                for raw_line in fh:
                    line = raw_line.strip()
                    if not line:
                        break # blank line or EOF
                    if line[0] == '@':
                        continue # header line
                    linePieces = line.split( '\t' )
                    if len( linePieces ) < 11:
                        return False
                    try:
                        for column in numeric_columns:
                            int( linePieces[column] )
                    except ValueError:
                        return False
                    count += 1
                    if count == 5:
                        return True
            # Fewer than 5 alignments were available: accept if at least one was valid
            return count > 0
        except Exception:
            # Sniffers must never raise; an unreadable file is simply not SAM
            return False
358
class Pileup( Tabular ):
    """Tab delimited data in pileup (6- or 10-column) format"""
    file_ext = "pileup"

    # Metadata elements
    MetadataElement( name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter )
    MetadataElement( name="startCol", default=2, desc="Start column", param=metadata.ColumnParameter )
    MetadataElement( name="baseCol", default=3, desc="Reference base column", param=metadata.ColumnParameter )

    def init_meta( self, dataset, copy_from=None ):
        """Initialize metadata by delegating to Tabular.init_meta."""
        Tabular.init_meta( self, dataset, copy_from=copy_from )

    def set_peek( self, dataset, line_count=None, is_multi_byte=False ):
        """
        Set the peek and blurb text.

        The blurb reports line_count when given, otherwise the data_lines
        metadata value, otherwise "?".  A purged dataset gets placeholder
        peek and blurb text instead.
        """
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s genomic coordinates" % util.commaify( str( dataset.metadata.data_lines ) )
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata
                    dataset.blurb = "? genomic coordinates"
            else:
                dataset.blurb = "%s genomic coordinates" % util.commaify( str( line_count ) )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def make_html_table( self, dataset, skipchars=[] ):
        """
        Create HTML table, used for displaying peek.

        The header row numbers every column and tags the chrom, start and
        reference base columns named by the dataset's metadata.  Any failure
        is reported as a "Can't create peek" string instead of being raised.
        """
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # Generate column header
            out.append('<tr>')
            for i in range( 1, dataset.metadata.columns+1 ):
                if i == dataset.metadata.chromCol:
                    out.append( '<th>%s.Chrom</th>' % i )
                elif i == dataset.metadata.startCol:
                    out.append( '<th>%s.Start</th>' % i )
                elif i == dataset.metadata.baseCol:
                    out.append( '<th>%s.Base</th>' % i )
                else:
                    out.append( '<th>%s</th>' % i )
            out.append('</tr>')
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % str( exc )
        return out

    def repair_methods( self, dataset ):
        """Return options for removing errors along with a description"""
        return [ ("lines", "Remove erroneous lines") ]

    def sniff( self, filename ):
        """
        Checks for 'pileup-ness'

        There are two main types of pileup: 6-column and 10-column. For both,
        the first three and last two columns are the same. We only check the
        first three to allow for some personalization of the format.

        >>> fname = get_test_fname( 'interval.interval' )
        >>> Pileup().sniff( fname )
        False
        >>> fname = get_test_fname( '6col.pileup' )
        >>> Pileup().sniff( fname )
        True
        >>> fname = get_test_fname( '10col.pileup' )
        >>> Pileup().sniff( fname )
        True
        """
        valid_bases = ( 'A', 'C', 'G', 'T', 'N', 'a', 'c', 'g', 't', 'n' )
        headers = get_headers( filename, '\t' )
        try:
            for hdr in headers:
                if hdr and not hdr[0].startswith( '#' ):
                    if len( hdr ) < 3:
                        return False
                    # chrom start in column 1 (with 0-based columns)
                    try:
                        int( hdr[1] )
                    except ( TypeError, ValueError ):
                        return False
                    # Reference base is in column 2.  This is an explicit
                    # test rather than an assert, because asserts are
                    # stripped when Python runs with -O.
                    if hdr[2] not in valid_bases:
                        return False
            return True
        except Exception:
            # Sniffers must never raise; malformed headers are simply not pileup
            return False
451
class Eland( Tabular ):
    """Tabular datatype registered under the 'eland' file extension."""
    file_ext = 'eland'

    def sniff( self, filename ):
        """Never auto-detect this format: always returns False."""
        return False
457
class ElandMulti( Tabular ):
    """Tabular datatype registered under the 'elandmulti' file extension."""
    file_ext = 'elandmulti'

    def sniff( self, filename ):
        """Never auto-detect this format: always returns False."""
        return False
463       
class Vcf( Tabular ):
    """ Variant Call Format for describing SNPs and other simple genome variations. """

    file_ext = 'vcf'
    column_names = [ 'Chrom', 'Pos', 'ID', 'Ref', 'Alt', 'Qual', 'Filter', 'Info', 'Format', 'data' ]

    MetadataElement( name="columns", default=10, desc="Number of columns", readonly=True, visible=False )
    MetadataElement( name="column_types", default=['str','int','str','str','str','int','str','list','str','str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
    MetadataElement( name="viz_filter_cols", default=[5], param=metadata.ColumnParameter, multiple=True )

    def sniff( self, filename ):
        """
        Return True if the VCF reader can read and parse the whole file.

        Any error raised while opening or parsing yields False.
        """
        try:
            # The with-block closes the file whether or not parsing succeeds.
            with open( filename ) as fh:
                # If reader can read and parse file, it's VCF.  Records are
                # consumed one at a time instead of being collected in a list.
                for line in galaxy_utils.sequence.vcf.Reader( fh ):
                    pass
            return True
        except Exception:
            return False

    def make_html_table( self, dataset, skipchars=[] ):
        """
        Create HTML table, used for displaying peek.

        The header row labels the columns as "<n>.<name>" from column_names.
        Any failure is reported as a "Can't create peek" string instead of
        being raised.
        """
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            # Generate column header
            out.append( '<tr>' )
            for i, name in enumerate( self.column_names ):
                out.append( '<th>%s.%s</th>' % ( str( i+1 ), name ) )
            # Close the header row before the data rows start
            out.append( '</tr>' )
            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % exc
        return out

    def get_track_type( self ):
        """Return a ( track type, dict ) pair; the dict holds the 'data' and 'index' entries."""
        return "FeatureTrack", {"data": "interval_index", "index": "summary_tree"}
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。