root/galaxy-central/lib/galaxy/datatypes/data.py @ 2

Revision 2, 21.6 KB (committer: hatakeyama, 14 years ago)

import galaxy-central

import logging, os, sys, time, tempfile
from galaxy import util
from galaxy.util.odict import odict
from galaxy.util.bunch import Bunch
from cgi import escape
import metadata
import zipfile
from metadata import MetadataElement #import directly to maintain ease of use in Datatype class definitions

log = logging.getLogger(__name__)

# Valid first column and strand column values for bed, other formats
col1_startswith = ['chr', 'chl', 'groupun', 'reftig_', 'scaffold', 'super_', 'vcho']
valid_strand = ['+', '-', '.']
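
# Illustrative sketch (not part of this module's API): format sniffers
# elsewhere in Galaxy test candidate lines against these values, roughly like:
#
#     fields = line.split()
#     looks_like_bed = ( fields[0].lower().startswith( tuple( col1_startswith ) )
#                        and ( len( fields ) < 6 or fields[5] in valid_strand ) )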

class DataMeta( type ):
    """
    Metaclass for Data class.  Sets up metadata spec.
    """
    def __init__( cls, name, bases, dict_ ):
        cls.metadata_spec = metadata.MetadataSpecCollection()
        for base in bases: #loop through bases (class/types) of cls
            if hasattr( base, "metadata_spec" ): #base of class Data (object) has no metadata
                cls.metadata_spec.update( base.metadata_spec ) #add contents of metadata spec of base class to cls
        metadata.Statement.process( cls )

class Data( object ):
    """
    Base class for all datatypes.  Implements basic interfaces as well
    as class methods for metadata.

    >>> class DataTest( Data ):
    ...     MetadataElement( name="test" )
    ...
    >>> DataTest.metadata_spec.test.name
    'test'
    >>> DataTest.metadata_spec.test.desc
    'test'
    >>> type( DataTest.metadata_spec.test.param )
    <class 'galaxy.datatypes.metadata.MetadataParameter'>

    """
    __metaclass__ = DataMeta
    # Add metadata elements
    MetadataElement( name="dbkey", desc="Database/Build", default="?", param=metadata.DBKeyParameter, multiple=False, no_value="?" )
    # Stores the set of display applications, and viewing methods, supported by this datatype
    supported_display_apps = {}
    # If False, the peek is regenerated whenever a dataset of this type is copied
    copy_safe_peek = True
    # The dataset contains binary data --> do not space_to_tab or convert newlines, etc.
    # Allow binary file uploads of this type when True.
    is_binary = True
    # Allow user to change between this datatype and others. If False, this datatype
    # cannot be changed from or into.
    allow_datatype_change = True
    # Composite datatypes
    composite_type = None
    composite_files = odict()
    primary_file_name = 'index'
    # A per datatype setting (inherited): max file size (in bytes) for setting optional metadata
    _max_optional_metadata_filesize = None

    def __init__(self, **kwd):
        """Initialize the datatype"""
        object.__init__(self, **kwd)
        self.supported_display_apps = self.supported_display_apps.copy()
        self.composite_files = self.composite_files.copy()
        self.display_applications = odict()
    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        fd = open(dataset.file_name, 'wb')
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            fd.write(chunk)
        fd.close()
    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd = open(dataset.file_name, 'wb')
        fd.write(data)
        fd.close()
    def get_raw_data( self, dataset ):
        """Returns the full data. To stream it open the file_name and read/write as needed"""
        try:
            return file(dataset.file_name, 'rb').read(-1)
        except (IOError, OSError), e:
            log.exception('%s reading a file that does not exist %s' % (self.__class__.__name__, dataset.file_name))
            return ''
    def groom_dataset_content( self, file_name ):
        """This function is called on an output dataset file after the content is initially generated."""
        pass
    def init_meta( self, dataset, copy_from=None ):
        # Metadata should be left mostly uninitialized.  Dataset will
        # handle returning default values when metadata is not set.
        # copy_from allows metadata to be passed in that will be
        # copied (although this seems ambiguous, see
        # Dataset.set_metadata.  It always copies the rhs in order to
        # flag the object as modified for SQLAlchemy).
        if copy_from:
            dataset.metadata = copy_from.metadata
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """Unimplemented method, allows guessing of metadata from contents of file"""
        return True
    def missing_meta( self, dataset, check = [], skip = [] ):
        """
        Checks for empty metadata values.  Returns True if a non-optional
        metadata value is missing.
        Specifying a list of 'check' names will check only those names; when
        used, optionality is ignored.
        Names in the 'skip' list are never treated as missing, even when
        their metadata values are unset.
        """
        if check:
            to_check = [ ( key, dataset.metadata.get( key ) ) for key in check ]
        else:
            to_check = dataset.metadata.items()
        for key, value in to_check:
            if key in skip or ( not check and dataset.metadata.spec[key].get( "optional" ) ):
                continue #we skip check for optional and nonrequested values here
            if not value:
                return True
        return False
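    # Usage sketch (hypothetical call site; dataset is any object with
    # Galaxy-style .metadata):
    #
    #     if self.missing_meta( dataset, skip=[ 'dbkey' ] ):
    #         ... # treat the dataset's metadata as incomplete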
    def set_max_optional_metadata_filesize( self, max_value ):
        try:
            max_value = int( max_value )
        except ( TypeError, ValueError ):
            return
        self.__class__._max_optional_metadata_filesize = max_value
    def get_max_optional_metadata_filesize( self ):
        rval = self.__class__._max_optional_metadata_filesize
        if rval is None:
            return -1
        return rval
    max_optional_metadata_filesize = property( get_max_optional_metadata_filesize, set_max_optional_metadata_filesize )
    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek and blurb text"""
        if not dataset.dataset.purged:
            dataset.peek = ''
            dataset.blurb = 'data'
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
    def display_peek(self, dataset ):
        """Create HTML table, used for displaying peek"""
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            if not dataset.peek:
                dataset.set_peek()
            data = dataset.peek
            lines = data.splitlines()
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                if type( line ) is unicode:
                    out.append( '<tr><td>%s</td></tr>' % escape( line ) )
                else:
                    out.append( '<tr><td>%s</td></tr>' % escape( unicode( line, 'utf-8' ) ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception, exc:
            out = "Can't create peek %s" % str( exc )
        return out
    def display_name(self, dataset):
        """Returns formatted html of dataset name"""
        try:
            if type( dataset.name ) is unicode:
                return escape( dataset.name )
            else:
                return escape( unicode( dataset.name, 'utf-8' ) )
        except:
            return "name unavailable"
    def display_info(self, dataset):
        """Returns formatted html of dataset info"""
        try:
            # Change new line chars to html
            info = escape( dataset.info )
            if info.find( '\r\n' ) >= 0:
                info = info.replace( '\r\n', '<br/>' )
            if info.find( '\r' ) >= 0:
                info = info.replace( '\r', '<br/>' )
            if info.find( '\n' ) >= 0:
                info = info.replace( '\n', '<br/>' )

            # Convert to unicode to display non-ascii characters.
            if type( info ) is not unicode:
                info = unicode( info, 'utf-8' )

            return info
        except:
            return "info unavailable"
    def validate(self, dataset):
        """Unimplemented validate, return no exceptions"""
        return list()
    def repair_methods(self, dataset):
        """Unimplemented method, returns dict with method/option for repairing errors"""
        return None
    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'application/octet-stream'
    def add_display_app ( self, app_id, label, file_function, links_function ):
        """
        Adds a display app to the datatype.
        app_id is a unique id
        label is the primary display label, e.g., display at 'UCSC'
        file_function is a string containing the name of the function that returns a properly formatted display
        links_function is a string containing the name of the function that returns a list of (link_name,link)
        """
        self.supported_display_apps = self.supported_display_apps.copy()
        self.supported_display_apps[app_id] = {'label':label, 'file_function':file_function, 'links_function':links_function}
    def remove_display_app (self, app_id):
        """Removes a display app from the datatype"""
        self.supported_display_apps = self.supported_display_apps.copy()
        try:
            del self.supported_display_apps[app_id]
        except:
            log.exception('Tried to remove display app %s from datatype %s, but this display app is not declared.' % ( app_id, self.__class__.__name__ ) )
    def clear_display_apps( self ):
        self.supported_display_apps = {}
    def add_display_application( self, display_application ):
        """New style display applications"""
        assert display_application.id not in self.display_applications, 'Attempted to add a display application twice'
        self.display_applications[ display_application.id ] = display_application
    def get_display_application( self, key, default = None ):
        return self.display_applications.get( key, default )
    def get_display_applications_by_dataset( self, dataset, trans ):
        rval = odict()
        for key, value in self.display_applications.iteritems():
            value = value.filter_by_dataset( dataset, trans )
            if value.links:
                rval[key] = value
        return rval
    def get_display_types(self):
        """Returns display types available"""
        return self.supported_display_apps.keys()
    def get_display_label(self, type):
        """Returns primary label for display app"""
        try:
            return self.supported_display_apps[type]['label']
        except:
            return 'unknown'
    def as_display_type(self, dataset, type, **kwd):
        """Returns modified file contents for a particular display type"""
        try:
            if type in self.get_display_types():
                return getattr( self, self.supported_display_apps[type]['file_function'] )( dataset, **kwd )
        except:
            log.exception('Function %s is referred to in datatype %s for displaying as type %s, but is not accessible' % (self.supported_display_apps[type]['file_function'], self.__class__.__name__, type) )
        return "This display type (%s) is not implemented for this datatype (%s)." % ( type, dataset.ext )
    def get_display_links( self, dataset, type, app, base_url, target_frame='_blank', **kwd ):
        """
        Returns a list of tuples of (name, link) for a particular display type.  No check on
        'access' permissions is done here - if you can view the dataset, you can also save it
        or send it to a destination outside of Galaxy, so Galaxy security restrictions do not
        apply anyway.
        """
        try:
            if type in self.get_display_types():
                return target_frame, getattr( self, self.supported_display_apps[type]['links_function'] )( dataset, type, app, base_url, **kwd )
        except:
            log.exception( 'Function %s is referred to in datatype %s for generating links for type %s, but is not accessible' \
                           % ( self.supported_display_apps[type]['links_function'], self.__class__.__name__, type ) )
        return []
    def get_converter_types(self, original_dataset, datatypes_registry):
        """Returns available converters by type for this dataset"""
        return datatypes_registry.get_converters_by_datatype(original_dataset.ext)
    def find_conversion_destination( self, dataset, accepted_formats, datatypes_registry, **kwd ):
        """Returns ( target_ext, existing converted dataset )"""
        return datatypes_registry.find_conversion_destination_for_dataset_by_extensions( dataset, accepted_formats, **kwd )
    def convert_dataset(self, trans, original_dataset, target_type, return_output = False, visible = True, deps=None):
        """This function adds a job to the queue to convert a dataset to another type. Returns a message about success/failure."""
        converter = trans.app.datatypes_registry.get_converter_by_target_type( original_dataset.ext, target_type )

        if converter is None:
            raise Exception( "A converter does not exist for %s to %s." % ( original_dataset.ext, target_type ) )
        # Generate parameter dictionary
        params = {}
        # Determine input parameter name and add to params
        input_name = 'input1'
        for key, value in converter.inputs.items():
            if deps and value.name in deps:
                params[value.name] = deps[value.name]
            elif value.type == 'data':
                input_name = key

        params[input_name] = original_dataset
        # Run converter; job is dispatched through the queue
        converted_dataset = converter.execute( trans, incoming = params, set_output_hid = visible )[1]
        if len(params) > 0:
            trans.log_event( "Converter params: %s" % (str(params)), tool_id=converter.id )
        if not visible:
            for name, value in converted_dataset.iteritems():
                value.visible = False
        if return_output:
            return converted_dataset
        return "The file conversion of %s on data %s has been added to the Queue." % (converter.name, original_dataset.hid)
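    # Usage sketch (hypothetical call site; trans is a Galaxy transaction and
    # hda a history dataset association):
    #
    #     message = datatype.convert_dataset( trans, hda, 'tabular' )
    #
    # Passing return_output=True returns the converted dataset(s) instead of
    # the status message.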
    # We need to clear associated files before we set metadata, so that, as soon
    # as metadata starts to be set, implicitly converted datasets (for example)
    # are deleted and no longer available 'while' metadata is being set, not
    # just after.  We'll also clear after setting metadata, for backwards
    # compatibility.
    def after_setting_metadata( self, dataset ):
        """This function is called on the dataset after metadata is set."""
        dataset.clear_associated_files( metadata_safe = True )
    def before_setting_metadata( self, dataset ):
        """This function is called on the dataset before metadata is set."""
        dataset.clear_associated_files( metadata_safe = True )
    def __new_composite_file( self, name, optional = False, mimetype = None, description = None, substitute_name_with_metadata = None, is_binary = False, space_to_tab = False, **kwds ):
        kwds[ 'name' ] = name
        kwds[ 'optional' ] = optional
        kwds[ 'mimetype' ] = mimetype
        kwds[ 'description' ] = description
        kwds[ 'substitute_name_with_metadata' ] = substitute_name_with_metadata
        kwds[ 'is_binary' ] = is_binary
        kwds[ 'space_to_tab' ] = space_to_tab
        return Bunch( **kwds )
    def add_composite_file( self, name, **kwds ):
        #self.composite_files = self.composite_files.copy()
        self.composite_files[ name ] = self.__new_composite_file( name, **kwds )
    def __substitute_composite_key( self, key, composite_file, dataset = None ):
        if composite_file.substitute_name_with_metadata:
            if dataset:
                meta_value = str( dataset.metadata.get( composite_file.substitute_name_with_metadata ) )
            else:
                meta_value = self.metadata_spec[composite_file.substitute_name_with_metadata].default
            return key % meta_value
        return key
    @property
    def writable_files( self, dataset = None ):
        # Note: accessed as a property, so dataset is effectively always None here
        files = odict()
        if self.composite_type != 'auto_primary_file':
            files[ self.primary_file_name ] = self.__new_composite_file( self.primary_file_name )
        for key, value in self.get_composite_files( dataset = dataset ).iteritems():
            files[ key ] = value
        return files
    def get_composite_files( self, dataset = None ):
        def substitute_composite_key( key, composite_file ):
            if composite_file.substitute_name_with_metadata:
                if dataset:
                    meta_value = str( dataset.metadata.get( composite_file.substitute_name_with_metadata ) )
                else:
                    meta_value = self.metadata_spec[ composite_file.substitute_name_with_metadata ].default
                return key % meta_value
            return key
        files = odict()
        for key, value in self.composite_files.iteritems():
            files[ substitute_composite_key( key, value ) ] = value
        return files
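    # Illustrative example (hypothetical composite file): a %-placeholder in a
    # composite file key is filled from dataset metadata, so
    #
    #     self.add_composite_file( '%s.dat', substitute_name_with_metadata='base_name' )
    #
    # resolves to 'sample1.dat' for a dataset whose base_name metadata is 'sample1'.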
    def generate_auto_primary_file( self, dataset = None ):
        raise Exception( "generate_auto_primary_file is not implemented for this datatype." )
    @property
    def has_resolution(self):
        return False

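# Illustrative sketch of a subclass (hypothetical datatype, not registered
# anywhere): DataMeta gathers MetadataElement declarations from the class body
# and its bases, so a subclass only declares what it adds.
#
#     class ScoredData( Data ):
#         file_ext = 'scored'
#         MetadataElement( name="scoreCol", default=5, desc="Score column", optional=True, no_value=0 )
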
class Text( Data ):
    file_ext = 'txt'

    # Add metadata elements
    MetadataElement( name="data_lines", default=0, desc="Number of data lines", readonly=True, optional=True, visible=False, no_value=0 )

    def write_from_stream(self, dataset, stream):
        """Writes data from a stream"""
        # Write the data to a temp file first, then rewrite it with unix newlines
        fd, temp_name = tempfile.mkstemp()
        while 1:
            chunk = stream.read(1048576)
            if not chunk:
                break
            os.write(fd, chunk)
        os.close(fd)
        # Rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove( temp_name )
    def set_raw_data(self, dataset, data):
        """Saves the data on the disc"""
        fd, temp_name = tempfile.mkstemp()
        os.write(fd, data)
        os.close(fd)
        # Rewrite the file with unix newlines
        fp = open(dataset.file_name, 'wt')
        for line in file(temp_name, "U"):
            line = line.strip() + '\n'
            fp.write(line)
        fp.close()
        os.remove( temp_name )
    def get_mime(self):
        """Returns the mime type of the datatype"""
        return 'text/plain'
    def set_meta( self, dataset, **kwd ):
        """
        Set the number of lines of data in dataset,
        skipping all blank lines and comments.
        """
        data_lines = 0
        for line in file( dataset.file_name ):
            line = line.strip()
            if line and not line.startswith( '#' ):
                data_lines += 1
        dataset.metadata.data_lines = data_lines
    def set_peek( self, dataset, line_count=None, is_multi_byte=False ):
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if line_count is None:
                # See if line_count is stored in the metadata
                if dataset.metadata.data_lines:
                    dataset.blurb = "%s lines" % util.commaify( str( dataset.metadata.data_lines ) )
                else:
                    # Number of lines is not known ( this should not happen ), and auto-detect is
                    # needed to set metadata
                    dataset.blurb = "? lines"
            else:
                dataset.blurb = "%s lines" % util.commaify( str( line_count ) )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

class Newick( Text ):
    pass

# ------------- Utility methods --------------

def get_test_fname( fname ):
    """Returns test data filename"""
    path, name = os.path.split(__file__)
    full_path = os.path.join( path, 'test', fname )
    return full_path
def nice_size(size):
    """
    Returns a readably formatted string with the size

    >>> nice_size(100)
    '100.0 bytes'
    >>> nice_size(10000)
    '9.8 Kb'
    >>> nice_size(1000000)
    '976.6 Kb'
    >>> nice_size(100000000)
    '95.4 Mb'
    """
    words = [ 'bytes', 'Kb', 'Mb', 'Gb' ]
    try:
        size = float( size )
    except ( TypeError, ValueError ):
        return '??? bytes'
    for ind, word in enumerate(words):
        step  = 1024 ** (ind + 1)
        if step > size:
            size = size / float(1024 ** ind)
            out  = "%.1f %s" % (size, word)
            return out
    return '??? bytes'
def get_file_peek( file_name, is_multi_byte=False, WIDTH=256, LINE_COUNT=5 ):
    """
    Returns the first LINE_COUNT lines wrapped to WIDTH

    ## >>> fname = get_test_fname('4.bed')
    ## >>> get_file_peek(fname)
    ## 'chr22    30128507    31828507    uc003bnx.1_cds_2_0_chr22_29227_f    0    +\n'
    """
    lines = []
    count = 0
    file_type = None
    data_checked = False
    temp = open( file_name, "U" )
    while count <= LINE_COUNT:
        line = temp.readline( WIDTH )
        if line and not is_multi_byte and not data_checked:
            # See if we have a compressed or binary file
            if line[0:2] == util.gzip_magic:
                file_type = 'gzipped'
                break
            else:
                for char in line:
                    if ord( char ) > 128:
                        file_type = 'binary'
                        break
            data_checked = True
        if file_type in [ 'gzipped', 'binary' ]:
            break
        lines.append( line )
        count += 1
    temp.close()
    if file_type in [ 'gzipped', 'binary' ]:
        text = "%s file" % file_type
    else:
        try:
            text = unicode( '\n'.join( lines ), 'utf-8' )
        except UnicodeDecodeError:
            text = "binary/unknown file"
    return text
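
# Illustrative usage sketch of the utility methods above (the test file name
# is assumed to exist; guarded so the module stays safe to import):
if __name__ == '__main__':
    example = get_test_fname( '4.bed' )
    if os.path.exists( example ):
        print get_file_peek( example )
        print nice_size( os.path.getsize( example ) )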