root/galaxy-central/lib/galaxy/datatypes/binary.py

リビジョン 2, 9.6 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1"""
2Binary classes
3"""
4
5import data, logging, binascii
6from galaxy.datatypes.metadata import MetadataElement
7from galaxy.datatypes import metadata
8from galaxy.datatypes.sniff import *
9from urllib import urlencode, quote_plus
10import zipfile, gzip
11import os, subprocess, tempfile
12import struct
13
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Currently these supported binary data types must be manually set on upload
# (they are identified by extension name only; no sniffer is defined for them here).
unsniffable_binary_formats = [ 'ab1', 'scf' ]
18
class Binary( data.Data ):
    """Datatype for generic binary data."""

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Fill in the peek and blurb text of ``dataset``.

        A purged dataset gets placeholder text saying the file is gone;
        any other dataset gets a fixed description of binary data.
        ``is_multi_byte`` is accepted but not used here.
        """
        if dataset.dataset.purged:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
            return
        dataset.peek = 'binary data'
        dataset.blurb = 'data'

    def get_mime( self ):
        """Return the MIME type reported for this datatype."""
        mime_type = 'application/octet-stream'
        return mime_type
32
class Ab1( Binary ):
    """Class describing an ab1 binary sequence file"""
    file_ext = "ab1"

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set the peek and blurb text of ``dataset``.

        The peek is a fixed description and the blurb is the formatted file
        size; a purged dataset gets placeholder text instead.
        ``is_multi_byte`` is accepted but not used here.
        """
        if not dataset.dataset.purged:
            dataset.peek  = "Binary ab1 sequence file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """
        Return the stored peek text, or a description built from the file
        size when the peek cannot be read.
        """
        try:
            return dataset.peek
        except Exception:
            # Only ordinary errors trigger the fallback; KeyboardInterrupt and
            # SystemExit are allowed to propagate.
            return "Binary ab1 sequence file (%s)" % ( data.nice_size( dataset.get_size() ) )
49
class Bam( Binary ):
    """Class describing a BAM binary file"""
    file_ext = "bam"
    MetadataElement( name="bam_index", desc="BAM Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )

    def _is_coordinate_sorted( self, filename ):
        """
        Check from the header information whether the input BAM file is sorted.

        Runs ``samtools view -H`` on ``filename`` and returns True when its
        standard output contains ``SO:coordinate`` or ``SO:sorted``.
        """
        params = [ "samtools", "view", "-H", filename ]
        output = subprocess.Popen( params, stderr=subprocess.PIPE, stdout=subprocess.PIPE ).communicate()[0]
        return "SO:coordinate" in output or "SO:sorted" in output

    def groom_dataset_content( self, file_name ):
        """
        Ensures that the Bam file contents are sorted.  This function is called
        on an output dataset after the content is initially generated.

        The file at ``file_name`` is replaced in place by the sorted version.
        Raises ``Exception`` when ``samtools sort`` exits with a non-zero code.
        """
        # Imported here because the module does not import shutil explicitly
        # at the top.
        import shutil

        if self._is_coordinate_sorted( file_name ):
            # Don't re-sort if already sorted
            return

        # Use samtools to sort the Bam file
        ##$ samtools sort
        ##Usage: samtools sort [-on] [-m <maxMem>] <in.bam> <out.prefix>
        ## Sort alignments by leftmost coordinates. File <out.prefix>.bam will be created.
        ## This command may also create temporary files <out.prefix>.%d.bam when the
        ## whole alignment cannot be fitted into memory ( controlled by option -m ).
        # Do this in a unique temp directory, because of possible <out.prefix>.%d.bam temp files
        tmp_dir = tempfile.mkdtemp()
        try:
            tmp_sorted_dataset_file_name_prefix = os.path.join( tmp_dir, 'sorted' )
            # samtools accepts a prefix, not a filename, it always adds .bam to the prefix
            samtools_created_sorted_file_name = "%s.bam" % tmp_sorted_dataset_file_name_prefix
            stderr_name = os.path.join( tmp_dir, 'bam_sort_stderr' )
            # An argument list (no shell) keeps paths containing spaces or shell
            # metacharacters intact.
            command = [ "samtools", "sort", file_name, tmp_sorted_dataset_file_name_prefix ]
            stderr_file = open( stderr_name, 'wb' )
            try:
                exit_code = subprocess.Popen( args=command, cwd=tmp_dir, stderr=stderr_file ).wait()
            finally:
                stderr_file.close()

            stderr_file = open( stderr_name )
            try:
                stderr = stderr_file.read().strip()
            finally:
                stderr_file.close()

            # Did sort succeed?  A non-zero exit code is a failure even when
            # samtools wrote nothing to stderr.
            if exit_code != 0:
                raise Exception( "Error Grooming BAM file contents: %s" % ( stderr or "samtools sort exited with code %d" % exit_code ) )
            if stderr:
                print( stderr )

            # Move samtools_created_sorted_file_name to our output dataset location
            shutil.move( samtools_created_sorted_file_name, file_name )
        finally:
            # Remove the stderr capture and any leftover <out.prefix>.%d.bam chunks
            shutil.rmtree( tmp_dir, ignore_errors=True )

    def init_meta( self, dataset, copy_from=None ):
        """Initialize metadata by delegating to ``Binary.init_meta``."""
        Binary.init_meta( self, dataset, copy_from=copy_from )

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """
        Creates the index for the BAM file.

        Runs ``samtools index`` on the dataset file and stores the resulting
        index file in ``dataset.metadata.bam_index``.  ``overwrite`` and
        ``kwd`` are accepted but not used here.
        Raises ``Exception`` when ``samtools index`` exits with a non-zero code.
        """
        # These metadata values are not accessible by users, always overwrite
        index_file = dataset.metadata.bam_index
        if not index_file:
            index_file = dataset.metadata.spec['bam_index'].param.new_file( dataset = dataset )

        # Create the Bam index
        ##$ samtools index
        ##Usage: samtools index <in.bam> [<out.index>]
        stderr_fd, stderr_name = tempfile.mkstemp( prefix = "bam_index_stderr" )
        try:
            stderr_file = os.fdopen( stderr_fd, 'wb' )
            try:
                # An argument list (no shell) keeps unusual file names intact.
                command = [ "samtools", "index", dataset.file_name, index_file.file_name ]
                exit_code = subprocess.Popen( args=command, stderr=stderr_file ).wait()
            finally:
                stderr_file.close()

            stderr_file = open( stderr_name )
            try:
                stderr = stderr_file.read().strip()
            finally:
                stderr_file.close()
        finally:
            # Remove temp file
            os.unlink( stderr_name )

        # Did index succeed?  A non-zero exit code is a failure even when
        # samtools wrote nothing to stderr.
        if exit_code != 0:
            raise Exception( "Error Setting BAM Metadata: %s" % ( stderr or "samtools index exited with code %d" % exit_code ) )
        if stderr:
            print( stderr )

        dataset.metadata.bam_index = index_file

    def sniff( self, filename ):
        """
        Return True when ``filename`` looks like a BAM file, False otherwise.

        Any error while opening or reading the file counts as "not BAM".
        """
        # BAM is compressed in the BGZF format, and must not be uncompressed in Galaxy.
        # The first 4 bytes of any bam file is 'BAM\1', and the file is binary.
        try:
            handle = gzip.open( filename )
            try:
                header = handle.read( 4 )
            finally:
                handle.close()
        except Exception:
            return False
        return header == 'BAM\1'

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set the peek and blurb text of ``dataset``.

        The peek is a fixed description and the blurb is the formatted file
        size; a purged dataset gets placeholder text instead.
        ``is_multi_byte`` is accepted but not used here.
        """
        if not dataset.dataset.purged:
            dataset.peek  = "Binary bam alignments file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """
        Return the stored peek text, or a description built from the file
        size when the peek cannot be read.
        """
        try:
            return dataset.peek
        except Exception:
            return "Binary bam alignments file (%s)" % ( data.nice_size( dataset.get_size() ) )

    def get_track_type( self ):
        """Return the track type name and a dict naming the data and index sources."""
        return "ReadTrack", {"data": "bai", "index": "summary_tree"}
156   
class Scf( Binary ):
    """Class describing an scf binary sequence file"""
    file_ext = "scf"

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set the peek and blurb text of ``dataset``.

        The peek is a fixed description and the blurb is the formatted file
        size; a purged dataset gets placeholder text instead.
        ``is_multi_byte`` is accepted but not used here.
        """
        if not dataset.dataset.purged:
            dataset.peek  = "Binary scf sequence file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """
        Return the stored peek text, or a description built from the file
        size when the peek cannot be read.
        """
        try:
            return dataset.peek
        except Exception:
            # Only ordinary errors trigger the fallback; KeyboardInterrupt and
            # SystemExit are allowed to propagate.
            return "Binary scf sequence file (%s)" % ( data.nice_size( dataset.get_size() ) )
173
class Sff( Binary ):
    """ Standard Flowgram Format (SFF) """
    file_ext = "sff"

    def __init__( self, **kwd ):
        """Initialize through ``Binary.__init__``, forwarding all keyword arguments."""
        Binary.__init__( self, **kwd )

    def sniff( self, filename ):
        """
        Return True when ``filename`` starts with the SFF signature, False otherwise.

        Any error while opening or reading the file counts as "not SFF".
        """
        # The first 4 bytes of any sff file is '.sff', and the file is binary. For details
        # about the format, see http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=format
        try:
            # Binary mode: the content is not text, and the handle is closed
            # explicitly instead of being left to the garbage collector.
            handle = open( filename, 'rb' )
            try:
                header = handle.read( 4 )
            finally:
                handle.close()
        except Exception:
            return False
        return header == '.sff'

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set the peek and blurb text of ``dataset``.

        The peek is a fixed description and the blurb is the formatted file
        size; a purged dataset gets placeholder text instead.
        ``is_multi_byte`` is accepted but not used here.
        """
        if not dataset.dataset.purged:
            dataset.peek  = "Binary sff file"
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """
        Return the stored peek text, or a description built from the file
        size when the peek cannot be read.
        """
        try:
            return dataset.peek
        except Exception:
            return "Binary sff file (%s)" % ( data.nice_size( dataset.get_size() ) )
202
class BigWig(Binary):
    """
    Accessing binary BigWig files from UCSC.
    The supplemental info in the paper has the binary details:
    http://bioinformatics.oxfordjournals.org/cgi/content/abstract/btq351v1
    """
    def __init__( self, **kwd ):
        """Initialize through ``Binary.__init__`` and record the format's magic number and display name."""
        Binary.__init__( self, **kwd )
        # Value the first unsigned int of the file is compared against in sniff()
        self._magic = 0x888FFC26
        # Name inserted into the peek text
        self._name = "BigWig"

    def _unpack( self, pattern, handle ):
        """Read exactly as many bytes from ``handle`` as the struct ``pattern`` needs and unpack them."""
        return struct.unpack( pattern, handle.read( struct.calcsize( pattern ) ) )

    def sniff( self, filename ):
        """
        Return True when the file starts with this datatype's magic number.

        The leading unsigned int is unpacked with format "I", i.e. in the
        machine's native byte order.  A file that cannot be opened, or is too
        short to hold the magic number, is reported as False rather than
        raising.
        """
        try:
            # Binary mode, closed explicitly so the handle is not leaked
            handle = open( filename, 'rb' )
            try:
                magic = self._unpack( "I", handle )
            finally:
                handle.close()
        except ( IOError, OSError, struct.error ):
            return False
        return magic[0] == self._magic

    def set_peek( self, dataset, is_multi_byte=False ):
        """
        Set the peek and blurb text of ``dataset``.

        The peek names the format and the blurb is the formatted file size;
        a purged dataset gets placeholder text instead.
        ``is_multi_byte`` is accepted but not used here.
        """
        if not dataset.dataset.purged:
            dataset.peek  = "Binary UCSC %s file" % self._name
            dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'

    def display_peek( self, dataset ):
        """
        Return the stored peek text, or a description built from the format
        name and file size when the peek cannot be read.
        """
        try:
            return dataset.peek
        except Exception:
            return "Binary UCSC %s file (%s)" % ( self._name, data.nice_size( dataset.get_size() ) )
230
class BigBed(BigWig):
    """UCSC BigBed datatype: the BigWig behaviour with its own magic number and display name."""
    def __init__( self, **kwd ):
        """Initialize through ``Binary.__init__``, then install the BigBed magic number and name."""
        Binary.__init__( self, **kwd )
        self._magic, self._name = 0x8789F2EB, "BigBed"
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。