root/galaxy-central/lib/galaxy/datatypes/sequence.py @ 2

リビジョン 2, 24.2 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1"""
2Sequence classes
3"""
4
5import data
6import logging
7import re
8import string
9from cgi import escape
10from galaxy.datatypes.metadata import MetadataElement
11from galaxy.datatypes import metadata
12import galaxy.model
13from galaxy import util
14from sniff import *
15
16log = logging.getLogger(__name__)
17
class Sequence( data.Text ):
    """Base class describing a sequence datatype."""

    # Add metadata elements
    MetadataElement( name="sequences", default=0, desc="Number of sequences", readonly=True, visible=False, optional=True, no_value=0 )

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of sequences and the number of data lines in dataset.

        Comment lines ( starting with '#' ) are not counted.  Every other
        line, including blank lines, counts as a data line; lines starting
        with '>' additionally count as sequence headers.
        """
        data_lines = 0
        sequences = 0
        # open() instead of the deprecated file() builtin; close the handle
        # even if iteration raises (the original never closed it).
        fh = open( dataset.file_name )
        try:
            for line in fh:
                line = line.strip()
                if line and line.startswith( '#' ):
                    # We don't count comment lines for sequence data types
                    continue
                if line and line.startswith( '>' ):
                    sequences += 1
                    data_lines += 1
                else:
                    data_lines += 1
        finally:
            fh.close()
        dataset.metadata.data_lines = data_lines
        dataset.metadata.sequences = sequences
    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek text and the blurb ( '<n> sequences' or the file size )."""
        if not dataset.dataset.purged:
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if dataset.metadata.sequences:
                dataset.blurb = "%s sequences" % util.commaify( str( dataset.metadata.sequences ) )
            else:
                # sequence count unknown; fall back to a human-readable size
                dataset.blurb = data.nice_size( dataset.get_size() )
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
52
class Alignment( data.Text ):
    """Class describing an alignment"""

    """Add metadata elements"""
    # 'species' lists the species present in the alignment; it is filled in
    # by subclasses' set_meta() ( e.g. Maf ) and is read-only for users.
    MetadataElement( name="species", desc="Species", default=[], param=metadata.SelectParameter, multiple=True, readonly=True, no_value=None )
58
class Fasta( Sequence ):
    """Class representing a FASTA sequence"""
    file_ext = "fasta"

    def sniff( self, filename ):
        """
        Determines whether the file is in fasta format
       
        A sequence in FASTA format consists of a single-line description, followed by lines of sequence data.
        The first character of the description line is a greater-than (">") symbol in the first column.
        All lines should be shorter than 80 characters
       
        For complete details see http://www.ncbi.nlm.nih.gov/blast/fasta.shtml
       
        Rules for sniffing as True:
            We don't care about line length (other than empty lines).
            The first non-empty line must start with '>' and the Very Next line.strip() must have sequence data and not be a header.
                'sequence data' here is loosely defined as non-empty lines which do not start with '>'
                This will cause Color Space FASTA (csfasta) to be detected as True (they are, after all, still FASTA files - they have a header line followed by sequence data)
                    Previously this method did some checking to determine if the sequence data had integers (presumably to differentiate between fasta and csfasta)
                    This should be done through sniff order, where csfasta (currently has a null sniff function) is detected for first (stricter definition) followed sometime after by fasta
            We will only check that the first purported sequence is correctly formatted.
       
        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Fasta().sniff( fname )
        False
        >>> fname = get_test_fname( 'sequence.fasta' )
        >>> Fasta().sniff( fname )
        True
        """
        fh = None
        try:
            fh = open( filename )
            while True:
                line = fh.readline()
                if not line:
                    break #EOF
                line = line.strip()
                if line: #first non-empty line
                    if line.startswith( '>' ):
                        #The next line.strip() must not be '', nor startwith '>'
                        line = fh.readline().strip()
                        if line == '' or line.startswith( '>' ):
                            break
                        return True
                    else:
                        break #we found a non-empty line, but its not a fasta header
        except Exception:
            # Unreadable file: treat as "not fasta" rather than raising
            pass
        finally:
            # Always release the handle; the original leaked it on the
            # 'return True' path and whenever an exception was raised.
            if fh is not None:
                fh.close()
        return False
110
class csFasta( Sequence ):
    """ Class representing the SOLID Color-Space sequence ( csfasta ) """
    file_ext = "csfasta"

    def sniff( self, filename ):
        """
        Color-space sequence:
            >2_15_85_F3
            T213021013012303002332212012112221222112212222

        >>> fname = get_test_fname( 'sequence.fasta' )
        >>> csFasta().sniff( fname )
        False
        >>> fname = get_test_fname( 'sequence.csfasta' )
        >>> csFasta().sniff( fname )
        True
        """
        fh = None
        try:
            fh = open( filename )
            while True:
                line = fh.readline()
                if not line:
                    break #EOF
                line = line.strip()
                if line and not line.startswith( '#' ): #first non-empty non-comment line
                    if line.startswith( '>' ):
                        line = fh.readline().strip()
                        if line == '' or line.startswith( '>' ):
                            break
                        elif line[0] not in string.ascii_uppercase:
                            # first character must be the uppercase primer base
                            return False
                        elif len( line ) > 1 and not re.search( r'^[\d.]+$', line[1:] ):
                            # remainder must be color-space digits ( '.' = missing )
                            return False
                        return True
                    else:
                        break #we found a non-empty line, but it's not a header
        except Exception:
            pass
        finally:
            # Close on every exit path; the original leaked the handle on
            # both return paths and on exceptions.
            if fh is not None:
                fh.close()
        return False

    def set_meta( self, dataset, **kwd ):
        """
        Set data_lines/sequences metadata unless the file exceeds the
        optional-metadata size limit, in which case both are left unset.
        """
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            dataset.metadata.data_lines = None
            dataset.metadata.sequences = None
            return
        return Sequence.set_meta( self, dataset, **kwd )
158
class Fastq ( Sequence ):
    """Class representing a generic FASTQ sequence"""
    file_ext = "fastq"

    def set_meta( self, dataset, **kwd ):
        """
        Set the number of sequences and the number of data lines
        in dataset.

        A FASTQ record is nominally 4 lines; '@' may legitimately start a
        quality line, so a new record is only counted once at least 4 lines
        of the previous one have been seen.
        """
        if self.max_optional_metadata_filesize >= 0 and dataset.get_size() > self.max_optional_metadata_filesize:
            # too large for optional metadata; leave both values unset
            dataset.metadata.data_lines = None
            dataset.metadata.sequences = None
            return
        data_lines = 0
        sequences = 0
        seq_counter = 0     # blocks should be 4 lines long
        # open() instead of the deprecated file() builtin; close the handle
        # even if iteration raises.
        fh = open( dataset.file_name )
        try:
            for line in fh:
                line = line.strip()
                if line and line.startswith( '#' ) and not sequences:
                    # We don't count comment lines for sequence data types
                    continue
                if line and line.startswith( '@' ):
                    if seq_counter >= 4:
                        # count previous block
                        # blocks should be 4 lines long
                        sequences += 1
                        seq_counter = 1
                    else:
                        # in case quality line starts with @
                        seq_counter += 1
                    data_lines += 1
                else:
                    data_lines += 1
                    seq_counter += 1
        finally:
            fh.close()
        if seq_counter >= 4:
            # count final block
            sequences += 1
        dataset.metadata.data_lines = data_lines
        dataset.metadata.sequences = sequences
    def sniff ( self, filename ):
        """
        Determines whether the file is in generic fastq format
        For details, see http://maq.sourceforge.net/fastq.shtml

        Note: There are three kinds of FASTQ files, known as "Sanger" (sometimes called "Standard"), Solexa, and Illumina
              These differ in the representation of the quality scores

        >>> fname = get_test_fname( '1.fastqsanger' )
        >>> Fastq().sniff( fname )
        True
        >>> fname = get_test_fname( '2.fastqsanger' )
        >>> Fastq().sniff( fname )
        True
        """
        headers = get_headers( filename, None )
        # Anchored at BOTH ends: the original pattern "^[NGTAC]*" matched a
        # zero-length prefix of any string, so the base check below could
        # never fail.
        bases_regexp = re.compile( r"^[NGTAC]*$" )
        # check that first block looks like a fastq block
        try:
            if len( headers ) >= 4 and headers[0][0] and headers[0][0][0] == "@" and headers[2][0] and headers[2][0][0] == "+" and headers[1][0]:
                # Check the sequence line, make sure it contains only G/C/A/T/N
                if not bases_regexp.match( headers[1][0] ):
                    return False
                return True
            return False
        except Exception:
            # malformed/short header lines raise IndexError above
            return False
225
class FastqSanger( Fastq ):
    """Class representing a FASTQ sequence ( the Sanger variant )"""
    # Behavior is inherited from Fastq; per the note there, the variants
    # differ only in how quality scores are represented.
    file_ext = "fastqsanger"
229
class FastqSolexa( Fastq ):
    """Class representing a FASTQ sequence ( the Solexa variant )"""
    # Behavior is inherited from Fastq; only the quality-score encoding
    # distinguishes this variant.
    file_ext = "fastqsolexa"
233
class FastqIllumina( Fastq ):
    """Class representing a FASTQ sequence ( the Illumina 1.3+ variant )"""
    # Behavior is inherited from Fastq; only the quality-score encoding
    # distinguishes this variant.
    file_ext = "fastqillumina"
237
class FastqCSSanger( Fastq ):
    """Class representing a Color Space FASTQ sequence ( e.g a SOLiD variant )"""
    # Behavior is inherited from Fastq.
    file_ext = "fastqcssanger"
241
242try:
243    from galaxy import eggs
244    import pkg_resources; pkg_resources.require( "bx-python" )
245    import bx.align.maf
246except:
247    pass
248
249#trying to import maf_utilities here throws an ImportError due to a circular import between jobs and tools:
250#from galaxy.tools.util.maf_utilities import build_maf_index_species_chromosomes
251#Traceback (most recent call last):
252#  File "./scripts/paster.py", line 27, in <module>
253#    command.run()
254#  File "build/bdist.solaris-2.11-i86pc/egg/paste/script/command.py", line 78, in run
255#  File "build/bdist.solaris-2.11-i86pc/egg/paste/script/command.py", line 117, in invoke
256#  File "build/bdist.solaris-2.11-i86pc/egg/paste/script/command.py", line 212, in run
257#  File "build/bdist.solaris-2.11-i86pc/egg/paste/script/serve.py", line 227, in command
258#  File "build/bdist.solaris-2.11-i86pc/egg/paste/script/serve.py", line 250, in loadapp
259#  File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 193, in loadapp
260#  File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 213, in loadobj
261#  File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 237, in loadcontext
262#  File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 267, in _loadconfig
263#  File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 397, in get_context
264#  File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 439, in _context_from_explicit
265#  File "build/bdist.solaris-2.11-i86pc/egg/paste/deploy/loadwsgi.py", line 18, in import_string
266#  File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/pkg_resources.py", line 1912, in load
267#    entry = __import__(self.module_name, globals(),globals(), ['__name__'])
268#  File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/web/buildapp.py", line 18, in <module>
269#    from galaxy import config, jobs, util, tools
270#  File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/jobs/__init__.py", line 3, in <module>
271#    from galaxy import util, model
272#  File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/model/__init__.py", line 13, in <module>
273#    import galaxy.datatypes.registry
274#  File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/datatypes/registry.py", line 6, in <module>
275#    import data, tabular, interval, images, sequence, qualityscore, genetics, xml, coverage, tracks, chrominfo
276#  File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/datatypes/sequence.py", line 344, in <module>
277#    from galaxy.tools.util.maf_utilities import build_maf_index_species_chromosomes
278#  File "/afs/bx.psu.edu/home/dan/galaxy/central/lib/galaxy/tools/__init__.py", line 15, in <module>
279#    from galaxy import util, jobs, model
280#ImportError: cannot import name jobs
281#so we'll copy and paste for now...terribly icky
282#*** ANYCHANGE TO THIS METHOD HERE OR IN maf_utilities MUST BE PROPAGATED ***
def COPIED_build_maf_index_species_chromosomes( filename, index_species = None ):
    """
    Scan the MAF file at `filename` in a single pass and collect:
      - an interval index of the alignment blocks,
      - the list of species seen,
      - a dict mapping each species to the chromosomes it appears on,
      - the number of alignment blocks.

    If `index_species` is given, only components of those species are added
    to the interval index ( the species/chromosome lists still cover all
    components ).

    Returns ( indexes, species, species_chromosomes, blocks ), or
    ( None, [], {}, 0 ) if the file cannot be parsed as MAF.

    NOTE(review): copied from galaxy.tools.util.maf_utilities to avoid the
    circular import documented above; any change here must be propagated
    there ( see the warning comment preceding this function ).
    """
    species = []
    species_chromosomes = {}
    # NOTE(review): relies on bx.interval_index_file being importable as a
    # side effect of the 'import bx.align.maf' above -- confirm.
    indexes = bx.interval_index_file.Indexes()
    blocks = 0
    try:
        maf_reader = bx.align.maf.Reader( open( filename ) )
        while True:
            # remember the byte offset of the block about to be read so it
            # can be stored in the interval index
            pos = maf_reader.file.tell()
            block = maf_reader.next()
            if block is None:
                break
            blocks += 1
            for c in block.components:
                spec = c.src
                chrom = None
                # component sources are "species.chromosome"; split on the
                # first '.' only, chromosome names may contain dots
                if "." in spec:
                    spec, chrom = spec.split( ".", 1 )
                if spec not in species:
                    species.append( spec )
                    species_chromosomes[spec] = []
                if chrom and chrom not in species_chromosomes[spec]:
                    species_chromosomes[spec].append( chrom )
                if index_species is None or spec in index_species:
                    forward_strand_start = c.forward_strand_start
                    forward_strand_end = c.forward_strand_end
                    try:
                        forward_strand_start = int( forward_strand_start )
                        forward_strand_end = int( forward_strand_end )
                    except ValueError:
                        continue #start and end are not integers, can't add component to index, goto next component
                        #this likely only occurs when parse_e_rows is True?
                        #could a species exist as only e rows? should the
                    if forward_strand_end > forward_strand_start:
                        #require positive length; i.e. certain lines have start = end = 0 and cannot be indexed
                        indexes.add( c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size )
    except Exception, e:
        #most likely a bad MAF
        log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) )
        return ( None, [], {}, 0 )
    return ( indexes, species, species_chromosomes, blocks )
324
class Maf( Alignment ):
    """Class describing a Maf alignment"""
    file_ext = "maf"

    #Readonly and optional, users can't unset it, but if it is not set, we are generally ok; if required use a metadata validator in the tool definition
    MetadataElement( name="blocks", default=0, desc="Number of blocks", readonly=True, optional=True, visible=False, no_value=0 )
    MetadataElement( name="species_chromosomes", desc="Species Chromosomes", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )
    MetadataElement( name="maf_index", desc="MAF Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )

    def init_meta( self, dataset, copy_from=None ):
        """Initialize metadata, delegating to the Alignment base class."""
        Alignment.init_meta( self, dataset, copy_from=copy_from )
    def set_meta( self, dataset, overwrite = True, **kwd ):
        """
        Parses and sets species, chromosomes, index from MAF file.
        """
        #these metadata values are not accessable by users, always overwrite
        indexes, species, species_chromosomes, blocks = COPIED_build_maf_index_species_chromosomes( dataset.file_name )
        if indexes is None:
            return #this is not a MAF file
        dataset.metadata.species = species
        dataset.metadata.blocks = blocks

        #write species chromosomes to a file
        chrom_file = dataset.metadata.species_chromosomes
        if not chrom_file:
            chrom_file = dataset.metadata.spec['species_chromosomes'].param.new_file( dataset = dataset )
        chrom_out = open( chrom_file.file_name, 'wb' )
        try:
            for spec, chroms in species_chromosomes.items():
                chrom_out.write( "%s\t%s\n" % ( spec, "\t".join( chroms ) ) )
        finally:
            #release the handle even if a write fails
            chrom_out.close()
        dataset.metadata.species_chromosomes = chrom_file

        #write the interval index; close the handle explicitly instead of
        #relying on refcounting as the original 'indexes.write( open(...) )' did
        index_file = dataset.metadata.maf_index
        if not index_file:
            index_file = dataset.metadata.spec['maf_index'].param.new_file( dataset = dataset )
        index_out = open( index_file.file_name, 'wb' )
        try:
            indexes.write( index_out )
        finally:
            index_out.close()
        dataset.metadata.maf_index = index_file
    def set_peek( self, dataset, is_multi_byte=False ):
        """Set the peek text and the blurb ( '<n> blocks' )."""
        if not dataset.dataset.purged:
            # The file must exist on disk for the get_file_peek() method
            dataset.peek = data.get_file_peek( dataset.file_name, is_multi_byte=is_multi_byte )
            if dataset.metadata.blocks:
                dataset.blurb = "%s blocks" % util.commaify( str( dataset.metadata.blocks ) )
            else:
                # Number of blocks is not known ( this should not happen ), and auto-detect is
                # needed to set metadata
                dataset.blurb = "? blocks"
        else:
            dataset.peek = 'file does not exist'
            dataset.blurb = 'file purged from disk'
    def display_peek( self, dataset ):
        """Returns formated html of peek"""
        return self.make_html_table( dataset )
    def make_html_table( self, dataset, skipchars=None ):
        """Create HTML table, used for displaying peek"""
        # skipchars is unused but kept for interface compatibility; default
        # to None rather than a shared mutable list.
        if skipchars is None:
            skipchars = []
        out = ['<table cellspacing="0" cellpadding="3">']
        try:
            out.append('<tr><th>Species:&nbsp;')
            for species in dataset.metadata.species:
                out.append( '%s&nbsp;' % species )
            out.append( '</th></tr>' )
            if not dataset.peek:
                dataset.set_peek()
            # renamed from 'data' to avoid shadowing the imported data module
            peek_text = dataset.peek
            lines = peek_text.splitlines()
            for line in lines:
                line = line.strip()
                if not line:
                    continue
                out.append( '<tr><td>%s</td></tr>' % escape( line ) )
            out.append( '</table>' )
            out = "".join( out )
        except Exception as exc:
            out = "Can't create peek %s" % exc
        return out
    def sniff( self, filename ):
        """
        Determines wether the file is in maf format
       
        The .maf format is line-oriented. Each multiple alignment ends with a blank line.
        Each sequence in an alignment is on a single line, which can get quite long, but
        there is no length limit. Words in a line are delimited by any white space.
        Lines starting with # are considered to be comments. Lines starting with ## can
        be ignored by most programs, but contain meta-data of one form or another.
       
        The first line of a .maf file begins with ##maf. This word is followed by white-space-separated
        variable=value pairs. There should be no white space surrounding the "=".
     
        For complete details see http://genome.ucsc.edu/FAQ/FAQformat#format5
       
        >>> fname = get_test_fname( 'sequence.maf' )
        >>> Maf().sniff( fname )
        True
        >>> fname = get_test_fname( 'sequence.fasta' )
        >>> Maf().sniff( fname )
        False
        """
        headers = get_headers( filename, None )
        try:
            # first field of the first line must be exactly '##maf'
            if len(headers) > 1 and headers[0][0] and headers[0][0] == "##maf":
                return True
            else:
                return False
        except Exception:
            # empty first line raises IndexError above
            return False
430
class MafCustomTrack( data.Text ):
    file_ext = "mafcustomtrack"

    MetadataElement( name="vp_chromosome", default='chr1', desc="Viewport Chromosome", readonly=True, optional=True, visible=False, no_value='' )
    MetadataElement( name="vp_start", default='1', desc="Viewport Start", readonly=True, optional=True, visible=False, no_value='' )
    MetadataElement( name="vp_end", default='100', desc="Viewport End", readonly=True, optional=True, visible=False, no_value='' )

    def set_meta( self, dataset, overwrite = True, **kwd ):
        """
        Parses and sets viewport metadata from MAF file.

        Scans at most the first max_block_check blocks for components whose
        source matches the dataset's dbkey and records the region they cover
        ( chromosome, start, end ) as the viewport.  Best effort: on any
        failure the metadata is simply left untouched.
        """
        max_block_check = 10    # only inspect the first blocks; enough for a viewport
        chrom = None
        forward_strand_start = float( 'inf' )
        forward_strand_end = 0
        maf_file = None
        try:
            maf_file = open( dataset.file_name )
            maf_file.readline() #move past track line
            for i, block in enumerate( bx.align.maf.Reader( maf_file ) ):
                ref_comp = block.get_component_by_src_start( dataset.metadata.dbkey )
                if ref_comp:
                    ref_chrom = bx.align.maf.src_split( ref_comp.src )[-1]
                    if chrom is None:
                        chrom = ref_chrom
                    # only grow the viewport while we stay on the first chromosome seen
                    if chrom == ref_chrom:
                        forward_strand_start = min( forward_strand_start, ref_comp.forward_strand_start )
                        forward_strand_end = max( forward_strand_end, ref_comp.forward_strand_end )
                if i > max_block_check:
                    break

            if forward_strand_end > forward_strand_start:
                #a valid reference region was found; record it as the viewport
                dataset.metadata.vp_chromosome = chrom
                dataset.metadata.vp_start = forward_strand_start
                dataset.metadata.vp_end = forward_strand_end
        except Exception:
            # deliberately best-effort, but leave a trace in the log instead
            # of swallowing the error completely as the original did
            log.debug( 'Setting viewport metadata on %s failed' % dataset.file_name )
        finally:
            # close the handle; the original never closed it
            if maf_file is not None:
                maf_file.close()
467
class Axt( data.Text ):
    """Class describing an axt alignment"""

    # gvk- 11/19/09 - This is really an alignment, but we no longer have tools that use this data type, and it is
    # here simply for backward compatibility ( although it is still in the datatypes registry ).  Subclassing
    # from data.Text eliminates managing metadata elements inherited from the Alignemnt class.

    file_ext = "axt"

    def sniff( self, filename ):
        """
        Determines whether the file is in axt format
       
        axt alignment files are produced from Blastz, an alignment tool available from Webb Miller's lab
        at Penn State University.
       
        Each alignment block in an axt file contains three lines: a summary line and 2 sequence lines.
        Blocks are separated from one another by blank lines.
       
        The summary line contains chromosomal position and size information about the alignment. It
        consists of 9 required fields.
       
        The sequence lines contain the sequence of the primary assembly (line 2) and aligning assembly
        (line 3) with inserts.  Repeats are indicated by lower-case letters.
   
        For complete details see http://genome.ucsc.edu/goldenPath/help/axt.html
       
        >>> fname = get_test_fname( 'alignment.axt' )
        >>> Axt().sniff( fname )
        True
        >>> fname = get_test_fname( 'alignment.lav' )
        >>> Axt().sniff( fname )
        False
        """
        headers = get_headers( filename, None )
        if len(headers) < 4:
            return False
        for hdr in headers:
            if len(hdr) > 0 and hdr[0].startswith("##matrix=axt"):
                return True
            if len(hdr) > 0 and not hdr[0].startswith("#"):
                # first non-comment line must be a 9-field summary line
                if len(hdr) != 9:
                    return False
                try:
                    # fields 1, 3, 4, 6, 7 and 9 must be integers; an explicit
                    # loop replaces the original map( int, ... ), which is lazy
                    # on Python 3 and would never raise
                    for field in [hdr[0], hdr[2], hdr[3], hdr[5], hdr[6], hdr[8]]:
                        int( field )
                except ValueError:
                    return False
                if hdr[7] not in data.valid_strand:
                    return False
                else:
                    return True
        # only blank/comment lines were seen; the original fell off the end
        # here and implicitly returned None
        return False
519
class Lav( data.Text ):
    """Class describing a LAV alignment"""

    file_ext = "lav"

    # gvk- 11/19/09 - This is really an alignment, but we no longer have tools that use this data type, and it is
    # here simply for backward compatibility ( although it is still in the datatypes registry ).  Subclassing
    # from data.Text eliminates managing metadata elements inherited from the Alignemnt class.

    def sniff( self, filename ):
        """
        Determines whether the file is in lav format
       
        LAV is an alignment format developed by Webb Miller's group. It is the primary output format for BLASTZ.
        The first line of a .lav file begins with #:lav.
   
        For complete details see http://www.bioperl.org/wiki/LAV_alignment_format
       
        >>> fname = get_test_fname( 'alignment.lav' )
        >>> Lav().sniff( fname )
        True
        >>> fname = get_test_fname( 'alignment.axt' )
        >>> Lav().sniff( fname )
        False
        """
        header_lines = get_headers( filename, None )
        try:
            # require more than one header line and a first field of '#:lav...'
            first_field = header_lines[0][0]
            if len( header_lines ) < 2 or not first_field:
                return False
            return first_field.startswith( '#:lav' )
        except:
            # an absent or empty first line raises IndexError above
            return False
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。