root/galaxy-central/lib/galaxy/tools/util/maf_utilities.py @ 2

Revision 2, 26.5 KB (committer: hatakeyama, 14 years ago)

#!/usr/bin/env python
"""
Provides wrappers and utilities for working with MAF files and alignments.
"""
#Dan Blankenberg
import pkg_resources; pkg_resources.require( "bx-python" )
import bx.align.maf
import bx.intervals
import bx.interval_index_file
import sys, os, string, tempfile
import logging
from copy import deepcopy

assert sys.version_info[:2] >= ( 2, 4 )

log = logging.getLogger(__name__)


GAP_CHARS = [ '-' ]
SRC_SPLIT_CHAR = '.'

def src_split( src ):
    fields = src.split( SRC_SPLIT_CHAR, 1 )
    spec = fields.pop( 0 )
    if fields:
        chrom = fields.pop( 0 )
    else:
        chrom = spec
    return spec, chrom

def src_merge( spec, chrom, contig = None ):
    if None in [ spec, chrom ]:
        spec = chrom = spec or chrom
    return bx.align.maf.src_merge( spec, chrom, contig )

def get_species_in_block( block ):
    species = []
    for c in block.components:
        spec, chrom = src_split( c.src )
        if spec not in species:
            species.append( spec )
    return species

def tool_fail( msg = "Unknown Error" ):
    print >> sys.stderr, "Fatal Error: %s" % msg
    sys.exit()
#an object corresponding to a reference layered alignment
class RegionAlignment( object ):

    DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" )
    MAX_SEQUENCE_SIZE = sys.maxint #Maximum length of sequence allowed

    def __init__( self, size, species = [] ):
        assert size <= self.MAX_SEQUENCE_SIZE, "Maximum length allowed for an individual sequence has been exceeded (%i > %i)." % ( size, self.MAX_SEQUENCE_SIZE )
        self.size = size
        self.sequences = {}
        if not isinstance( species, list ):
            species = [species]
        for spec in species:
            self.add_species( spec )

    #add a species to the alignment
    def add_species( self, species ):
        #make temporary sequence files
        self.sequences[species] = tempfile.TemporaryFile()
        self.sequences[species].write( "-" * self.size )

    #returns the names for species found in alignment, skipping names as requested
    def get_species_names( self, skip = [] ):
        if not isinstance( skip, list ): skip = [skip]
        names = self.sequences.keys()
        for name in skip:
            try: names.remove( name )
            except: pass
        return names

    #returns the sequence for a species
    def get_sequence( self, species ):
        self.sequences[species].seek( 0 )
        return self.sequences[species].read()

    #returns the reverse complement of the sequence for a species
    def get_sequence_reverse_complement( self, species ):
        complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )]
        complement.reverse()
        return "".join( complement )

    #sets a position for a species
    def set_position( self, index, species, base ):
        if len( base ) != 1: raise Exception( "A genomic position can only have a length of 1." )
        return self.set_range( index, species, base )
    #sets a range for a species
    def set_range( self, index, species, bases ):
        if index >= self.size or index < 0: raise Exception( "Your index (%i) is out of range (0 - %i)." % ( index, self.size - 1 ) )
        if len( bases ) == 0: raise Exception( "A set of genomic positions can only have a positive length." )
        if species not in self.sequences.keys(): self.add_species( species )
        self.sequences[species].seek( index )
        self.sequences[species].write( bases )

    #Flush temp file of specified species, or all species
    def flush( self, species = None ):
        if species is None:
            species = self.sequences.keys()
        elif not isinstance( species, list ):
            species = [species]
        for spec in species:
            self.sequences[spec].flush()

class GenomicRegionAlignment( RegionAlignment ):

    def __init__( self, start, end, species = [] ):
        RegionAlignment.__init__( self, end - start, species )
        self.start = start
        self.end = end

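#NOTE: the following example is not part of the original module; it is a minimal
#illustrative sketch of how a GenomicRegionAlignment might be filled and read back.
#The species names ('hg18', 'mm9'), the coordinates, and the helper name are
#assumptions for illustration only.
def _example_genomic_region_alignment():
    #a 10 bp reference region shared by two species
    aln = GenomicRegionAlignment( 100, 110, species = [ 'hg18', 'mm9' ] )
    #write bases for the reference species across the whole region
    aln.set_range( 0, 'hg18', 'ACGTACGTAC' )
    #positions never written stay as gap characters ('-')
    aln.set_range( 0, 'mm9', 'ACG' )
    aln.flush()
    #return the forward sequence for one species and the reverse complement for the other
    return aln.get_sequence( 'hg18' ), aln.get_sequence_reverse_complement( 'mm9' )
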
class SplicedAlignment( object ):

    DNA_COMPLEMENT = string.maketrans( "ACGTacgt", "TGCAtgca" )

    def __init__( self, exon_starts, exon_ends, species = [] ):
        if not isinstance( exon_starts, list ):
            exon_starts = [exon_starts]
        if not isinstance( exon_ends, list ):
            exon_ends = [exon_ends]
        assert len( exon_starts ) == len( exon_ends ), "The number of exon starts does not match the number of exon ends."
        self.exons = []
        for i in range( len( exon_starts ) ):
            self.exons.append( GenomicRegionAlignment( exon_starts[i], exon_ends[i], species ) )

    #returns the names for species found in alignment, skipping names as requested
    def get_species_names( self, skip = [] ):
        if not isinstance( skip, list ): skip = [skip]
        names = []
        for exon in self.exons:
            for name in exon.get_species_names( skip = skip ):
                if name not in names:
                    names.append( name )
        return names

    #returns the sequence for a species
    def get_sequence( self, species ):
        sequence = tempfile.TemporaryFile()
        for exon in self.exons:
            if species in exon.get_species_names():
                sequence.write( exon.get_sequence( species ) )
            else:
                sequence.write( "-" * exon.size )
        sequence.seek( 0 )
        return sequence.read()

    #returns the reverse complement of the sequence for a species
    def get_sequence_reverse_complement( self, species ):
        complement = [base for base in self.get_sequence( species ).translate( self.DNA_COMPLEMENT )]
        complement.reverse()
        return "".join( complement )

    #Start and end of coding region
    @property
    def start( self ):
        return self.exons[0].start
    @property
    def end( self ):
        return self.exons[-1].end

#Open a MAF index using a UID
def maf_index_by_uid( maf_uid, index_location_file ):
    for line in open( index_location_file ):
        try:
            #read each line, if not enough fields, go to next line
            if line[0:1] == "#" : continue
            fields = line.split('\t')
            if maf_uid == fields[1]:
                try:
                    maf_files = fields[4].replace( "\n", "" ).replace( "\r", "" ).split( "," )
                    return bx.align.maf.MultiIndexed( maf_files, keep_open = True, parse_e_rows = False )
                except Exception, e:
                    raise Exception( 'MAF UID (%s) found, but configuration appears to be malformed: %s' % ( maf_uid, e ) )
        except:
            pass
    return None

#returns ( index, temp_index_filename ) for a user MAF: opens an existing index when available, otherwise builds one; temp_index_filename is None when no temporary file was created
def open_or_build_maf_index( maf_file, index_filename, species = None ):
    try:
        return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), None )
    except:
        return build_maf_index( maf_file, species = species )

#*** ANY CHANGE TO THIS METHOD HERE OR IN galaxy.datatypes.sequences MUST BE PROPAGATED ***
def build_maf_index_species_chromosomes( filename, index_species = None ):
    species = []
    species_chromosomes = {}
    indexes = bx.interval_index_file.Indexes()
    blocks = 0
    try:
        maf_reader = bx.align.maf.Reader( open( filename ) )
        while True:
            pos = maf_reader.file.tell()
            block = maf_reader.next()
            if block is None:
                break
            blocks += 1
            for c in block.components:
                spec = c.src
                chrom = None
                if "." in spec:
                    spec, chrom = spec.split( ".", 1 )
                if spec not in species:
                    species.append( spec )
                    species_chromosomes[spec] = []
                if chrom and chrom not in species_chromosomes[spec]:
                    species_chromosomes[spec].append( chrom )
                if index_species is None or spec in index_species:
                    forward_strand_start = c.forward_strand_start
                    forward_strand_end = c.forward_strand_end
                    try:
                        forward_strand_start = int( forward_strand_start )
                        forward_strand_end = int( forward_strand_end )
                    except ValueError:
                        continue #start and end are not integers, can't add component to index, goto next component
                        #this likely only occurs when parse_e_rows is True?
                        #could a species exist as only e rows? should the
                    if forward_strand_end > forward_strand_start:
                        #require positive length; i.e. certain lines have start = end = 0 and cannot be indexed
                        indexes.add( c.src, forward_strand_start, forward_strand_end, pos, max=c.src_size )
    except Exception, e:
        #most likely a bad MAF
        log.debug( 'Building MAF index on %s failed: %s' % ( filename, e ) )
        return ( None, [], {}, 0 )
    return ( indexes, species, species_chromosomes, blocks )

#builds and returns ( index, index_filename ) for specified maf_file
def build_maf_index( maf_file, species = None ):
    indexes, found_species, species_chromosomes, blocks = build_maf_index_species_chromosomes( maf_file, species )
    if indexes is not None:
        fd, index_filename = tempfile.mkstemp()
        out = os.fdopen( fd, 'w' )
        indexes.write( out )
        out.close()
        return ( bx.align.maf.Indexed( maf_file, index_filename = index_filename, keep_open = True, parse_e_rows = False ), index_filename )
    return ( None, None )

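#NOTE: the following example is not part of the original module; it is a minimal
#sketch of how the index helpers above might be combined.  The default file name
#and the helper name are assumptions for illustration only.
def _example_open_maf_index( maf_filename = 'alignment.maf' ):
    #reuse an existing index if one can be opened, otherwise build a temporary one
    index, temp_index_filename = open_or_build_maf_index( maf_filename, maf_filename + '.index' )
    if index is None:
        tool_fail( 'Unable to open or build an index for %s' % maf_filename )
    try:
        #collect the species named in the MAF while the index is available
        species = get_species_in_maf( maf_filename )
    finally:
        #remove the temporary index file, if one was created
        if temp_index_filename is not None:
            remove_temp_index_file( temp_index_filename )
    return index, species
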
def component_overlaps_region( c, region ):
    if c is None: return False
    start, end = c.get_forward_strand_start(), c.get_forward_strand_end()
    if region.start >= end or region.end <= start:
        return False
    return True

def chop_block_by_region( block, src, region, species = None, mincols = 0 ):
    # This chopping method was designed to maintain consistency with how start/end padding gaps have been working in Galaxy thus far:
    #   behavior as seen when forcing blocks to be '+' relative to src sequence (ref) and using block.slice_by_component( ref, slice_start, slice_end )
    #   whether or not this is the 'correct' behavior is questionable, but this will at least maintain consistency
    # comments welcome
    slice_start = block.text_size #max for the min()
    slice_end = 0 #min for the max()
    old_score = block.score #save old score for later use
    # We no longer assume only one occurrence of src per block, so we need to check them all
    for c in iter_components_by_src( block, src ):
        if component_overlaps_region( c, region ):
            if c.text is not None:
                rev_strand = False
                if c.strand == "-":
                    #We want our coord_to_col coordinates to be returned from positive stranded component
                    rev_strand = True
                    c = c.reverse_complement()
                start = max( region.start, c.start )
                end = min( region.end, c.end )
                start = c.coord_to_col( start )
                end = c.coord_to_col( end )
                if rev_strand:
                    #need to orient slice coordinates to the original block direction
                    slice_len = end - start
                    end = len( c.text ) - start
                    start = end - slice_len
                slice_start = min( start, slice_start )
                slice_end = max( end, slice_end )

    if slice_start < slice_end:
        block = block.slice( slice_start, slice_end )
        if block.text_size > mincols:
            # restore old score, may not be accurate, but it is better than 0 for everything?
            block.score = old_score
            if species is not None:
                block = block.limit_to_species( species )
                block.remove_all_gap_columns()
            return block
    return None

def orient_block_by_region( block, src, region, force_strand = None ):
    #loop through components matching src,
    #make sure each of these components overlaps the region,
    #cache the strand for each of the overlapping components;
    #if force_strand / region.strand is not in the strand cache, reverse complement
    ### we could have 2 sequences with the same src overlapping the region on different strands; this would cause no reverse complementing
    strands = [ c.strand for c in iter_components_by_src( block, src ) if component_overlaps_region( c, region ) ]
    if strands and ( force_strand is None and region.strand not in strands ) or ( force_strand is not None and force_strand not in strands ):
        block = block.reverse_complement()
    return block

def get_oriented_chopped_blocks_for_region( index, src, region, species = None, mincols = 0, force_strand = None ):
    for block, idx, offset in get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols, force_strand ):
        yield block
def get_oriented_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0, force_strand = None ):
    for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ):
        yield orient_block_by_region( block, src, region, force_strand ), idx, offset

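#NOTE: the following example is not part of the original module; it is a minimal
#sketch of iterating the blocks of an index that overlap a genomic region,
#chopped to that region and oriented to its strand.  The species, chromosome,
#coordinates, and the helper name are assumptions for illustration only.
def _example_iter_region_blocks( index, primary_species = 'hg18', chrom = 'chr1', start = 1000, end = 2000, strand = '+' ):
    src = src_merge( primary_species, chrom )
    #build the interval the same way fill_region_alignment() does below
    region = bx.intervals.Interval( start, end )
    region.chrom = chrom
    region.strand = strand
    for block in get_oriented_chopped_blocks_for_region( index, src, region, species = [ primary_species ] ):
        yield block
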
#split a block with multiple occurrences of src into one block per src
def iter_blocks_split_by_src( block, src ):
    for src_c in iter_components_by_src( block, src ):
        new_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) )
        new_block.text_size = block.text_size
        for c in block.components:
            if c == src_c or c.src != src:
                new_block.add_component( deepcopy( c ) ) #components have a reference to the alignment, don't want to lose the reference to the original alignment block in the original components
        yield new_block

#split a block into multiple blocks with all combinations of a species appearing only once per block
def iter_blocks_split_by_species( block, species = None ):
    def __split_components_by_species( components_by_species, new_block ):
        if components_by_species:
            #more species with components to add to this block
            components_by_species = deepcopy( components_by_species )
            spec_comps = components_by_species.pop( 0 )
            for c in spec_comps:
                newer_block = deepcopy( new_block )
                newer_block.add_component( deepcopy( c ) )
                for value in __split_components_by_species( components_by_species, newer_block ):
                    yield value
        else:
            #no more components to add, yield this block
            yield new_block

    #divide components by species
    spec_dict = {}
    if not species:
        species = []
        for c in block.components:
            spec, chrom = src_split( c.src )
            if spec not in spec_dict:
                spec_dict[ spec ] = []
                species.append( spec )
            spec_dict[ spec ].append( c )
    else:
        for spec in species:
            spec_dict[ spec ] = []
            for c in iter_components_by_src_start( block, spec ):
                spec_dict[ spec ].append( c )

    empty_block = bx.align.Alignment( score=block.score, attributes=deepcopy( block.attributes ) ) #should we copy attributes?
    empty_block.text_size = block.text_size
    #call recursive function to split into each combo of spec/blocks
    for value in __split_components_by_species( spec_dict.values(), empty_block ):
        sort_block_components_by_block( value, block ) #restore original component order
        yield value


#generator yielding only chopped and valid blocks for a specified region
def get_chopped_blocks_for_region( index, src, region, species = None, mincols = 0 ):
    for block, idx, offset in get_chopped_blocks_with_index_offset_for_region( index, src, region, species, mincols ):
        yield block
def get_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0 ):
    for block, idx, offset in index.get_as_iterator_with_index_and_offset( src, region.start, region.end ):
        block = chop_block_by_region( block, src, region, species, mincols )
        if block is not None:
            yield block, idx, offset

#returns a filled region alignment for specified regions
def get_region_alignment( index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
    if species is not None: alignment = RegionAlignment( end - start, species )
    else: alignment = RegionAlignment( end - start, primary_species )
    return fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand, species, mincols, overwrite_with_gaps )

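#NOTE: the following example is not part of the original module; it is a minimal
#sketch of extracting per-species sequences for a single reference region.  The
#species names, coordinates, and the helper name are assumptions for illustration.
def _example_region_sequences( index, primary_species = 'hg18', chrom = 'chr1', start = 1000, end = 1100, species = [ 'hg18', 'mm9' ] ):
    alignment = get_region_alignment( index, primary_species, chrom, start, end, strand = '+', species = species )
    sequences = {}
    for spec in alignment.get_species_names():
        sequences[ spec ] = alignment.get_sequence( spec )
    return sequences
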
#reduces a block to only positions existing in the src provided
def reduce_block_by_primary_genome( block, species, chromosome, region_start ):
    #returns ( start_offset, { species: text } )
    #where the texts' contents are reduced to only positions existing in the primary genome
    src = "%s.%s" % ( species, chromosome )
    ref = block.get_component_by_src( src )
    start_offset = ref.start - region_start
    species_texts = {}
    for c in block.components:
        species_texts[ c.src.split( '.' )[0] ] = list( c.text )
    #remove locations which are gaps in the primary species, starting from the downstream end
    for i in range( len( species_texts[ species ] ) - 1, -1, -1 ):
        if species_texts[ species ][i] == '-':
            for text in species_texts.values():
                text.pop( i )
    for spec, text in species_texts.items():
        species_texts[spec] = ''.join( text )
    return ( start_offset, species_texts )

#fills a region alignment
def fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
    region = bx.intervals.Interval( start, end )
    region.chrom = chrom
    region.strand = strand
    primary_src = "%s.%s" % ( primary_species, chrom )

    #Order blocks overlapping this position by score, lowest first
    blocks = []
    for block, idx, offset in index.get_as_iterator_with_index_and_offset( primary_src, start, end ):
        score = float( block.score )
        for i in range( 0, len( blocks ) ):
            if score < blocks[i][0]:
                blocks.insert( i, ( score, idx, offset ) )
                break
        else:
            blocks.append( ( score, idx, offset ) )

    #gap_chars_tuple = tuple( GAP_CHARS )
    gap_chars_str = ''.join( GAP_CHARS )
    #Loop through ordered blocks and layer by increasing score
    for block_dict in blocks:
        for block in iter_blocks_split_by_species( block_dict[1].get_at_offset( block_dict[2] ) ): #need to handle each occurrence of sequence in block separately
            if component_overlaps_region( block.get_component_by_src( primary_src ), region ):
                block = chop_block_by_region( block, primary_src, region, species, mincols ) #chop block
                block = orient_block_by_region( block, primary_src, region ) #orient block
                start_offset, species_texts = reduce_block_by_primary_genome( block, primary_species, chrom, start )
                for spec, text in species_texts.items():
                    #we should trim gaps from both sides, since these are not positions in this species' genome (sequence)
                    text = text.rstrip( gap_chars_str )
                    gap_offset = 0
                    while True in [ text.startswith( gap_char ) for gap_char in GAP_CHARS ]: #python2.4 doesn't accept a tuple for .startswith()
                    #while text.startswith( gap_chars_tuple ):
                        gap_offset += 1
                        text = text[1:]
                        if not text:
                            break
                    if text:
                        if overwrite_with_gaps:
                            alignment.set_range( start_offset + gap_offset, spec, text )
                        else:
                            for i, char in enumerate( text ):
                                if char not in GAP_CHARS:
                                    alignment.set_position( start_offset + gap_offset + i, spec, char )
    return alignment

#returns a filled spliced region alignment for specified region with start and end lists
def get_spliced_region_alignment( index, primary_species, chrom, starts, ends, strand = '+', species = None, mincols = 0, overwrite_with_gaps = True ):
    #create spliced alignment object
    if species is not None: alignment = SplicedAlignment( starts, ends, species )
    else: alignment = SplicedAlignment( starts, ends, [primary_species] )
    for exon in alignment.exons:
        fill_region_alignment( exon, index, primary_species, chrom, exon.start, exon.end, strand, species, mincols, overwrite_with_gaps )
    return alignment

#loop through string array, only return non-commented lines
def line_enumerator( lines, comment_start = '#' ):
    i = 0
    for line in lines:
        if not line.startswith( comment_start ):
            i += 1
            yield ( i, line )

#read a gene BED line, return lists of coding exon starts and ends, plus the raw fields
def get_starts_ends_fields_from_gene_bed( line ):
    #Starts and ends for exons
    starts = []
    ends = []

    fields = line.split()
    #Requires at least 12 BED columns
    if len(fields) < 12:
        raise Exception( "Not a proper 12 column BED line (%s)." % line )
    chrom     = fields[0]
    tx_start  = int( fields[1] )
    tx_end    = int( fields[2] )
    name      = fields[3]
    strand    = fields[5]
    if strand != '-': strand='+' #Default strand is +
    cds_start = int( fields[6] )
    cds_end   = int( fields[7] )

    #Calculate and store starts and ends of coding exons
    region_start, region_end = cds_start, cds_end
    exon_starts = map( int, fields[11].rstrip( ',\n' ).split( ',' ) )
    exon_starts = map( ( lambda x: x + tx_start ), exon_starts )
    exon_ends = map( int, fields[10].rstrip( ',' ).split( ',' ) )
    exon_ends = map( ( lambda x, y: x + y ), exon_starts, exon_ends )
    for start, end in zip( exon_starts, exon_ends ):
        start = max( start, region_start )
        end = min( end, region_end )
        if start < end:
            starts.append( start )
            ends.append( end )
    return ( starts, ends, fields )

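#NOTE: the following example is not part of the original module; it is a minimal
#sketch of turning one 12-column gene BED line into a spliced (coding exons only)
#alignment.  The species names and the helper name are assumptions for illustration.
def _example_spliced_alignment_from_bed( index, bed_line, primary_species = 'hg18', species = [ 'hg18', 'mm9' ] ):
    starts, ends, fields = get_starts_ends_fields_from_gene_bed( bed_line )
    chrom, strand = fields[0], fields[5]
    alignment = get_spliced_region_alignment( index, primary_species, chrom, starts, ends, strand = strand, species = species )
    #return the concatenated coding exon sequence for the primary species
    return alignment.get_sequence( primary_species )
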
def iter_components_by_src( block, src ):
    for c in block.components:
        if c.src == src:
            yield c

def get_components_by_src( block, src ):
    return [ value for value in iter_components_by_src( block, src ) ]

def iter_components_by_src_start( block, src ):
    for c in block.components:
        if c.src.startswith( src ):
            yield c

def get_components_by_src_start( block, src ):
    return [ value for value in iter_components_by_src_start( block, src ) ]

def sort_block_components_by_block( block1, block2 ):
    #orders the components in block1 by the index of the component in block2
    #block1 must be a subset of block2
    #occurs in-place
    return block1.components.sort( cmp = lambda x, y: block2.components.index( x ) - block2.components.index( y ) )

def get_species_in_maf( maf_filename ):
    species = []
    for block in bx.align.maf.Reader( open( maf_filename ) ):
        for spec in get_species_in_block( block ):
            if spec not in species:
                species.append( spec )
    return species

def parse_species_option( species ):
    if species:
        species = species.split( ',' )
        if 'None' not in species:
            return species
    return None #provided species was '', None, or had 'None' in it

def remove_temp_index_file( index_filename ):
    try: os.unlink( index_filename )
    except: pass

#Below are methods to deal with FASTA files

def get_fasta_header( component, attributes = {}, suffix = None ):
    header = ">%s(%s):%i-%i|" % ( component.src, component.strand, component.get_forward_strand_start(), component.get_forward_strand_end() )
    for key, value in attributes.iteritems():
        header = "%s%s=%s|" % ( header, key, value )
    if suffix:
        header = "%s%s" % ( header, suffix )
    else:
        header = "%s%s" % ( header, src_split( component.src )[ 0 ] )
    return header

def get_attributes_from_fasta_header( header ):
    if not header: return {}
    attributes = {}
    header = header.lstrip( '>' )
    header = header.strip()
    fields = header.split( '|' )
    try:
        region = fields[0]
        region = region.split( '(', 1 )
        temp = region[0].split( '.', 1 )
        attributes['species'] = temp[0]
        if len( temp ) == 2:
            attributes['chrom'] = temp[1]
        else:
            attributes['chrom'] = temp[0]
        region = region[1].split( ')', 1 )
        attributes['strand'] = region[0]
        region = region[1].lstrip( ':' ).split( '-' )
        attributes['start'] = int( region[0] )
        attributes['end'] = int( region[1] )
    except:
        #fields[0] is not a region coordinate
        pass
    if len( fields ) > 2:
        for i in xrange( 1, len( fields ) - 1 ):
            prop = fields[i].split( '=', 1 )
            if len( prop ) == 2:
                attributes[ prop[0] ] = prop[1]
    if len( fields ) > 1:
        attributes['__suffix__'] = fields[-1]
    return attributes

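#NOTE: the following example is not part of the original module; it is a minimal
#sketch of writing a MAF component as a FASTA record and recovering the region
#attributes from its header.  The helper name and the extra attribute are
#assumptions for illustration only.
def _example_write_fasta_component( component, out_file ):
    header = get_fasta_header( component, attributes = { 'score': '0' } )
    out_file.write( "%s\n%s\n" % ( header, component.text ) )
    #the same header can later be parsed back into a dictionary of attributes
    return get_attributes_from_fasta_header( header )
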
def iter_fasta_alignment( filename ):
    class fastaComponent:
        def __init__( self, species, text = "" ):
            self.species = species
            self.text = text
        def extend( self, text ):
            self.text = self.text + text.replace( '\n', '' ).replace( '\r', '' ).strip()
    #yields a list of fastaComponents for a FASTA file
    f = open( filename, 'rb' )
    components = []
    #cur_component = None
    while True:
        line = f.readline()
        if not line:
            if components:
                yield components
            return
        line = line.strip()
        if not line:
            if components:
                yield components
            components = []
        elif line.startswith( '>' ):
            attributes = get_attributes_from_fasta_header( line )
            components.append( fastaComponent( attributes['species'] ) )
        elif components:
            components[-1].extend( line )

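#NOTE: the following example is not part of the original module; it is a minimal
#sketch of reading the aligned FASTA blocks produced by the tools above and
#reporting the species present in each block.  The helper name is an assumption.
def _example_species_per_fasta_block( fasta_filename ):
    for components in iter_fasta_alignment( fasta_filename ):
        yield [ c.species for c in components ]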