root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/align/core.py

リビジョン 3, 16.6 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""
2Classes that represent alignments between multiple sequences.
3"""
4
5import random
6import string
7import weakref
8from bx.misc.readlengths import read_lengths_file
9
10# DNA reverse complement table
11## DNA_COMP = "                                             -                  " \
12##            " TVGH  CD  M KN   YSA BWXR       tvgh  cd  m kn   ysa bwxr      " \
13##            "                                                                " \
14##            "                                                                "
15
16DNA_COMP = string.maketrans( "ACGTacgt", "TGCAtgca" )
17
18class Alignment( object ):
19
20    def __init__( self, score=0, attributes={}, species_to_lengths=None ):
21        # species_to_lengths is needed only for file formats that don't provide
22        # chromosome lengths;  it maps each species name to one of these:
23        #   - the name of a file that contains a list of chromosome length pairs
24        #   - a dict mapping chromosome names to their length
25        #   - a single length value (useful when we just have one sequence and no chromosomes)
26        # internally a file name is replaced by a dict, but only on an "as
27        # needed" basis
28        self.score = score
29        self.text_size = 0
30        self.attributes = attributes
31        if species_to_lengths == None: self.species_to_lengths = {}
32        else: self.species_to_lengths = species_to_lengths
33        self.components = []
34
35    def add_component( self, component ):
36        component._alignment = weakref.ref( self )
37        self.components.append( component )
38        if component.text is not None:
39            if self.text_size == 0:
40                self.text_size = len( component.text )
41            elif self.text_size != len( component.text ):
42                raise Exception( "Components must have same text length" )
43
44    def get_score( self ):
45        return self.__score
46    def set_score( self,score ):
47        if type( score ) == str:
48            try:
49                score = int(score)
50            except:
51                try:
52                    score = float(score)
53                except:
54                    pass
55        self.__score = score
56    score = property( fget=get_score,fset=set_score )
57
58    def __str__( self ):
59        s = "a score=" + str( self.score )
60        for key in self.attributes:
61            s += " %s=%s" % ( key, self.attributes[key] )
62        s += "\n"
63        # Components
64        for c in self.components:
65            s += str( c )
66            s += "\n"
67        return s
68
69    def src_size( self, src ):
70        species,chrom = src_split( src )
71        if species in self.species_to_lengths:
72            chrom_to_length = self.species_to_lengths[species]
73        elif chrom in self.species_to_lengths:
74            chrom_to_length = self.species_to_lengths
75        else:
76            raise "no src_size (no length file for %s)" % species
77        if type( chrom_to_length ) == int:         # (if it's a single length)
78            return chrom_to_length
79        if type( chrom_to_length ) == type( "" ):  # (if it's a file name)
80            chrom_to_length = read_lengths_file( chrom_to_length )
81            self.species_to_lengths[species] = chrom_to_length
82        if chrom not in chrom_to_length: raise "no src_size (%s has no length for %s)" % ( species, chrom )
83        return chrom_to_length[chrom]
84
85    def get_component_by_src( self, src ):
86        for c in self.components:
87            if c.src == src: return c
88        return None
89
90    def get_component_by_src_start( self, src ):
91        for c in self.components:
92            if c.src.startswith( src ): return c
93        return None   
94
95    def slice( self, start, end ):
96        new = Alignment( score=self.score, attributes=self.attributes )
97        for component in self.components:
98            # FIXME: Is this the right solution?
99            if component.empty:
100                continue
101            new.components.append( component.slice( start, end ) )
102        new.text_size = end - start
103        return new
104   
105    def reverse_complement( self ):
106        new = Alignment( score=self.score, attributes=self.attributes )
107        for component in self.components:
108            new.components.append( component.reverse_complement() )
109        new.text_size = self.text_size
110        return new
111   
112    def slice_by_component( self, component_index, start, end ):
113        """
114        Return a slice of the alignment, corresponding to an coordinate interval in a specific component.
115
116        component_index is one of
117            an integer offset into the components list
118            a string indicating the src of the desired component
119            a component
120
121        start and end are relative to the + strand, regardless of the component's strand.
122
123        """
124        if type( component_index ) == type( 0 ):
125            ref = self.components[ component_index ]
126        elif type( component_index ) == type( "" ):
127            ref = self.get_component_by_src( component_index )
128        elif type( component_index ) == Component:
129            ref = component_index
130        else:
131            raise ValueError( "can't figure out what to do" )
132        start_col = ref.coord_to_col( start ) 
133        end_col = ref.coord_to_col( end ) 
134        if (ref.strand == '-'):
135            (start_col,end_col) = (end_col,start_col)
136        return self.slice( start_col, end_col )
137       
138    def column_iter( self ):
139        for i in range( self.text_size ):
140            yield [ c.text[i] for c in self.components ]
141
142    def limit_to_species( self, species ):
143        new = Alignment( score=self.score, attributes=self.attributes )
144        new.text_size = self.text_size
145        for component in self.components:
146            if component.src.split('.')[0] in species:
147                new.add_component( component )
148        return new
149
150    def remove_all_gap_columns( self ):
151        """
152        Remove any columns containing only gaps from alignment components,
153        text of components is modified IN PLACE.
154        """
155        seqs = []
156        for c in self.components:
157            try:
158                seqs.append( list( c.text ) )
159            except TypeError:
160                seqs.append( None )
161        i = 0
162        text_size = self.text_size
163        while i < text_size:
164            all_gap = True
165            for seq in seqs:
166                if seq is None: continue
167                if seq[i] != '-': all_gap = False
168            if all_gap:
169                for seq in seqs:
170                    if seq is None: continue
171                    del seq[i]
172                text_size -= 1
173            else:
174                i += 1
175        for i in range( len( self.components ) ):
176            if seqs[i] is None: continue
177            self.components[i].text = ''.join( seqs[i] )
178        self.text_size = text_size
179       
180    def __eq__( self, other ):
181        if other is None or type( other ) != type( self ):
182            return False
183        if self.score != other.score:
184            return False
185        if self.attributes != other.attributes:
186            return False
187        if len( self.components ) != len( other.components ):
188            return False
189        for c1, c2 in zip( self.components, other.components ):
190            if c1 != c2:
191                return False
192        return True
193       
194    def __ne__( self, other ):
195        return not( self.__eq__( other ) )
196   
197    def __deepcopy__( self, memo ):
198        from copy import deepcopy
199        new = Alignment( score=self.score, attributes=deepcopy( self.attributes ), species_to_lengths=deepcopy( self.species_to_lengths ) )
200        for component in self.components:
201            new.add_component( deepcopy( component ) )
202        return new
203   
204class Component( object ):
205
206    def __init__( self, src='', start=0, size=0, strand=None, src_size=None, text='' ):
207        self._alignment = None
208        self.src = src
209        self.start = start          # Nota Bene:  start,size,strand are as they
210        self.size = size            # .. appear in a MAF file-- origin-zero, end
211        self.strand = strand        # .. excluded, and minus strand counts from
212        self._src_size = src_size   # .. end of sequence
213        self.text = text
214        self.quality = None
215        # Optional fields to keep track of synteny status (only makes sense
216        # when the alignment is part of an ordered set)
217        self.synteny_left = None
218        self.synteny_right = None
219        self.synteny_empty = None
220        # If true, this component actually represents a non-aligning region,
221        # and has no text.
222        self.empty = False
223        # Index maps a coordinate (distance along + strand from + start) to alignment column
224        self.index = None
225
226    def __str__( self ):
227        if self.empty:
228            rval = "e %s %d %d %s %d %s" % ( self.src, self.start,
229                                             self.size, self.strand,
230                                             self.src_size, self.synteny_empty )
231        else:
232            rval = "s %s %d %d %s %d %s" % ( self.src, self.start,
233                                             self.size, self.strand,
234                                             self.src_size, self.text )
235            if self.synteny_left and self.synteny_right:
236                rval += "\ni %s %s %d %s %d" % ( self.src,
237                                                 self.synteny_left[0], self.synteny_left[1],
238                                                 self.synteny_right[0], self.synteny_right[1] )
239        return rval
240
241    def get_end( self ):
242        return self.start + self.size
243    end = property( fget=get_end )
244
245    def get_src_size( self ):
246        if self._src_size == None:
247            if self._alignment == None: raise "component has no src_size"
248            self._src_size = self._alignment().src_size( self.src )
249        return self._src_size
250    def set_src_size( self,src_size ):
251        self._src_size = src_size
252    src_size = property( fget=get_src_size, fset=set_src_size )
253
254    def get_forward_strand_start( self ):
255        if self.strand == '-': return self.src_size - self.end
256        else: return self.start
257    forward_strand_start = property( fget=get_forward_strand_start )
258       
259    def get_forward_strand_end( self ):
260        if self.strand == '-': return self.src_size - self.start
261        else: return self.end
262    forward_strand_end = property( fget=get_forward_strand_end)
263
264    def reverse_complement( self ):
265        start = self.src_size - self.end
266        if self.strand == "+": strand = "-"
267        else: strand = "+"
268        comp = [ch for ch in self.text.translate(DNA_COMP)]
269        comp.reverse()
270        text = "".join(comp)
271        new = Component( self.src, start, self.size, strand, self._src_size, text )
272        new._alignment = self._alignment
273        return new
274
275    def slice( self, start, end ):
276        new = Component( src=self.src, start=self.start, strand=self.strand, src_size=self._src_size )
277        new._alignment = self._alignment
278        new.text = self.text[start:end]
279
280        #for i in range( 0, start ):
281        #    if self.text[i] != '-': new.start += 1
282        #for c in new.text:
283        #    if c != '-': new.size += 1
284        new.start += start - self.text.count( '-', 0, start )
285        new.size = len( new.text ) - new.text.count( '-' )
286
287        # FIXME: This annotation probably means nothing after slicing if
288        # one of the ends changes. In general the 'i' rows of a MAF only
289        # make sense in context (relative to the previous and next alignments
290        # in a stream, slicing breaks that).
291        new.synteny_left = self.synteny_left
292        new.synteny_right = self.synteny_right
293
294        return new
295
296    def slice_by_coord( self, start, end ):
297        """
298        Return the slice of the component corresponding to a coordinate interval.
299
300        start and end are relative to the + strand, regardless of the component's strand.
301
302        """
303        start_col = self.coord_to_col( start ) 
304        end_col = self.coord_to_col( end ) 
305        if (self.strand == '-'):
306            (start_col,end_col) = (end_col,start_col)
307        return self.slice( start_col, end_col )
308   
309    def coord_to_col( self, pos ):
310        """
311        Return the alignment column index corresponding to coordinate pos.
312
313        pos is relative to the + strand, regardless of the component's strand.
314
315        """
316        start,end = self.get_forward_strand_start(),self.get_forward_strand_end()
317        if pos < start or pos > end:
318            raise "Range error: %d not in %d-%d" % ( pos, start, end )
319        if not self.index:
320            self.index = list()
321            if (self.strand == '-'):
322                # nota bene: for - strand self.index[x] maps to one column
323                # higher than is actually associated with the position;  thus
324                # when slice_by_component() and slice_by_coord() flip the ends,
325                # the resulting slice is correct
326                for x in range( len(self.text)-1,-1,-1 ):
327                    if not self.text[x] == '-':
328                        self.index.append( x + 1 )
329                self.index.append( 0 )
330            else:
331                for x in range( len(self.text) ):
332                    if not self.text[x] == '-':
333                        self.index.append(x)
334                self.index.append( len(self.text) )
335        x = None
336        try:
337            x = self.index[ pos - start ]
338        except:
339            raise "Error in index."
340        return x
341   
342   
343    def __eq__( self, other ):
344        if other is None or type( other ) != type( self ):
345            return False
346        return ( self.src == other.src
347                 and self.start == other.start
348                 and self.size == other.size           
349                 and self.strand == other.strand       
350                 and self._src_size == other._src_size   
351                 and self.text == other.text
352                 and self.synteny_left == other.synteny_left
353                 and self.synteny_right == other.synteny_right
354                 and self.synteny_empty == other.synteny_empty
355                 and self.empty == other.empty )
356       
357    def __ne__( self, other ):
358        return not( self.__eq__( other ) )
359   
360    def __deepcopy__( self, memo ):
361        new = Component( src=self.src, start=self.start, size=self.size, strand=self.strand, src_size=self._src_size, text=self.text )
362        new._alignment = self._alignment
363        new.quality = self.quality
364        new.synteny_left = self.synteny_left
365        new.synteny_right = self.synteny_right
366        new.synteny_empty = self.synteny_empty
367        new.empty = self.empty
368        new.index = self.index
369        return new
370
371def get_reader( format, infile, species_to_lengths=None ):
372    import bx.align.maf, bx.align.axt, bx.align.lav
373    if format == "maf": return bx.align.maf.Reader( infile, species_to_lengths )
374    elif format == "axt": return bx.align.axt.Reader( infile, species_to_lengths )
375    elif format == "lav": return bx.align.lav.Reader( infile )
376    else: raise "Unknown alignment format %s" % format
377
378def get_writer( format, outfile, attributes={} ):
379    import bx.align.maf, bx.align.axt, bx.align.lav
380    if format == "maf": return bx.align.maf.Writer( outfile, attributes )
381    elif format == "axt": return bx.align.axt.Writer( outfile, attributes )
382    elif format == "lav": return bx.align.lav.Writer( outfile, attributes )
383    else: raise "Unknown alignment format %s" % format
384
385def get_indexed( format, filename, index_filename=None, keep_open=False, species_to_lengths=None ):
386    import bx.align.maf, bx.align.axt, bx.align.lav
387    if format == "maf": return bx.align.maf.Indexed( filename, index_filename, keep_open, species_to_lengths )
388    elif format == "axt": return bx.align.axt.Indexed( filename, index_filename, keep_open, species_to_lengths )
389    elif format == "lav": raise "LAV support for Indexed has not been implemented"
390    else: raise "Unknown alignment format %s" % format
391
392def shuffle_columns( a ):
393    """Randomize the columns of an alignment"""
394    mask = range( a.text_size )
395    random.shuffle( mask )
396    for c in a.components:
397        c.text = ''.join( [ c.text[i] for i in mask ] )
398
399def src_split( src ): # splits src into species,chrom
400    dot = src.rfind( "." )
401    if dot == -1: return None,src
402    else:         return src[:dot],src[dot+1:]
403
404def src_merge( species,chrom,contig=None ): # creates src (inverse of src_split)
405    if species == None: src = chrom
406    else:               src = species + "." + chrom
407    if contig != None: src += "[%s]" % contig
408    return src
409
410# ---- Read C extension if available ---------------------------------------
411
412try:
413    from _core import coord_to_col
414except:
415    def coord_to_col( start, text, pos ):
416        col = 0
417        while start < pos:
418            if text[col] != '-':
419                start += 1
420            col += 1
421        return col
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。