root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/align/maf.py

リビジョン 3, 8.2 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""
2Support for the `MAF`_ multiple sequence alignment format used by `multiz`_.
3
4.. _MAF: http://genome.ucsc.edu/FAQ/FAQformat.html#format5
5.. _multiz: http://www.bx.psu.edu/miller_lab/
6"""
7
8from bx.align import *
9
10from StringIO import StringIO
11import os
12
13import itertools
14from bx import interval_index_file
15
16from bx.misc.seekbzip2 import SeekableBzip2File
17
18MAF_INVERSE_STATUS = 'V'
19MAF_INSERT_STATUS = 'I'
20MAF_CONTIG_STATUS = 'C'
21MAF_CONTIG_NESTED_STATUS = 'c'
22MAF_NEW_STATUS = 'N'
23MAF_NEW_NESTED_STATUS = 'n'
24MAF_MAYBE_NEW_STATUS = 'S'
25MAF_MAYBE_NEW_NESTED_STATUS = 's'
26MAF_MISSING_STATUS = 'M'
27
28class MAFIndexedAccess( interval_index_file.AbstractIndexedAccess ):
29    """
30    Indexed access to a MAF file.
31    """
32    def read_at_current_offset( self, file, **kwargs ):
33        """
34        Read the MAF block at the current position in `file` and return an
35        instance of `Alignment`.
36        """
37        return read_next_maf( file, **kwargs )
38
39class MAFMultiIndexedAccess( interval_index_file.AbstractMultiIndexedAccess ):
40    """
41    Indexed access to multiple MAF files.
42    """
43    indexed_access_class = MAFIndexedAccess
44     
45Indexed = MAFIndexedAccess
46"""Deprecated: `MAFIndexedAccess` is also available under the name `Indexed`."""
47
48MultiIndexed = MAFMultiIndexedAccess
49"""Deprecated: `MAFMultiIndexedAccess` is also available under the name `MultiIndexed`."""
50
51class Reader( object ):
52    """
53    Iterate over all maf blocks in a file in order
54    """
55    def __init__( self, file, **kwargs ):
56        self.file = file
57        self.maf_kwargs = kwargs
58        # Read and verify maf header, store any attributes
59        fields = self.file.readline().split()
60        if fields[0] != '##maf': raise "File does not have MAF header"
61        self.attributes = parse_attributes( fields[1:] )
62
63    def next( self ):
64        return read_next_maf( self.file, **self.maf_kwargs )
65
66    def __iter__( self ):
67        return ReaderIter( self )
68
69    def close( self ):
70        self.file.close()
71
72class ReaderIter( object ):
73    """
74    Adapts a `Reader` to the iterator protocol.
75    """
76    def __init__( self, reader ):
77        self.reader = reader
78    def __iter__( self ):
79        return self
80    def next( self ):
81        v = self.reader.next()
82        if not v: raise StopIteration
83        return v
84
85class Writer( object ):
86
87    def __init__( self, file, attributes={} ):
88        self.file = file
89        # Write header, Webb's maf code wants version first, we accomodate
90        if not attributes.has_key('version'): attributes['version'] = 1
91        self.file.write( "##maf version=%s" % attributes['version'] )
92        for key in attributes:
93            if key == 'version': continue
94            self.file.writelines( " %s=%s" % ( key, attributes[key] ) )
95        self.file.write( "\n" )
96
97    def write( self, alignment ):
98        self.file.write( "a score=" + str( alignment.score ) )
99        for key in alignment.attributes:
100            self.file.write( " %s=%s" % ( key, alignment.attributes[key] ) )
101        self.file.write( "\n" )
102        # Components
103        rows = []
104        for c in alignment.components:
105            # "Empty component" generates an 'e' row
106            if c.empty:
107                rows.append( ( "e", c.src, str( c.start ), str( c.size ), c.strand, str( c.src_size ), c.synteny_empty ) )
108                continue
109            # Regular component
110            rows.append( ( "s", c.src, str( c.start ), str( c.size ), c.strand, str( c.src_size ), c.text ) )
111            # If component has quality, write a q row
112            if c.quality is not None:
113                rows.append( ( "q", c.src, "", "", "", "", c.quality ) )
114            # If component has synteny follow up with an 'i' row
115            if c.synteny_left and c.synteny_right:
116                rows.append( ( "i", c.src, "", "", "", "", " ".join( map( str, c.synteny_left + c.synteny_right ) ) ) )
117        self.file.write( format_tabular( rows, "llrrrrl" ) )
118        self.file.write( "\n" )
119
120    def close( self ):
121        self.file.close()
122
123# ---- Helper methods -------------------------------------------------------
124
125def from_string( string, **kwargs ):
126    return read_next_maf( StringIO( string ), **kwargs )
127
128def read_next_maf( file, species_to_lengths=None, parse_e_rows=False ):
129    """
130    Read the next MAF block from `file` and return as an `Alignment`
131    instance. If `parse_i_rows` is true, empty components will be created
132    when e rows are encountered.
133    """
134    alignment = Alignment(species_to_lengths=species_to_lengths)
135    # Attributes line
136    line = readline( file, skip_blank=True )
137    if not line: return None
138    fields = line.split()
139    if fields[0] != 'a': raise "Expected 'a ...' line"
140    alignment.attributes = parse_attributes( fields[1:] )
141    if 'score' in alignment.attributes:
142        alignment.score = alignment.attributes['score']
143        del alignment.attributes['score']
144    else:
145        alignment.score = 0
146    # Sequence lines
147    last_component = None
148    while 1:
149        line = readline( file )
150        # EOF or Blank line terminates alignment components
151        if not line or line.isspace(): break
152        if line.isspace(): break
153        # Parse row
154        fields = line.split()
155        if fields[0] == 's':
156            # An 's' row contains sequence for a component
157            component = Component()
158            component.src = fields[1]
159            component.start = int( fields[2] )
160            component.size = int( fields[3] )
161            component.strand = fields[4]
162            component.src_size = int( fields[5] )
163            if len(fields) > 6: component.text = fields[6].strip()
164            # Add to set
165            alignment.add_component( component )
166            last_component = component
167        elif fields[0] == 'e':
168            # An 'e' row, when no bases align for a given species this tells
169            # us something about the synteny
170            if parse_e_rows:
171                component = Component()
172                component.empty = True
173                component.src = fields[1]
174                component.start = int( fields[2] )
175                component.size = int( fields[3] )
176                component.strand = fields[4]
177                component.src_size = int( fields[5] )
178                component.text = None
179                synteny = fields[6].strip()
180                assert len( synteny ) == 1, \
181                    "Synteny status in 'e' rows should be denoted with a single character code"
182                component.synteny_empty = synteny
183                alignment.add_component( component )
184                last_component = component
185        elif fields[0] == 'i':
186            # An 'i' row, indicates left and right synteny status for the
187            # previous component, we hope ;)
188            assert fields[1] == last_component.src, "'i' row does not follow matching 's' row"
189            last_component.synteny_left = ( fields[2], int( fields[3] ) )
190            last_component.synteny_right = ( fields[4], int( fields[5] ) )
191        elif fields[0] == 'q':
192            assert fields[1] == last_component.src, "'q' row does not follow matching 's' row"
193            # TODO: Should convert this to an integer array?
194            last_component.quality = fields[2]
195           
196    return alignment
197
198def readline( file, skip_blank=False ):
199    """Read a line from provided file, skipping any blank or comment lines"""
200    while 1:
201        line = file.readline()
202        #print "every line: %r" % line
203        if not line: return None
204        if line[0] != '#' and not ( skip_blank and line.isspace() ):
205            return line
206
207def parse_attributes( fields ):
208    """Parse list of key=value strings into a dict"""
209    attributes = {}
210    for field in fields:
211        pair = field.split( '=' )
212        attributes[ pair[0] ] = pair[1]
213    return attributes
214
215def format_tabular( rows, align=None ):
216    if len( rows ) == 0: return ""
217    lengths = [ len( col ) for col in rows[ 0 ] ]
218    for row in rows[1:]:
219        for i in range( 0, len( row ) ):
220            lengths[ i ] = max( lengths[ i ], len( row[ i ] ) )
221    rval = ""
222    for row in rows:
223        for i in range( 0, len( row ) ):
224            if align and align[ i ] == "l":
225                rval += row[ i ].ljust( lengths[ i ] )
226            else:
227                rval += row[ i ].rjust( lengths[ i ] )
228            rval += " "
229        rval += "\n"
230    return rval
231       
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。