1 | """ |
---|
2 | Support for parsing phylogenetic tree's in newick format. |
---|
3 | |
---|
4 | TODO: Tree/Edge should be a generic data structure, not newick specific. |
---|
5 | """ |
---|
6 | |
---|
7 | from bx_extras.pyparsing import * |
---|
8 | |
---|
9 | __all__ = [ "Tree", "Edge", "NewickParser", "newick_parser" ] |
---|
10 | |
---|
11 | def indent( s ): |
---|
12 | return "\n".join( " " + line for line in s.split( "\n" ) ) |
---|
13 | |
---|
14 | def print_( p, s ): |
---|
15 | print p, type(s), s |
---|
16 | return s |
---|
17 | |
---|
18 | class Tree( object ): |
---|
19 | def __init__( self, label, edges=None ): |
---|
20 | self.label = label |
---|
21 | self.edges = edges |
---|
22 | def pretty( self ): |
---|
23 | if self.edges: |
---|
24 | return "Tree( '%s',\n%s\n)" % ( self.label, indent( "\n".join( repr( edge ) for edge in self.edges ) ) ) |
---|
25 | else: |
---|
26 | return "Tree( '%s' )" % self.label |
---|
27 | def __cmp__( self, other ): |
---|
28 | if isinstance( other, Tree ): |
---|
29 | return cmp( self.__dict__, other.__dict__ ) |
---|
30 | else: |
---|
31 | return 1 |
---|
32 | def __repr__( self ): |
---|
33 | return "Tree( %s, %s )" % ( repr( self.label ), repr( self.edges ) ) |
---|
34 | |
---|
35 | class Edge( object ): |
---|
36 | def __init__( self, length, tip ): |
---|
37 | self.length = length |
---|
38 | self.tip = tip |
---|
39 | def pretty( self ): |
---|
40 | return "Edge( %s, \n%s\n)" % ( repr( self.length ), indent( repr( self.tip ) ) ) |
---|
41 | def __cmp__( self, other ): |
---|
42 | if isinstance( other, Edge ): |
---|
43 | return cmp( self.__dict__, other.__dict__ ) |
---|
44 | else: |
---|
45 | return 1 |
---|
46 | def __repr__( self ): |
---|
47 | return "Edge( %s, %s )" % ( repr( self.length ), repr( self.tip ) ) |
---|
48 | |
---|
49 | def create_parser(): |
---|
50 | """ |
---|
51 | Create a 'pyparsing' parser for newick format trees roughly based on the |
---|
52 | grammer here: |
---|
53 | http://evolution.genetics.washington.edu/phylip/newick_doc.html |
---|
54 | |
---|
55 | Problems: |
---|
56 | - Is a single leaf a valid tree? |
---|
57 | - Branch length on root? Doesn't make sense to me, and forces the root |
---|
58 | to be an edge. |
---|
59 | """ |
---|
60 | # Basic tokens |
---|
61 | real = Combine( Word( "+-" + nums, nums ) + |
---|
62 | Optional( "." + Optional( Word( nums ) ) ) + |
---|
63 | Optional( CaselessLiteral( "E" ) + Word( "+-" + nums, nums ) ) ) |
---|
64 | lpar = Suppress( "(" ) |
---|
65 | rpar = Suppress( ")" ) |
---|
66 | colon = Suppress( ":" ) |
---|
67 | semi = Suppress( ";" ) |
---|
68 | quot = Suppress( "'" ) |
---|
69 | # Labels are either unquoted or single quoted, if unquoted underscores will be replaced with spaces |
---|
70 | quoted_label = QuotedString( "'", None, "''" ).setParseAction( lambda s, l, t: t[0] ) |
---|
71 | simple_label = Word( alphas + nums + "_." ).setParseAction( lambda s, l, t: t[0].replace( "_", " " ) ) |
---|
72 | label = quoted_label | simple_label |
---|
73 | # Branch length is a real number (note though that exponents are not in the spec!) |
---|
74 | branch_length = real.setParseAction( lambda s, l, t: float( t[0] ) ) |
---|
75 | # Need to forward declare this due to circularity |
---|
76 | node_list = Forward() |
---|
77 | # A node might have an list of edges (for a subtree), a label, and/or a branch length |
---|
78 | node = ( Optional( node_list, None ) + Optional( label, "" ) + Optional( colon + branch_length, None ) ) \ |
---|
79 | .setParseAction( lambda s, l, t: Edge( t[2], Tree( t[1] or None, t[0] ) ) ) |
---|
80 | node_list << ( lpar + delimitedList( node ) + rpar ) \ |
---|
81 | .setParseAction( lambda s, l, t: [ t.asList() ] ) |
---|
82 | # The root cannot have a branch length |
---|
83 | tree = ( node_list + Optional( label, "" ) + semi )\ |
---|
84 | .setParseAction( lambda s, l, t: Tree( t[1] or None, t[0] ) ) |
---|
85 | # Return the outermost element |
---|
86 | return tree |
---|
87 | |
---|
88 | class NewickParser( object ): |
---|
89 | """ |
---|
90 | Class wrapping a parser for building Trees from newick format strings |
---|
91 | """ |
---|
92 | def __init__( self ): |
---|
93 | self.parser = create_parser() |
---|
94 | def parse_string( self, s ): |
---|
95 | return self.parser.parseString( s )[0] |
---|
96 | |
---|
97 | newick_parser = NewickParser() |
---|