[3] | 1 | """ |
---|
| 2 | Support for parsing phylogenetic tree's in newick format. |
---|
| 3 | |
---|
| 4 | TODO: Tree/Edge should be a generic data structure, not newick specific. |
---|
| 5 | """ |
---|
| 6 | |
---|
| 7 | from bx_extras.pyparsing import * |
---|
| 8 | |
---|
| 9 | __all__ = [ "Tree", "Edge", "NewickParser", "newick_parser" ] |
---|
| 10 | |
---|
| 11 | def indent( s ): |
---|
| 12 | return "\n".join( " " + line for line in s.split( "\n" ) ) |
---|
| 13 | |
---|
| 14 | def print_( p, s ): |
---|
| 15 | print p, type(s), s |
---|
| 16 | return s |
---|
| 17 | |
---|
| 18 | class Tree( object ): |
---|
| 19 | def __init__( self, label, edges=None ): |
---|
| 20 | self.label = label |
---|
| 21 | self.edges = edges |
---|
| 22 | def pretty( self ): |
---|
| 23 | if self.edges: |
---|
| 24 | return "Tree( '%s',\n%s\n)" % ( self.label, indent( "\n".join( repr( edge ) for edge in self.edges ) ) ) |
---|
| 25 | else: |
---|
| 26 | return "Tree( '%s' )" % self.label |
---|
| 27 | def __cmp__( self, other ): |
---|
| 28 | if isinstance( other, Tree ): |
---|
| 29 | return cmp( self.__dict__, other.__dict__ ) |
---|
| 30 | else: |
---|
| 31 | return 1 |
---|
| 32 | def __repr__( self ): |
---|
| 33 | return "Tree( %s, %s )" % ( repr( self.label ), repr( self.edges ) ) |
---|
| 34 | |
---|
| 35 | class Edge( object ): |
---|
| 36 | def __init__( self, length, tip ): |
---|
| 37 | self.length = length |
---|
| 38 | self.tip = tip |
---|
| 39 | def pretty( self ): |
---|
| 40 | return "Edge( %s, \n%s\n)" % ( repr( self.length ), indent( repr( self.tip ) ) ) |
---|
| 41 | def __cmp__( self, other ): |
---|
| 42 | if isinstance( other, Edge ): |
---|
| 43 | return cmp( self.__dict__, other.__dict__ ) |
---|
| 44 | else: |
---|
| 45 | return 1 |
---|
| 46 | def __repr__( self ): |
---|
| 47 | return "Edge( %s, %s )" % ( repr( self.length ), repr( self.tip ) ) |
---|
| 48 | |
---|
| 49 | def create_parser(): |
---|
| 50 | """ |
---|
| 51 | Create a 'pyparsing' parser for newick format trees roughly based on the |
---|
| 52 | grammer here: |
---|
| 53 | http://evolution.genetics.washington.edu/phylip/newick_doc.html |
---|
| 54 | |
---|
| 55 | Problems: |
---|
| 56 | - Is a single leaf a valid tree? |
---|
| 57 | - Branch length on root? Doesn't make sense to me, and forces the root |
---|
| 58 | to be an edge. |
---|
| 59 | """ |
---|
| 60 | # Basic tokens |
---|
| 61 | real = Combine( Word( "+-" + nums, nums ) + |
---|
| 62 | Optional( "." + Optional( Word( nums ) ) ) + |
---|
| 63 | Optional( CaselessLiteral( "E" ) + Word( "+-" + nums, nums ) ) ) |
---|
| 64 | lpar = Suppress( "(" ) |
---|
| 65 | rpar = Suppress( ")" ) |
---|
| 66 | colon = Suppress( ":" ) |
---|
| 67 | semi = Suppress( ";" ) |
---|
| 68 | quot = Suppress( "'" ) |
---|
| 69 | # Labels are either unquoted or single quoted, if unquoted underscores will be replaced with spaces |
---|
| 70 | quoted_label = QuotedString( "'", None, "''" ).setParseAction( lambda s, l, t: t[0] ) |
---|
| 71 | simple_label = Word( alphas + nums + "_." ).setParseAction( lambda s, l, t: t[0].replace( "_", " " ) ) |
---|
| 72 | label = quoted_label | simple_label |
---|
| 73 | # Branch length is a real number (note though that exponents are not in the spec!) |
---|
| 74 | branch_length = real.setParseAction( lambda s, l, t: float( t[0] ) ) |
---|
| 75 | # Need to forward declare this due to circularity |
---|
| 76 | node_list = Forward() |
---|
| 77 | # A node might have an list of edges (for a subtree), a label, and/or a branch length |
---|
| 78 | node = ( Optional( node_list, None ) + Optional( label, "" ) + Optional( colon + branch_length, None ) ) \ |
---|
| 79 | .setParseAction( lambda s, l, t: Edge( t[2], Tree( t[1] or None, t[0] ) ) ) |
---|
| 80 | node_list << ( lpar + delimitedList( node ) + rpar ) \ |
---|
| 81 | .setParseAction( lambda s, l, t: [ t.asList() ] ) |
---|
| 82 | # The root cannot have a branch length |
---|
| 83 | tree = ( node_list + Optional( label, "" ) + semi )\ |
---|
| 84 | .setParseAction( lambda s, l, t: Tree( t[1] or None, t[0] ) ) |
---|
| 85 | # Return the outermost element |
---|
| 86 | return tree |
---|
| 87 | |
---|
| 88 | class NewickParser( object ): |
---|
| 89 | """ |
---|
| 90 | Class wrapping a parser for building Trees from newick format strings |
---|
| 91 | """ |
---|
| 92 | def __init__( self ): |
---|
| 93 | self.parser = create_parser() |
---|
| 94 | def parse_string( self, s ): |
---|
| 95 | return self.parser.parseString( s )[0] |
---|
| 96 | |
---|
| 97 | newick_parser = NewickParser() |
---|