root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/align/axt.py @ 3

リビジョン 3, 7.2 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""
2Support for reading and writing the `AXT`_ format used for pairwise
3alignments.
4
5.. _AXT: http://genome.ucsc.edu/goldenPath/help/axt.html
6"""
7
8from bx.align import *
9
10import itertools
11from bx import interval_index_file
12
13# Tools for dealing with pairwise alignments in AXT format
14
15class MultiIndexed( object ):
16    """Similar to 'indexed' but wraps more than one axt_file"""
17    def __init__( self, axt_filenames, keep_open=False ):
18        self.indexes = [ Indexed( axt_file, axt_file + ".index" ) for axt_file in axt_filenames ]
19    def get( self, src, start, end ):
20        blocks = []
21        for index in self.indexes: blocks += index.get( src, start, end )
22        return blocks
23
24
25class Indexed( object ):
26    """Indexed access to a axt using overlap queries, requires an index file"""
27
28    def __init__( self, axt_filename, index_filename=None, keep_open=False, species1 = None, species2=None, species_to_lengths=None, support_ids=False ):
29        if index_filename is None: index_filename = axt_filename + ".index"
30        self.indexes = interval_index_file.Indexes( filename=index_filename )
31        self.axt_filename = axt_filename
32        # nota bene: (self.species1 = species1 or "species1") is incorrect if species1=""
33        self.species1 = species1
34        if (self.species1 == None): self.species1 = "species1"
35        self.species2 = species2
36        if (self.species2 == None): self.species2 = "species2"
37        self.species_to_lengths = species_to_lengths
38        self.support_ids        = support_ids            # for extra text at end of axt header lines
39        if keep_open:
40            self.f = open( axt_filename )
41        else:
42            self.f = None
43
44    def get( self, src, start, end ):
45        intersections = self.indexes.find( src, start, end )
46        return itertools.imap( self.get_axt_at_offset, [ val for start, end, val in intersections ] )
47
48    def get_axt_at_offset( self, offset ):
49        if self.f:
50            self.f.seek( offset )
51            return read_next_axt( self.f, self.species1, self.species2, self.species_to_lengths, self.support_ids )
52        else:
53            f = open( self.axt_filename )
54            try:
55                f.seek( offset )
56                return read_next_axt( f, self.species1, self.species2, self.species_to_lengths, self.support_ids )
57            finally:
58                f.close()
59
60class Reader( object ):
61    """Iterate over all axt blocks in a file in order"""
62   
63    def __init__( self, file, species1 = None, species2=None, species_to_lengths=None, support_ids=False ):
64        self.file = file
65        # nota bene: (self.species1 = species1 or "species1") is incorrect if species1=""
66        self.species1 = species1
67        if (self.species1 == None): self.species1 = "species1"
68        self.species2 = species2
69        if (self.species2 == None): self.species2 = "species2"
70        self.species_to_lengths = species_to_lengths
71        self.support_ids        = support_ids            # for extra text at end of axt header lines
72        self.attributes = {}
73
74    def next( self ):
75        return read_next_axt( self.file, self.species1, self.species2, self.species_to_lengths, self.support_ids )
76
77    def __iter__( self ):
78        return ReaderIter( self )
79
80    def close( self ):
81        self.file.close()
82
83class ReaderIter( object ):
84    def __init__( self, reader ):
85        self.reader = reader
86    def __iter__( self ):
87        return self
88    def next( self ):
89        v = self.reader.next()
90        if not v: raise StopIteration
91        return v
92
93class Writer( object ):
94
95    def __init__( self, file, attributes={} ):
96        self.file = file
97        self.block = 0
98        self.src_split = True
99        if ("src_split" in attributes):
100            self.src_split = attributes["src_split"]
101
102    def write( self, alignment ):
103        if (len(alignment.components) != 2):
104            raise "%d-component alignment is not compatible with axt" % \
105                   len(alignment.components)
106        c1 = alignment.components[0]
107        c2 = alignment.components[1]
108
109        if c1.strand != "+":
110            c1 = c1.reverse_complement()
111            c2 = c2.reverse_complement()
112
113        if (self.src_split):
114            spec1,chr1 = src_split( c1.src )
115            spec2,chr2 = src_split( c2.src )
116        else:
117            chr1,chr2 = c1.src,c2.src
118
119        self.file.write( "%d %s %d %d %s %d %d %s %s\n" % \
120              (self.block,chr1,c1.start+1,c1.start+c1.size,
121               chr2,c2.start+1,c2.start+c2.size,c2.strand,
122               alignment.score))
123        self.file.write( "%s\n" % c1.text )
124        self.file.write( "%s\n" % c2.text )
125        self.file.write( "\n" )
126        self.block += 1
127
128    def close( self ):
129        self.file.close()
130
131# ---- Helper methods ---------------------------------------------------------
132
133# typical axt block:
134#   0 chr19 3001012 3001075 chr11 70568380 70568443 - 3500 [optional text]
135#   TCAGCTCATAAATCACCTCCTGCCACAAGCCTGGCCTGGTCCCAGGAGAGTGTCCAGGCTCAGA
136#   TCTGTTCATAAACCACCTGCCATGACAAGCCTGGCCTGTTCCCAAGACAATGTCCAGGCTCAGA
137# start and stop are origin-1, inclusive
138# first species is always on plus strand
139# when second species is on minus strand, start and stop are counted from sequence end
140
141def read_next_axt( file, species1, species2, species_to_lengths=None, support_ids=False ):
142    line = readline( file, skip_blank=True )
143    if not line: return
144    fields = line.split()
145    if (len(fields) < 9) or ((not support_ids) and (len(fields) > 9)):
146        raise "bad axt-block header: %s" % line
147    attributes = {}
148    if (len(fields) > 9):
149        attributes["id"] = "_".join(fields[9:])
150    seq1 = readline( file )
151    if not line or line.isspace(): raise "incomplete axt-block; header: %s" % line
152    seq2 = readline( file )
153    if not line or line.isspace(): raise "incomplete axt-block; header: %s" % line
154    # Build 2 component alignment
155    alignment = Alignment(attributes=attributes,species_to_lengths=species_to_lengths)
156    # Build component for species 1
157    component = Component()
158    component.src = fields[1]
159    if (species1 != ""): component.src = species1 + "." + component.src
160    component.start = int( fields[2] ) - 1    # (axt intervals are origin-1
161    end = int( fields[3] )                    #  and inclusive on both ends)
162    component.size = end - component.start
163    component.strand = "+"
164    component.text = seq1.strip()
165    alignment.add_component( component )
166    # Build component for species 2
167    component = Component()
168    component.src = fields[4]
169    if (species2 != ""): component.src = species2 + "." + component.src
170    component.start = int( fields[5] ) - 1
171    end = int( fields[6] )
172    component.size = end - component.start
173    component.strand = fields[7]
174    component.text = seq2.strip()
175    alignment.add_component( component )
176    # add score
177    try:
178        alignment.score = int( fields[8] )
179    except:
180        try:
181            alignment.score = float( fields[8] )
182        except:
183            alignment.score = fields[8]
184    return alignment
185
186def readline( file, skip_blank=False ):
187    """Read a line from provided file, skipping any blank or comment lines"""
188    while 1:
189        line = file.readline()
190        if not line: return None
191        if line[0] != '#' and not ( skip_blank and line.isspace() ):
192            return line
193
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。