root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/intervals/operations/find_clusters.py

リビジョン 3, 5.8 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""
2Find clusters of intervals within a set of intervals.  A cluster is a
3group (of size minregions) of intervals within a specific distance (of
4mincols) of each other.
5
6Returns Cluster objects, which have a chrom, start, end, and lines (a
7list of linenumbers from the original file).  The original can then be
8ran through with the linenumbers to extract clustered regions without
9disturbing original order, or the clusters may themselves be written
10as intervals.
11"""
12
13import random
14import math
15
16import traceback
17import fileinput
18from warnings import warn
19
20from bx.intervals.cluster import *
21from bx.intervals.io import *
22from bx.intervals.operations import *
23
24
25def find_clusters(reader, mincols=1, minregions=2):
26    extra = dict()
27    chroms = dict()
28    linenum = -1
29    for interval in reader:
30        linenum += 1
31        if not type( interval ) is GenomicInterval:
32            extra[linenum] = interval
33        else:
34            if interval.chrom not in chroms:
35                chroms[interval.chrom] = ClusterTree( mincols, minregions )
36            try:
37                chroms[interval.chrom].insert( interval.start, interval.end, linenum )
38            except OverflowError, e:
39                try:
40                    # This will work only if reader is a NiceReaderWrapper
41                    reader.skipped += 1
42                    if reader.skipped < 10:
43                        reader.skipped_lines.append( ( reader.linenum, reader.current_line, str( e ) ) )
44                except:
45                    pass
46                continue
47    return chroms, extra
48
49
50### DEPRECATED: Use the ClusterTree in bx.intervals.cluster for this.
51### It does the same thing, but is a C implementation.
52class ClusterNode( object ):
53    def __init__( self, start, end, linenum, mincols, minregions ):
54        # Python lacks the binomial distribution, so we convert a
55        # uniform into a binomial because it naturally scales with
56        # tree size.  Also, python's uniform is perfect since the
57        # upper limit is not inclusive, which gives us undefined here.
58        self.priority = math.ceil( (-1.0 / math.log(.5)) * math.log( -1.0 / (random.uniform(0,1) - 1)))
59        self.start = start
60        self.end = end
61        self.left = None
62        self.right = None
63        self.lines = [linenum]
64        self.mincols = mincols
65        self.minregions = minregions
66       
67    def insert( self, start, end, linenum ):
68        if start - self.mincols > self.end:
69            # insert to right tree
70            if self.right:
71                self.right = self.right.insert( start, end, linenum )
72            else:
73                self.right = ClusterNode(start, end, linenum, self.mincols, self.minregions)
74            # rebalance tree
75            if self.priority < self.right.priority:
76                return self.rotateleft()
77        elif end + self.mincols < self.start:
78            # insert to left tree
79            if self.left:
80                self.left = self.left.insert( start, end, linenum )
81            else:
82                self.left = ClusterNode(start, end, linenum, self.mincols, self.minregions)
83            # rebalance tree
84            if self.priority < self.left.priority:
85                return self.rotateright()
86        else:
87            # insert here
88            self.start = min(self.start, start)
89            self.end = max(self.end, end)
90            self.lines.append(linenum)
91            # recursive call to push nodes up
92            if self.left:
93                self.left = self.left.push_up(self)
94            if self.right:
95                self.right = self.right.push_up(self)
96        return self
97
98    def rotateright( self ):
99        root = self.left
100        self.left = self.left.right
101        root.right = self
102        return root
103       
104    def rotateleft( self ):
105        root = self.right
106        self.right = self.right.left
107        root.left = self
108        return root
109       
110    def push_up( self, topnode ):
111        # Note: this function does not affect heap property
112        # Distance method removed for inline, faster?
113        distance = max(self.start, topnode.start) - min(self.end, topnode.end)
114        if distance <= self.mincols:
115            topnode.start = min(self.start, topnode.start)
116            topnode.end = max(self.end, topnode.end)
117            for linenum in self.lines:
118                topnode.lines.append(linenum)
119            if self.right:
120                return self.right.push_up( topnode )
121            if self.left:
122                return self.left.push_up( topnode )
123            return None
124        if self.end < topnode.start and self.right:
125            self.right = self.right.push_up( topnode )
126        if self.start > topnode.end and self.left:
127            self.left = self.left.push_up( topnode )
128        return self
129
130    def getintervals( self ):
131        if self.left:
132            for start, end in self.left.getintervals(minregions):
133                yield start, end
134        if len(self.lines) >= minregions:
135            yield self.start, self.end
136        if self.right:
137            for start, end in self.right.getintervals(minregions):
138                yield start, end
139
140    def getlines( self ):
141        if self.left:
142            for line in self.left.getlines():
143                yield line
144        if len(self.lines) >= minregions:
145            for line in self.lines:
146                yield line
147        if self.right:
148            for line in self.right.getlines():
149                yield line
150               
151## def main():
152##     f1 = fileinput.FileInput("big.bed")
153##     g1 = GenomicIntervalReader(f1)
154##     returntree, extra = find_clusters(g1, mincols=50)
155##     print "All found"
156##     for chrom, value in returntree.items():
157##         for start, end in value.getregions():
158##             print chrom+"\t"+str(start)+"\t"+str(end)
159##         for line in value.getlines():
160##             print "Line:\t"+str(line)
161
162## main()
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。