root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/intervals/operations/join.py @ 3

リビジョン 3, 4.7 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""
2Join two sets of intervals using their overlap as the key.  The
3intervals MUST be sorted by chrom(lexicographically),
4start(arithmetically) and end(arithmetically).  This works by simply
5walking through the inputs in O(n) time.
6"""
7
8import psyco_full
9
10import math
11import traceback
12import fileinput
13from warnings import warn
14
15from bx.intervals.io import *
16from bx.intervals.operations import *
17from quicksect import IntervalTree
18
19def join(leftSet, rightSet, mincols=1, leftfill=True, rightfill=True):
20    # Read rightSet into memory:
21    rightlen = 0
22    leftlen = 0
23    rightTree = IntervalTree()
24    for item in rightSet:
25        if type( item ) is GenomicInterval:
26            rightTree.insert( item, rightSet.linenum, item.fields )
27            if rightlen == 0: rightlen = item.nfields
28
29    for interval in leftSet:
30        if leftlen == 0 and type( interval ) is GenomicInterval:
31            leftlen = interval.nfields
32        if not (type( interval ) is GenomicInterval):
33            yield interval
34        else:
35            result = []
36            rightTree.intersect( interval, lambda node: result.append( node ) )
37            overlap_not_met = 0
38            for item in result:
39                if item.start in range(interval.start,interval.end+1) and item.end not in range(interval.start,interval.end+1):
40                    overlap = interval.end-item.start
41                elif item.end in range(interval.start,interval.end+1) and item.start not in range(interval.start,interval.end+1):
42                    overlap = item.end-interval.start
43                elif item.start in range(interval.start,interval.end+1) and item.end in range(interval.start,interval.end+1):
44                    overlap = item.end-item.start
45                else:   #the intersecting item's start and end are outside the interval range
46                    overlap = interval.end-interval.start
47                if overlap < mincols:
48                    overlap_not_met += 1
49                    continue
50                outfields = list(interval)
51                map(outfields.append, item.other)
52                setattr( item, "visited", True )
53                yield outfields
54            if (len(result) == 0 or overlap_not_met == len(result)) and rightfill:
55                outfields = list(interval)
56                for x in range(rightlen): outfields.append(".")
57                yield outfields
58
59    if leftfill:
60        def report_unvisited( node, results ):
61            if not hasattr(node, "visited"):
62                results.append( node )
63        results = []
64        rightTree.traverse( lambda x: report_unvisited( x, results ) )
65        for item in results:
66            outfields = list()
67            for x in range(leftlen): outfields.append(".")
68            map(outfields.append, item.other)
69            yield outfields
70
71
72def interval_cmp(a, b):
73    interval1 = a[0]
74    interval2 = b[0]
75    if not (type( interval1 ) is GenomicInterval and type( interval2 ) is GenomicInterval):
76        return 0
77    # Both are intervals
78    if interval1.chrom == interval2.chrom:
79        center1 = interval1.start + ((interval1.end - interval1.start) / 2)
80        center2 = interval2.start + ((interval2.end - interval2.start) / 2)
81        return center1 - center2
82    else:
83        if interval1.chrom > interval2.chrom:
84            return 1
85        else:
86            return -1
87
88    return 0
89
90def findintersect(interval, sortedlist, mincols):
91    # find range of intervals that intersect via a binary search
92    # find lower bound
93    x = len(sortedlist) / 2
94    n = int(math.pow(2,math.ceil(math.log(len(sortedlist),2))))
95
96    not_found = True
97    not_done = True
98    while not_found and not_done:
99        n = n / 2
100        if n == 0:
101            n = 1
102            not_done = False
103        if x >= len(sortedlist):
104            x -= n
105        elif x < 0:
106            x += n
107        else:
108            if findoverlap(sortedlist[x][0], interval) >= mincols:
109                not_found = False
110            else:
111                comp = interval_cmp(sortedlist[x], [interval, 0])
112                if comp > 0:
113                    x -= n
114                else:
115                    x += n
116
117    print "\t".join(sortedlist[x][0].fields)
118    print "not_found = " + str(not_found)
119    if not_found:
120        return 0,-1
121
122    lowerbound = x
123    middlebound = x
124    upperbound = x
125    while (lowerbound > -1) and (findoverlap(sortedlist[lowerbound-1][0],interval) >= mincols):
126        lowerbound -= 1
127    while (upperbound+1 < len(sortedlist)) and (findoverlap(sortedlist[upperbound+1][0],interval) >= mincols):
128        upperbound += 1
129
130    return lowerbound, upperbound
131
132def findoverlap(a, b):
133    # overlapping
134    if a.chrom == b.chrom:
135        return min(a.end, b.end) - max(a.start, b.start)
136    else:
137        return 0
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。