root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/bx/intervals/random.py @ 3

リビジョン 3, 9.2 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""
2Classes for generating random sets of intervals over larger regions.
3"""
4
5from bx.bitset import *
6import bisect
7random = __import__( 'random' )
8
9class MaxtriesException( Exception ):
10    pass
11
12def throw_random_list( lengths, mask, allow_overlap=False ):
13    rval = []
14    throw_random_gap_list( lengths, mask, lambda s, e: rval.append( ( s, e ) ), allow_overlap )
15    assert sum( b - a for a, b in rval ) == sum( lengths )
16    return rval
17
18def throw_random_bits( lengths, mask, allow_overlap=False ):
19    rval = BitSet( mask.size )
20    throw_random_gap_list( lengths, mask, lambda s, e: rval.set_range( s, e - s ), allow_overlap )
21    if not allow_overlap:
22        assert rval.count_range( 0, rval.size ) == sum( lengths )
23    return rval
24
25def throw_random_gap_list( lengths, mask, save_interval_func, allow_overlap=False ):
26    """
27    Generates a set of non-overlapping random intervals from a length
28    distribution.
29   
30    `lengths`: list containing the length of each interval to be generated.
31               We expect this to be sorted by decreasing length to minimize
32               the chance of failure (MaxtriesException) and for some
33               performance gains when allow_overlap==True and there are
34               duplicate lengths
35    `mask`: a BitSet in which set bits represent regions not to place
36            intervals. The size of the region is also determined from the
37            mask.
38    """
39    # Use mask to find the gaps;  gaps is a list of (length,start,end)
40    lengths = [length for length in lengths if length > 0]
41    min_length = min( lengths )
42    gaps = []
43    start = end = 0
44    while 1:
45        start = mask.next_clear( end )
46        if start == mask.size: break
47        end = mask.next_set( start )
48        if end-start >= min_length:
49            gaps.append( ( end-start, start, None ) )
50    # Sort (long regions first)   
51    gaps.sort()
52    gaps.reverse()
53    # Throw
54    throw_random_private( lengths, gaps, save_interval_func, allow_overlap, three_args=False )
55
56def throw_random_intervals( lengths, regions, save_interval_func=None, allow_overlap=False ):
57    """
58    Generates a set of non-overlapping random intervals from a length
59    distribution.
60   
61    `lengths`: list containing the length of each interval to be generated.
62               We expect this to be sorted by decreasing length to minimize
63               the chance of failure (MaxtriesException) and for some
64               performance gains when allow_overlap==True and there are
65               duplicate lengths.
66    `regions`: A list of regions in which intervals can be placed.  Elements
67               are tuples or lists of the form (start, end, ...), where ...
68               indicates any number of items (including zero).
69    `save_interval_func`: A function accepting three arguments which will be
70                          passed the (start,stop,region) for each generated
71                          interval, where region is an entry in the regions
72                          list.  If this is None, the generated intervals will
73                          be returned as a list of elements copied from the
74                          region with start and end modified.
75    """
76    # Copy regions
77    regions = [( x[1]-x[0], x[0], x ) for x in regions]
78    # Sort (long regions first)   
79    regions.sort()
80    regions.reverse()
81    # Throw
82    if (save_interval_func != None):
83        throw_random_private( lengths, regions, save_interval_func, allow_overlap )
84        return
85    else:
86        intervals = []
87        save_interval_func = lambda s, e, rgn: intervals.append( overwrite_start_end ( s, e, rgn ) )
88        throw_random_private( lengths, regions, save_interval_func, allow_overlap )
89        return intervals
90
91def overwrite_start_end(s,e,rgn):
92    rgn = list(rgn)
93    rgn[0] = s
94    rgn[1] = e
95    return tuple(rgn)
96
97
98def throw_random_private( lengths, regions, save_interval_func, allow_overlap=False, three_args=True ):
99    """
100    (Internal function;  we expect calls only through the interface functions
101    above)
102   
103    `lengths`: A list containing the length of each interval to be generated.
104    `regions`: A list of regions in which intervals can be placed, sorted by
105               decreasing length.  Elements are triples of the form (length,
106               start, extra), This list CAN BE MODIFIED by this function.
107    `save_interval_func`: A function accepting three arguments which will be
108                          passed the (start,stop,extra) for each generated
109                          interval.
110    """
111
112    # Implementation:
113    #   We keep a list of the regions, sorted from largest to smallest.  We then
114    #   place each length by following steps:
115    #     (1) construct a candidate counts array (cc array)
116    #     (2) choose a candidate at random
117    #     (3) find region containing that candidate
118    #     (4) map candidate to position in that region
119    #     (5) split region if not allowing overlaps
120    #     (6) report placed segment
121    #
122    #   The cc array is only constructed if there's a change (different length
123    #   to place, or the region list has changed).  It contains, for each
124    #   region, the total number of number of candidate positions in regions
125    #   *preceding* it in the region list:
126    #     cc[i] = sum over k in 0..(i-1) of length[i] - L + 1
127    #   where N is the number of regions and L is the length being thrown.
128    #   At the same time, we determine the total number of candidates (the total
129    #   number of places the current length can be placed) and the index range
130    #   of regions into which the length will fit.
131    #
132    #   example:
133    #     for L = 20
134    #     i =           0   1   2   3   4   5   6   7   8   9
135    #     length[i] =  96  66  56  50  48  40  29  17  11   8
136    #     cc[i] =       0  77 124 161 192 221 242   X   X   X
137    #     candidates = 252
138    #     lo_rgn = 0
139    #     hi_rgn = 6
140    #
141    #   The candidate is chosen in (0..candidates-1).  The candidate counts
142    #   array allows us to do a binary search to locate the region that holds that
143    #   candidate.  Continuing the example above, we choose a random candidate
144    #   s in (0..251).  If s happens to be in (124..160), it will be mapped to
145    #   region 2 at start position s-124.
146    #
147    #   During the binary search, if we are looking at region 3, if s < cc[3]
148    #   then the desired region is region 2 or lower.  Otherwise it is region 3 or
149    #   higher.
150
151    min_length = min( lengths )
152    prev_length = None # (force initial cc array construction)
153    cc = [0] * (len( regions ) + len(lengths) - 1)
154    num_thrown = 0
155    for length in lengths:
156        # construct cc array (only needed if length has changed or region list has
157        # changed)
158        if length != prev_length:
159            prev_length = length
160            assert len( cc ) >= len( regions )
161            candidates = 0
162            hi_rgn = 0
163            for region in regions:
164                rgn_len = region[0]
165                if rgn_len < length:
166                    break
167                cc[hi_rgn] = candidates
168                candidates += rgn_len - length + 1
169                hi_rgn += 1
170            if candidates == 0:
171                raise MaxtriesException( "No region can fit an interval of length %d (we threw %d of %d)" \
172                                       % ( length, num_thrown,len( lengths ) ) )
173            hi_rgn -= 1
174        # Select a candidate
175        s = random.randrange( candidates )
176        #..
177        #..for ix in range( len( regions ) ):
178        #..    region = regions[ix]
179        #..    if ix <= hi_rgn: print "%2s: %5s %5s %5s" % ( ix, region[1], region[0], cc[ix] )
180        #..    else:            print "%2s: %5s %5s %5s" % ( ix, region[1], region[0], "X" )
181        #..print "s = %s (of %s candidates)" % ( s, candidates )
182        # Locate region containing that candidate, by binary search
183        lo = 0
184        hi = hi_rgn
185        while hi > lo:
186            mid = (lo + hi + 1) / 2     # (we round up to prevent infinite loop)
187            if s < cc[mid]: hi = mid-1  # (s <  num candidates from 0..mid-1)
188            else:           lo = mid    # (s >= num candidates from 0..mid-1)
189        s -= cc[lo]
190        # If we are not allowing overlaps we will remove the placed interval
191        # from the region list
192        if allow_overlap:
193            rgn_length, rgn_start, rgn_extra = regions[lo]
194        else:
195            # Remove the chosen region and split
196            rgn_length, rgn_start, rgn_extra = regions.pop( lo )
197            rgn_end = rgn_start + rgn_length
198            assert s >= 0
199            assert rgn_start + s + length <= rgn_end, "Expected: %d + %d + %d == %d <= %d" % ( rgn_start, s, length, rgn_start + s + length, rgn_end )
200            regions.reverse()
201            if s >= min_length:
202                bisect.insort( regions, ( s, rgn_start, rgn_extra ) )
203            if s + length <= rgn_length - min_length:
204                bisect.insort( regions, ( rgn_length - ( s + length ), rgn_start + s + length, rgn_extra ) )
205            regions.reverse()
206            prev_length = None # (force cc array construction)
207        # Save the new interval
208        if (three_args):
209            save_interval_func( rgn_start + s, rgn_start + s + length, rgn_extra )
210        else:
211            save_interval_func( rgn_start + s, rgn_start + s + length )
212        num_thrown += 1
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。