1 | """ |
---|
2 | Classes for generating random sets of intervals over larger regions. |
---|
3 | """ |
---|
4 | |
---|
5 | from bx.bitset import * |
---|
6 | import bisect |
---|
7 | random = __import__( 'random' ) |
---|
8 | |
---|
9 | class MaxtriesException( Exception ): |
---|
10 | pass |
---|
11 | |
---|
12 | def throw_random_list( lengths, mask, allow_overlap=False ): |
---|
13 | rval = [] |
---|
14 | throw_random_gap_list( lengths, mask, lambda s, e: rval.append( ( s, e ) ), allow_overlap ) |
---|
15 | assert sum( b - a for a, b in rval ) == sum( lengths ) |
---|
16 | return rval |
---|
17 | |
---|
18 | def throw_random_bits( lengths, mask, allow_overlap=False ): |
---|
19 | rval = BitSet( mask.size ) |
---|
20 | throw_random_gap_list( lengths, mask, lambda s, e: rval.set_range( s, e - s ), allow_overlap ) |
---|
21 | if not allow_overlap: |
---|
22 | assert rval.count_range( 0, rval.size ) == sum( lengths ) |
---|
23 | return rval |
---|
24 | |
---|
25 | def throw_random_gap_list( lengths, mask, save_interval_func, allow_overlap=False ): |
---|
26 | """ |
---|
27 | Generates a set of non-overlapping random intervals from a length |
---|
28 | distribution. |
---|
29 | |
---|
30 | `lengths`: list containing the length of each interval to be generated. |
---|
31 | We expect this to be sorted by decreasing length to minimize |
---|
32 | the chance of failure (MaxtriesException) and for some |
---|
33 | performance gains when allow_overlap==True and there are |
---|
34 | duplicate lengths |
---|
35 | `mask`: a BitSet in which set bits represent regions not to place |
---|
36 | intervals. The size of the region is also determined from the |
---|
37 | mask. |
---|
38 | """ |
---|
39 | # Use mask to find the gaps; gaps is a list of (length,start,end) |
---|
40 | lengths = [length for length in lengths if length > 0] |
---|
41 | min_length = min( lengths ) |
---|
42 | gaps = [] |
---|
43 | start = end = 0 |
---|
44 | while 1: |
---|
45 | start = mask.next_clear( end ) |
---|
46 | if start == mask.size: break |
---|
47 | end = mask.next_set( start ) |
---|
48 | if end-start >= min_length: |
---|
49 | gaps.append( ( end-start, start, None ) ) |
---|
50 | # Sort (long regions first) |
---|
51 | gaps.sort() |
---|
52 | gaps.reverse() |
---|
53 | # Throw |
---|
54 | throw_random_private( lengths, gaps, save_interval_func, allow_overlap, three_args=False ) |
---|
55 | |
---|
56 | def throw_random_intervals( lengths, regions, save_interval_func=None, allow_overlap=False ): |
---|
57 | """ |
---|
58 | Generates a set of non-overlapping random intervals from a length |
---|
59 | distribution. |
---|
60 | |
---|
61 | `lengths`: list containing the length of each interval to be generated. |
---|
62 | We expect this to be sorted by decreasing length to minimize |
---|
63 | the chance of failure (MaxtriesException) and for some |
---|
64 | performance gains when allow_overlap==True and there are |
---|
65 | duplicate lengths. |
---|
66 | `regions`: A list of regions in which intervals can be placed. Elements |
---|
67 | are tuples or lists of the form (start, end, ...), where ... |
---|
68 | indicates any number of items (including zero). |
---|
69 | `save_interval_func`: A function accepting three arguments which will be |
---|
70 | passed the (start,stop,region) for each generated |
---|
71 | interval, where region is an entry in the regions |
---|
72 | list. If this is None, the generated intervals will |
---|
73 | be returned as a list of elements copied from the |
---|
74 | region with start and end modified. |
---|
75 | """ |
---|
76 | # Copy regions |
---|
77 | regions = [( x[1]-x[0], x[0], x ) for x in regions] |
---|
78 | # Sort (long regions first) |
---|
79 | regions.sort() |
---|
80 | regions.reverse() |
---|
81 | # Throw |
---|
82 | if (save_interval_func != None): |
---|
83 | throw_random_private( lengths, regions, save_interval_func, allow_overlap ) |
---|
84 | return |
---|
85 | else: |
---|
86 | intervals = [] |
---|
87 | save_interval_func = lambda s, e, rgn: intervals.append( overwrite_start_end ( s, e, rgn ) ) |
---|
88 | throw_random_private( lengths, regions, save_interval_func, allow_overlap ) |
---|
89 | return intervals |
---|
90 | |
---|
91 | def overwrite_start_end(s,e,rgn): |
---|
92 | rgn = list(rgn) |
---|
93 | rgn[0] = s |
---|
94 | rgn[1] = e |
---|
95 | return tuple(rgn) |
---|
96 | |
---|
97 | |
---|
98 | def throw_random_private( lengths, regions, save_interval_func, allow_overlap=False, three_args=True ): |
---|
99 | """ |
---|
100 | (Internal function; we expect calls only through the interface functions |
---|
101 | above) |
---|
102 | |
---|
103 | `lengths`: A list containing the length of each interval to be generated. |
---|
104 | `regions`: A list of regions in which intervals can be placed, sorted by |
---|
105 | decreasing length. Elements are triples of the form (length, |
---|
106 | start, extra), This list CAN BE MODIFIED by this function. |
---|
107 | `save_interval_func`: A function accepting three arguments which will be |
---|
108 | passed the (start,stop,extra) for each generated |
---|
109 | interval. |
---|
110 | """ |
---|
111 | |
---|
112 | # Implementation: |
---|
113 | # We keep a list of the regions, sorted from largest to smallest. We then |
---|
114 | # place each length by following steps: |
---|
115 | # (1) construct a candidate counts array (cc array) |
---|
116 | # (2) choose a candidate at random |
---|
117 | # (3) find region containing that candidate |
---|
118 | # (4) map candidate to position in that region |
---|
119 | # (5) split region if not allowing overlaps |
---|
120 | # (6) report placed segment |
---|
121 | # |
---|
122 | # The cc array is only constructed if there's a change (different length |
---|
123 | # to place, or the region list has changed). It contains, for each |
---|
124 | # region, the total number of number of candidate positions in regions |
---|
125 | # *preceding* it in the region list: |
---|
126 | # cc[i] = sum over k in 0..(i-1) of length[i] - L + 1 |
---|
127 | # where N is the number of regions and L is the length being thrown. |
---|
128 | # At the same time, we determine the total number of candidates (the total |
---|
129 | # number of places the current length can be placed) and the index range |
---|
130 | # of regions into which the length will fit. |
---|
131 | # |
---|
132 | # example: |
---|
133 | # for L = 20 |
---|
134 | # i = 0 1 2 3 4 5 6 7 8 9 |
---|
135 | # length[i] = 96 66 56 50 48 40 29 17 11 8 |
---|
136 | # cc[i] = 0 77 124 161 192 221 242 X X X |
---|
137 | # candidates = 252 |
---|
138 | # lo_rgn = 0 |
---|
139 | # hi_rgn = 6 |
---|
140 | # |
---|
141 | # The candidate is chosen in (0..candidates-1). The candidate counts |
---|
142 | # array allows us to do a binary search to locate the region that holds that |
---|
143 | # candidate. Continuing the example above, we choose a random candidate |
---|
144 | # s in (0..251). If s happens to be in (124..160), it will be mapped to |
---|
145 | # region 2 at start position s-124. |
---|
146 | # |
---|
147 | # During the binary search, if we are looking at region 3, if s < cc[3] |
---|
148 | # then the desired region is region 2 or lower. Otherwise it is region 3 or |
---|
149 | # higher. |
---|
150 | |
---|
151 | min_length = min( lengths ) |
---|
152 | prev_length = None # (force initial cc array construction) |
---|
153 | cc = [0] * (len( regions ) + len(lengths) - 1) |
---|
154 | num_thrown = 0 |
---|
155 | for length in lengths: |
---|
156 | # construct cc array (only needed if length has changed or region list has |
---|
157 | # changed) |
---|
158 | if length != prev_length: |
---|
159 | prev_length = length |
---|
160 | assert len( cc ) >= len( regions ) |
---|
161 | candidates = 0 |
---|
162 | hi_rgn = 0 |
---|
163 | for region in regions: |
---|
164 | rgn_len = region[0] |
---|
165 | if rgn_len < length: |
---|
166 | break |
---|
167 | cc[hi_rgn] = candidates |
---|
168 | candidates += rgn_len - length + 1 |
---|
169 | hi_rgn += 1 |
---|
170 | if candidates == 0: |
---|
171 | raise MaxtriesException( "No region can fit an interval of length %d (we threw %d of %d)" \ |
---|
172 | % ( length, num_thrown,len( lengths ) ) ) |
---|
173 | hi_rgn -= 1 |
---|
174 | # Select a candidate |
---|
175 | s = random.randrange( candidates ) |
---|
176 | #.. |
---|
177 | #..for ix in range( len( regions ) ): |
---|
178 | #.. region = regions[ix] |
---|
179 | #.. if ix <= hi_rgn: print "%2s: %5s %5s %5s" % ( ix, region[1], region[0], cc[ix] ) |
---|
180 | #.. else: print "%2s: %5s %5s %5s" % ( ix, region[1], region[0], "X" ) |
---|
181 | #..print "s = %s (of %s candidates)" % ( s, candidates ) |
---|
182 | # Locate region containing that candidate, by binary search |
---|
183 | lo = 0 |
---|
184 | hi = hi_rgn |
---|
185 | while hi > lo: |
---|
186 | mid = (lo + hi + 1) / 2 # (we round up to prevent infinite loop) |
---|
187 | if s < cc[mid]: hi = mid-1 # (s < num candidates from 0..mid-1) |
---|
188 | else: lo = mid # (s >= num candidates from 0..mid-1) |
---|
189 | s -= cc[lo] |
---|
190 | # If we are not allowing overlaps we will remove the placed interval |
---|
191 | # from the region list |
---|
192 | if allow_overlap: |
---|
193 | rgn_length, rgn_start, rgn_extra = regions[lo] |
---|
194 | else: |
---|
195 | # Remove the chosen region and split |
---|
196 | rgn_length, rgn_start, rgn_extra = regions.pop( lo ) |
---|
197 | rgn_end = rgn_start + rgn_length |
---|
198 | assert s >= 0 |
---|
199 | assert rgn_start + s + length <= rgn_end, "Expected: %d + %d + %d == %d <= %d" % ( rgn_start, s, length, rgn_start + s + length, rgn_end ) |
---|
200 | regions.reverse() |
---|
201 | if s >= min_length: |
---|
202 | bisect.insort( regions, ( s, rgn_start, rgn_extra ) ) |
---|
203 | if s + length <= rgn_length - min_length: |
---|
204 | bisect.insort( regions, ( rgn_length - ( s + length ), rgn_start + s + length, rgn_extra ) ) |
---|
205 | regions.reverse() |
---|
206 | prev_length = None # (force cc array construction) |
---|
207 | # Save the new interval |
---|
208 | if (three_args): |
---|
209 | save_interval_func( rgn_start + s, rgn_start + s + length, rgn_extra ) |
---|
210 | else: |
---|
211 | save_interval_func( rgn_start + s, rgn_start + s + length ) |
---|
212 | num_thrown += 1 |
---|