| 1 | """ |
|---|
| 2 | Classes for generating random sets of intervals over larger regions. |
|---|
| 3 | """ |
|---|
| 4 | |
|---|
| 5 | from bx.bitset import * |
|---|
| 6 | import bisect |
|---|
| 7 | random = __import__( 'random' ) |
|---|
| 8 | |
|---|
| 9 | class MaxtriesException( Exception ): |
|---|
| 10 | pass |
|---|
| 11 | |
|---|
| 12 | def throw_random_list( lengths, mask, allow_overlap=False ): |
|---|
| 13 | rval = [] |
|---|
| 14 | throw_random_gap_list( lengths, mask, lambda s, e: rval.append( ( s, e ) ), allow_overlap ) |
|---|
| 15 | assert sum( b - a for a, b in rval ) == sum( lengths ) |
|---|
| 16 | return rval |
|---|
| 17 | |
|---|
| 18 | def throw_random_bits( lengths, mask, allow_overlap=False ): |
|---|
| 19 | rval = BitSet( mask.size ) |
|---|
| 20 | throw_random_gap_list( lengths, mask, lambda s, e: rval.set_range( s, e - s ), allow_overlap ) |
|---|
| 21 | if not allow_overlap: |
|---|
| 22 | assert rval.count_range( 0, rval.size ) == sum( lengths ) |
|---|
| 23 | return rval |
|---|
| 24 | |
|---|
| 25 | def throw_random_gap_list( lengths, mask, save_interval_func, allow_overlap=False ): |
|---|
| 26 | """ |
|---|
| 27 | Generates a set of non-overlapping random intervals from a length |
|---|
| 28 | distribution. |
|---|
| 29 | |
|---|
| 30 | `lengths`: list containing the length of each interval to be generated. |
|---|
| 31 | We expect this to be sorted by decreasing length to minimize |
|---|
| 32 | the chance of failure (MaxtriesException) and for some |
|---|
| 33 | performance gains when allow_overlap==True and there are |
|---|
| 34 | duplicate lengths |
|---|
| 35 | `mask`: a BitSet in which set bits represent regions not to place |
|---|
| 36 | intervals. The size of the region is also determined from the |
|---|
| 37 | mask. |
|---|
| 38 | """ |
|---|
| 39 | # Use mask to find the gaps; gaps is a list of (length,start,end) |
|---|
| 40 | lengths = [length for length in lengths if length > 0] |
|---|
| 41 | min_length = min( lengths ) |
|---|
| 42 | gaps = [] |
|---|
| 43 | start = end = 0 |
|---|
| 44 | while 1: |
|---|
| 45 | start = mask.next_clear( end ) |
|---|
| 46 | if start == mask.size: break |
|---|
| 47 | end = mask.next_set( start ) |
|---|
| 48 | if end-start >= min_length: |
|---|
| 49 | gaps.append( ( end-start, start, None ) ) |
|---|
| 50 | # Sort (long regions first) |
|---|
| 51 | gaps.sort() |
|---|
| 52 | gaps.reverse() |
|---|
| 53 | # Throw |
|---|
| 54 | throw_random_private( lengths, gaps, save_interval_func, allow_overlap, three_args=False ) |
|---|
| 55 | |
|---|
| 56 | def throw_random_intervals( lengths, regions, save_interval_func=None, allow_overlap=False ): |
|---|
| 57 | """ |
|---|
| 58 | Generates a set of non-overlapping random intervals from a length |
|---|
| 59 | distribution. |
|---|
| 60 | |
|---|
| 61 | `lengths`: list containing the length of each interval to be generated. |
|---|
| 62 | We expect this to be sorted by decreasing length to minimize |
|---|
| 63 | the chance of failure (MaxtriesException) and for some |
|---|
| 64 | performance gains when allow_overlap==True and there are |
|---|
| 65 | duplicate lengths. |
|---|
| 66 | `regions`: A list of regions in which intervals can be placed. Elements |
|---|
| 67 | are tuples or lists of the form (start, end, ...), where ... |
|---|
| 68 | indicates any number of items (including zero). |
|---|
| 69 | `save_interval_func`: A function accepting three arguments which will be |
|---|
| 70 | passed the (start,stop,region) for each generated |
|---|
| 71 | interval, where region is an entry in the regions |
|---|
| 72 | list. If this is None, the generated intervals will |
|---|
| 73 | be returned as a list of elements copied from the |
|---|
| 74 | region with start and end modified. |
|---|
| 75 | """ |
|---|
| 76 | # Copy regions |
|---|
| 77 | regions = [( x[1]-x[0], x[0], x ) for x in regions] |
|---|
| 78 | # Sort (long regions first) |
|---|
| 79 | regions.sort() |
|---|
| 80 | regions.reverse() |
|---|
| 81 | # Throw |
|---|
| 82 | if (save_interval_func != None): |
|---|
| 83 | throw_random_private( lengths, regions, save_interval_func, allow_overlap ) |
|---|
| 84 | return |
|---|
| 85 | else: |
|---|
| 86 | intervals = [] |
|---|
| 87 | save_interval_func = lambda s, e, rgn: intervals.append( overwrite_start_end ( s, e, rgn ) ) |
|---|
| 88 | throw_random_private( lengths, regions, save_interval_func, allow_overlap ) |
|---|
| 89 | return intervals |
|---|
| 90 | |
|---|
| 91 | def overwrite_start_end(s,e,rgn): |
|---|
| 92 | rgn = list(rgn) |
|---|
| 93 | rgn[0] = s |
|---|
| 94 | rgn[1] = e |
|---|
| 95 | return tuple(rgn) |
|---|
| 96 | |
|---|
| 97 | |
|---|
| 98 | def throw_random_private( lengths, regions, save_interval_func, allow_overlap=False, three_args=True ): |
|---|
| 99 | """ |
|---|
| 100 | (Internal function; we expect calls only through the interface functions |
|---|
| 101 | above) |
|---|
| 102 | |
|---|
| 103 | `lengths`: A list containing the length of each interval to be generated. |
|---|
| 104 | `regions`: A list of regions in which intervals can be placed, sorted by |
|---|
| 105 | decreasing length. Elements are triples of the form (length, |
|---|
| 106 | start, extra), This list CAN BE MODIFIED by this function. |
|---|
| 107 | `save_interval_func`: A function accepting three arguments which will be |
|---|
| 108 | passed the (start,stop,extra) for each generated |
|---|
| 109 | interval. |
|---|
| 110 | """ |
|---|
| 111 | |
|---|
| 112 | # Implementation: |
|---|
| 113 | # We keep a list of the regions, sorted from largest to smallest. We then |
|---|
| 114 | # place each length by following steps: |
|---|
| 115 | # (1) construct a candidate counts array (cc array) |
|---|
| 116 | # (2) choose a candidate at random |
|---|
| 117 | # (3) find region containing that candidate |
|---|
| 118 | # (4) map candidate to position in that region |
|---|
| 119 | # (5) split region if not allowing overlaps |
|---|
| 120 | # (6) report placed segment |
|---|
| 121 | # |
|---|
| 122 | # The cc array is only constructed if there's a change (different length |
|---|
| 123 | # to place, or the region list has changed). It contains, for each |
|---|
| 124 | # region, the total number of number of candidate positions in regions |
|---|
| 125 | # *preceding* it in the region list: |
|---|
| 126 | # cc[i] = sum over k in 0..(i-1) of length[i] - L + 1 |
|---|
| 127 | # where N is the number of regions and L is the length being thrown. |
|---|
| 128 | # At the same time, we determine the total number of candidates (the total |
|---|
| 129 | # number of places the current length can be placed) and the index range |
|---|
| 130 | # of regions into which the length will fit. |
|---|
| 131 | # |
|---|
| 132 | # example: |
|---|
| 133 | # for L = 20 |
|---|
| 134 | # i = 0 1 2 3 4 5 6 7 8 9 |
|---|
| 135 | # length[i] = 96 66 56 50 48 40 29 17 11 8 |
|---|
| 136 | # cc[i] = 0 77 124 161 192 221 242 X X X |
|---|
| 137 | # candidates = 252 |
|---|
| 138 | # lo_rgn = 0 |
|---|
| 139 | # hi_rgn = 6 |
|---|
| 140 | # |
|---|
| 141 | # The candidate is chosen in (0..candidates-1). The candidate counts |
|---|
| 142 | # array allows us to do a binary search to locate the region that holds that |
|---|
| 143 | # candidate. Continuing the example above, we choose a random candidate |
|---|
| 144 | # s in (0..251). If s happens to be in (124..160), it will be mapped to |
|---|
| 145 | # region 2 at start position s-124. |
|---|
| 146 | # |
|---|
| 147 | # During the binary search, if we are looking at region 3, if s < cc[3] |
|---|
| 148 | # then the desired region is region 2 or lower. Otherwise it is region 3 or |
|---|
| 149 | # higher. |
|---|
| 150 | |
|---|
| 151 | min_length = min( lengths ) |
|---|
| 152 | prev_length = None # (force initial cc array construction) |
|---|
| 153 | cc = [0] * (len( regions ) + len(lengths) - 1) |
|---|
| 154 | num_thrown = 0 |
|---|
| 155 | for length in lengths: |
|---|
| 156 | # construct cc array (only needed if length has changed or region list has |
|---|
| 157 | # changed) |
|---|
| 158 | if length != prev_length: |
|---|
| 159 | prev_length = length |
|---|
| 160 | assert len( cc ) >= len( regions ) |
|---|
| 161 | candidates = 0 |
|---|
| 162 | hi_rgn = 0 |
|---|
| 163 | for region in regions: |
|---|
| 164 | rgn_len = region[0] |
|---|
| 165 | if rgn_len < length: |
|---|
| 166 | break |
|---|
| 167 | cc[hi_rgn] = candidates |
|---|
| 168 | candidates += rgn_len - length + 1 |
|---|
| 169 | hi_rgn += 1 |
|---|
| 170 | if candidates == 0: |
|---|
| 171 | raise MaxtriesException( "No region can fit an interval of length %d (we threw %d of %d)" \ |
|---|
| 172 | % ( length, num_thrown,len( lengths ) ) ) |
|---|
| 173 | hi_rgn -= 1 |
|---|
| 174 | # Select a candidate |
|---|
| 175 | s = random.randrange( candidates ) |
|---|
| 176 | #.. |
|---|
| 177 | #..for ix in range( len( regions ) ): |
|---|
| 178 | #.. region = regions[ix] |
|---|
| 179 | #.. if ix <= hi_rgn: print "%2s: %5s %5s %5s" % ( ix, region[1], region[0], cc[ix] ) |
|---|
| 180 | #.. else: print "%2s: %5s %5s %5s" % ( ix, region[1], region[0], "X" ) |
|---|
| 181 | #..print "s = %s (of %s candidates)" % ( s, candidates ) |
|---|
| 182 | # Locate region containing that candidate, by binary search |
|---|
| 183 | lo = 0 |
|---|
| 184 | hi = hi_rgn |
|---|
| 185 | while hi > lo: |
|---|
| 186 | mid = (lo + hi + 1) / 2 # (we round up to prevent infinite loop) |
|---|
| 187 | if s < cc[mid]: hi = mid-1 # (s < num candidates from 0..mid-1) |
|---|
| 188 | else: lo = mid # (s >= num candidates from 0..mid-1) |
|---|
| 189 | s -= cc[lo] |
|---|
| 190 | # If we are not allowing overlaps we will remove the placed interval |
|---|
| 191 | # from the region list |
|---|
| 192 | if allow_overlap: |
|---|
| 193 | rgn_length, rgn_start, rgn_extra = regions[lo] |
|---|
| 194 | else: |
|---|
| 195 | # Remove the chosen region and split |
|---|
| 196 | rgn_length, rgn_start, rgn_extra = regions.pop( lo ) |
|---|
| 197 | rgn_end = rgn_start + rgn_length |
|---|
| 198 | assert s >= 0 |
|---|
| 199 | assert rgn_start + s + length <= rgn_end, "Expected: %d + %d + %d == %d <= %d" % ( rgn_start, s, length, rgn_start + s + length, rgn_end ) |
|---|
| 200 | regions.reverse() |
|---|
| 201 | if s >= min_length: |
|---|
| 202 | bisect.insort( regions, ( s, rgn_start, rgn_extra ) ) |
|---|
| 203 | if s + length <= rgn_length - min_length: |
|---|
| 204 | bisect.insort( regions, ( rgn_length - ( s + length ), rgn_start + s + length, rgn_extra ) ) |
|---|
| 205 | regions.reverse() |
|---|
| 206 | prev_length = None # (force cc array construction) |
|---|
| 207 | # Save the new interval |
|---|
| 208 | if (three_args): |
|---|
| 209 | save_interval_func( rgn_start + s, rgn_start + s + length, rgn_extra ) |
|---|
| 210 | else: |
|---|
| 211 | save_interval_func( rgn_start + s, rgn_start + s + length ) |
|---|
| 212 | num_thrown += 1 |
|---|