root/galaxy-central/eggs/GeneTrack-2.0.0_beta_1_dev_48da9e998f0caf01c5be731e926f4b0481f658f0-py2.6.egg/genetrack/fitlib.py

リビジョン 3, 7.0 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""
2Data fitting and peak prediction routines
3"""
4import genetrack
5from genetrack import logger, conf
6from itertools import *
7import numpy, operator
8from math import log, exp
9
def normal_function(w, sigma):
    """
    Default fitting function: returns sampled values of an unnormalized
    gaussian of standard deviation ``sigma`` over the integer interval
    [-w, w].

    The function is not normalized, thus the smoothed output will be a
    representation of the sum of read counts.

    Returns a tuple (lo, hi, values) where ``lo`` and ``hi`` are the
    positive offsets of the window around its center and ``values`` is a
    numpy array of length lo + hi.
    """
    sigma2 = float(sigma) ** 2
    lo, hi = int(w), int(w + 1)

    # vectorized evaluation of exp(-i*i / (2*sigma^2)) over [-w, w];
    # not normalized, thus will correspond to read counts
    index = numpy.arange(-lo, hi)
    values = numpy.exp(-index * index / sigma2 / 2)

    return lo, hi, values

def gaussian_smoothing(x, y, sigma=20, epsilon=0.1):
    """
    Fits data represented by f(x)=y by a sum of normal curves where
    each curve corresponds to a normal function of variance=sigma and
    height equal to the y coordinate.

    Parameters x and y are equal-length sequences; x must be sorted in
    increasing order (the first and last elements are used as the range).

    Returns a tuple with the new x and y coordinates. Positions whose
    smoothed value does not exceed ``epsilon`` are dropped, which cuts
    out (potentially massive) regions with no measurements.
    """
    # nothing to smooth over empty input
    if len(x) == 0:
        return x, y

    # transform to numpy arrays (builtin dtypes: the numpy.int/numpy.float
    # aliases are deprecated and removed in modern numpy)
    x = numpy.array(x, int)
    y = numpy.array(y, float)

    # a sanity check
    assert len(x) == len(y), "Data lengths must match!"

    # operate within 5 standard deviations of each point
    w = 5 * sigma

    # precompute the gaussian window for this sigma
    lo, hi, normal = normal_function(w=w, sigma=sigma)

    # shift the original vector by the first index so that the first
    # index starts at the value lo; this copies the array
    zero_x = x - x[0] + lo

    # the size will influence memory consumption, roughly 100MB per
    # 10 million entries; long vectors need to be stitched together
    # externally. On live displays there is no need to fit over large
    # regions (over 100K) as the features won't be visible by eye.
    size = zero_x[-1] + lo + hi

    # this will hold the new fitted values
    new_y = numpy.zeros(size, float)

    # performs the smoothing by adding one scaled window per data
    # point, mutating the accumulator in place
    for index, value in zip(zero_x, y):
        new_y[index - lo:index + hi] += value * normal

    # keep only values above the epsilon; this cuts out (potentially
    # massive) regions where there are no measurements
    new_x = (new_y > epsilon).nonzero()[0]
    new_y = new_y.take(new_x)

    # now shift back to get the real indices
    new_x = new_x + x[0] - lo

    return new_x, new_y
92
def detect_peaks(x, y):
    """
    Detects peaks (local maxima) from sequences x and y where f(x)=y.
    Will not properly detect plateaus!

    Returns a list of tuples where the two elements correspond to the
    peak index and peak value.

    >>> y = [ 0.0, 1.0, 2.5, 1.0, 3.5, 1.0, 0.0, 0.0, 10.5, 2.0, 1.0, 0.0 ]
    >>> x = range(len(y))
    >>> peaks = detect_peaks( x=x, y=y )
    >>> peaks
    [(2, 2.5), (4, 3.5), (8, 10.5)]
    >>> select_peaks( peaks, exclusion=1)
    [(2, 2.5), (4, 3.5), (8, 10.5)]
    >>> select_peaks( peaks, exclusion=2)
    [(4, 3.5), (8, 10.5)]
    """
    peaks = []
    # a point is a local maximum when strictly above the left neighbor
    # and at least as high as the right neighbor; endpoints are skipped
    for i in range(1, len(y) - 1):
        left, mid, right = y[i - 1], y[i], y[i + 1]
        if left < mid >= right:
            peaks.append((x[i], mid))
    return peaks
118
def select_peaks(peaks, exclusion, threshold=0):
    """
    Selects maximal non-overlapping peaks with a given exclusion zone
    and over a given threshold.

    Takes as input a list of (index, value) tuples corresponding to all
    local maxima, sorted in increasing order by index. Returns a
    filtered list of (index, value) tuples with the maxima that pass
    the conditions.

    >>> peaks = [ (0, 20), (100, 19), (500, 4), (10**6, 2) ]
    >>> select_peaks( peaks, exclusion=200)
    [(0, 20), (500, 4), (1000000, 2)]
    """

    # zero exclusion allows every peak over the threshold to pass
    # (previously the threshold was silently ignored in this case)
    if exclusion == 0:
        return [(x, y) for x, y in peaks if y >= threshold]

    # sort by peak height, tallest first
    work = [(y, x) for x, y in peaks if y >= threshold]
    work.sort()
    work.reverse()

    # none of the values passed the threshold
    if not work:
        return []

    # this will store the selected peaks
    selected = []

    # we assume that peaks are sorted already in increasing order by x
    xmin, xmax = peaks[0][0], peaks[-1][0]

    # occupancy vector that keeps track of empty regions; the exclusion
    # zone is applied at both ends, so one exclusion of padding is added
    # on each side. The vector must fit into memory (int8 is fairly
    # small though); chop large chromosomes into chunks and predict on
    # each separately.
    size = xmax - xmin + exclusion
    shift = xmin - exclusion
    empty = numpy.ones(size + exclusion, numpy.int8)

    # starting with the tallest, select from the existing peaks
    for peaky, peakx in work:

        # get the peak index as an occupancy vector index
        locind = peakx - shift

        # check whether the region is still free
        if empty[locind]:

            # store this peak
            selected.append((peakx, peaky))

            # block the exclusion zone around it
            empty[locind - exclusion:locind + exclusion] = 0

    selected.sort()
    return selected
180
def fixed_width_predictor(x, y, params):
    """
    Generates fixed-width peaks from an x, y dataset.

    ``params`` must provide the attributes ``feature_width`` (width of
    the generated intervals, also used as the exclusion zone),
    ``minimum_peak`` (height threshold) and ``zoom_value`` (display
    zoom; value labels are omitted when zoomed out past 5000).

    Returns a list of (start, end, label) tuples.

    >>> from genetrack import util
    >>>
    >>> y = [ 0.0, 1.0, 2.5, 1.0, 3.5, 1.0, 0.0, 0.0, 10.5, 2.0, 1.0, 0.0 ]
    >>> x = range(len(y))
    >>>
    >>> params = util.Params(feature_width=1, minimum_peak=0, zoom_value=1)
    >>> fixed_width_predictor(x, y, params=params)
    [(2, 2, '2.5'), (4, 4, '3.5'), (8, 8, '10.5')]
    >>>
    >>> params = util.Params(feature_width=2, minimum_peak=3, zoom_value=1)
    >>> fixed_width_predictor(x, y, params=params)
    [(3, 5, '3.5'), (7, 9, '10.5')]
    """
    width = params.feature_width
    all_peaks = detect_peaks(x=x, y=y)
    sel_peaks = select_peaks(peaks=all_peaks, exclusion=width,
                             threshold=params.minimum_peak)

    # generate the fixed-width intervals centered on each selected peak;
    # floor division keeps the coordinates integral on both Py2 and Py3
    h = width // 2

    # when zoomed far out the labels would not be readable, omit them
    if int(params.zoom_value) > 5000:
        results = [(m - h, m + h, '') for m, v in sel_peaks]
    else:
        results = [(m - h, m + h, '%.1f' % v) for m, v in sel_peaks]

    return results
213
def test(verbose=0):
    """
    Main test runner: executes the doctests in this module.
    """
    import doctest
    doctest.testmod(verbose=verbose)
220
def test_plot():
    "Visualize results via matplotlib"
    from pylab import plot, show

    xdata = [1, 101, 102, 103, 500, 503]
    ydata = [1, 1, 2, 3, 5, 1]

    # smooth the sample data, then display the fitted curve
    sx, sy = gaussian_smoothing(xdata, ydata, sigma=30)

    plot(sx, sy, 'bo-')
    show()
232
if __name__ == '__main__':
    test(verbose=0)

    # quick demonstration of the peak selection; print() with a single
    # argument behaves identically on Python 2 and 3
    peaks = [ (0, 20), (100, 19), (500, 4), (10**6, 2) ]
    print(select_peaks( peaks, exclusion=200))

    #test_plot()
240   
241
242
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。