1 | """ |
---|
2 | Genetrack default smoothing and peak predictions. |
---|
3 | |
---|
4 | The program may be invoked in multiple ways. As a standalone script:: |
---|
5 | |
---|
6 | python peakpred.py |
---|
7 | |
---|
8 | As a python module:: |
---|
9 | |
---|
10 | python -m genetrack.scripts.peakpred |
---|
11 | |
---|
12 | Or in other python scripts:: |
---|
13 | |
---|
14 | >>> |
---|
15 | >>> from genetrack.scripts import peakpred |
---|
16 | >>> peakpred.predict(inpname, outname, options) |
---|
17 | >>> |
---|
18 | |
---|
19 | Run the script with no parameters to see the options that it takes. |
---|
20 | |
---|
21 | **Observed runtime**: |
---|
22 | |
---|
23 | """ |
---|
24 | import os, sys, csv |
---|
25 | from genetrack import logger, util, hdflib, fitlib |
---|
26 | |
---|
27 | TWOSTRAND = "two" |
---|
28 | |
---|
29 | commify = util.commify |
---|
30 | |
---|
31 | def output(stream, peaks, chrom, w=73, strand='+', ): |
---|
32 | "Outputs peaks to a stream" |
---|
33 | |
---|
34 | logger.debug('writing %s peaks on strand %s' % (commify(len(peaks)), strand)) |
---|
35 | for mid, value in peaks: |
---|
36 | start, end = mid - w, mid + w |
---|
37 | stream.write("%s\t%d\t%d\t.\t%f\t%s\n" % (chrom, start, end, value, strand)) |
---|
38 | |
---|
39 | def predict(inpname, outname, options): |
---|
40 | """ |
---|
41 | Generate the peak predictions on a genome wide scale |
---|
42 | """ |
---|
43 | if options.strand == TWOSTRAND: |
---|
44 | logger.info('operating in twostrand mode') |
---|
45 | |
---|
46 | if options.index: |
---|
47 | index = hdflib.PositionalData(fname='', index=inpname, nobuild=True, workdir=options.workdir) |
---|
48 | else: |
---|
49 | index = hdflib.PositionalData(fname=inpname, nobuild=True, workdir=options.workdir) |
---|
50 | |
---|
51 | fp = file(outname, 'wt') |
---|
52 | |
---|
53 | for label in index.labels: |
---|
54 | table = index.table(label) |
---|
55 | size = table.cols.idx[-1] |
---|
56 | info = util.commify(size) |
---|
57 | logger.info('predicting on %s of total size %s' % (label, info)) |
---|
58 | lo = 0 |
---|
59 | hi = min( (size, options.maxsize) ) |
---|
60 | |
---|
61 | while True: |
---|
62 | if lo >= size: |
---|
63 | break |
---|
64 | perc = '%.1f%%' % (100.0*lo/size) |
---|
65 | logger.info('processing %s %s:%s (%s)' % (label, lo, hi, perc)) |
---|
66 | |
---|
67 | # get the data |
---|
68 | res = index.query(start=lo, end=hi, label=label) |
---|
69 | |
---|
70 | |
---|
71 | # exclusion zone |
---|
72 | w = options.exclude/2 |
---|
73 | |
---|
74 | def predict(x, y): |
---|
75 | fx, fy = fitlib.gaussian_smoothing(x=x, y=y, sigma=options.sigma, epsilon=options.level ) |
---|
76 | peaks = fitlib.detect_peaks(x=fx, y=fy ) |
---|
77 | if options.mode != 'all': |
---|
78 | peaks = fitlib.select_peaks(peaks=peaks, exclusion=options.exclude, threshold=options.level) |
---|
79 | return peaks |
---|
80 | |
---|
81 | if options.strand == TWOSTRAND: |
---|
82 | # operates in two strand mode |
---|
83 | for yval, strand in [ (res.fwd, '+'), (res.rev, '-') ]: |
---|
84 | logger.debug('processing strand %s' % strand) |
---|
85 | peaks = predict(x=res.idx, y=yval) |
---|
86 | output(stream=fp, peaks=peaks, chrom=label, w=w, strand=strand) |
---|
87 | else: |
---|
88 | # combine strands |
---|
89 | peaks = predict(x=res.idx, y=res.val) |
---|
90 | output(stream=fp, peaks=peaks, chrom=label, w=w, strand='+') |
---|
91 | |
---|
92 | # switching to a higher interval |
---|
93 | lo = hi |
---|
94 | hi += options.maxsize |
---|
95 | |
---|
96 | fp.close() |
---|
97 | |
---|
98 | def option_parser(): |
---|
99 | "The option parser may be constructed in other tools invoking this script" |
---|
100 | import optparse |
---|
101 | |
---|
102 | usage = "usage: %prog -i inputfile -o outputfile" |
---|
103 | |
---|
104 | parser = optparse.OptionParser(usage=usage) |
---|
105 | |
---|
106 | # setting the input file name |
---|
107 | parser.add_option( |
---|
108 | '-i', '--input', action="store", |
---|
109 | dest="inpname", type='str', default=None, |
---|
110 | help="the input hdf file name (required)" |
---|
111 | ) |
---|
112 | |
---|
113 | # setting the output file name |
---|
114 | parser.add_option( |
---|
115 | '-o', '--output', action="store", |
---|
116 | dest="outname", type='str', default=None, |
---|
117 | help="the output file name (required)" |
---|
118 | ) |
---|
119 | |
---|
120 | # verbosity can be 0,1 and 2 (increasing verbosity) |
---|
121 | parser.add_option( |
---|
122 | '-v', '--verbosity', action="store", |
---|
123 | dest="verbosity", type="int", default=2, |
---|
124 | help="sets the verbosity (0, 1, 2) (default=1)", |
---|
125 | ) |
---|
126 | |
---|
127 | # sigma correction |
---|
128 | parser.add_option( |
---|
129 | '--sigma', action="store", |
---|
130 | dest="sigma", type="float", default=20, |
---|
131 | help="the smoothing factor", |
---|
132 | ) |
---|
133 | |
---|
134 | # the exclusion zone |
---|
135 | parser.add_option( |
---|
136 | '--exclusion', action="store", |
---|
137 | dest="exclude", type="int", default=100, |
---|
138 | help="the exclusion zone", |
---|
139 | ) |
---|
140 | |
---|
141 | # the exclusion zone |
---|
142 | parser.add_option( |
---|
143 | '--strand', action="store", |
---|
144 | dest="strand", type="str", default="ALL", |
---|
145 | help="strand", |
---|
146 | ) |
---|
147 | |
---|
148 | # threshold |
---|
149 | parser.add_option( |
---|
150 | '--level', action="store", |
---|
151 | dest="level", type="float", default=1, |
---|
152 | help="the minimum signal necessary to call peaks", |
---|
153 | ) |
---|
154 | |
---|
155 | # currently not used |
---|
156 | parser.add_option( |
---|
157 | '--mode', action="store", |
---|
158 | dest="mode", type="str", default='maximal', |
---|
159 | help="the peak prediction method: 'maximal', 'threshold' or 'all'", |
---|
160 | ) |
---|
161 | |
---|
162 | parser.add_option( |
---|
163 | '--maxsize', action="store", |
---|
164 | dest="maxsize", type="int", default=10**7, |
---|
165 | help="the size of the largest internal array allocated (default=10 million)", |
---|
166 | ) |
---|
167 | |
---|
168 | parser.add_option( |
---|
169 | '--test', action="store_true", |
---|
170 | dest="test", default=False, |
---|
171 | help="demo mode used in testing", |
---|
172 | ) |
---|
173 | |
---|
174 | parser.add_option( |
---|
175 | '-w', '--workdir', action="store", |
---|
176 | dest="workdir", type='str', default=None, |
---|
177 | help="work directory (optional)" |
---|
178 | ) |
---|
179 | |
---|
180 | parser.add_option( |
---|
181 | '-x', '--index', action="store_true", |
---|
182 | dest="index", default=False, |
---|
183 | help="treat input file as binary index" |
---|
184 | ) |
---|
185 | |
---|
186 | return parser |
---|
187 | |
---|
188 | |
---|
189 | if __name__ == '__main__': |
---|
190 | import optparse |
---|
191 | |
---|
192 | parser = option_parser() |
---|
193 | |
---|
194 | options, args = parser.parse_args() |
---|
195 | |
---|
196 | logger.disable(options.verbosity) |
---|
197 | |
---|
198 | from genetrack import conf |
---|
199 | |
---|
200 | # trigger test mode |
---|
201 | if options.test: |
---|
202 | options.inpname = conf.testdata('test-hdflib-input.gtrack') |
---|
203 | options.outname = conf.testdata('predictions.bed') |
---|
204 | |
---|
205 | # missing input file name |
---|
206 | if not options.inpname and not options.outname: |
---|
207 | parser.print_help() |
---|
208 | else: |
---|
209 | print 'Sigma = %s' % options.sigma |
---|
210 | print 'Minimum peak = %s' % options.level |
---|
211 | print 'Peak-to-peak = %s' % options.exclude |
---|
212 | |
---|
213 | predict(options.inpname, options.outname, options) |
---|