[3] | 1 | """ |
---|
| 2 | Genetrack default smoothing and peak predictions. |
---|
| 3 | |
---|
| 4 | The program may be invoked in multiple ways. As a standalone script:: |
---|
| 5 | |
---|
| 6 | python peakpred.py |
---|
| 7 | |
---|
| 8 | As a python module:: |
---|
| 9 | |
---|
| 10 | python -m genetrack.scripts.peakpred |
---|
| 11 | |
---|
| 12 | Or in other python scripts:: |
---|
| 13 | |
---|
| 14 | >>> |
---|
| 15 | >>> from genetrack.scripts import peakpred |
---|
| 16 | >>> peakpred.predict(inpname, outname, options) |
---|
| 17 | >>> |
---|
| 18 | |
---|
| 19 | Run the script with no parameters to see the options that it takes. |
---|
| 20 | |
---|
| 21 | **Observed runtime**: |
---|
| 22 | |
---|
| 23 | """ |
---|
| 24 | import os, sys, csv |
---|
| 25 | from genetrack import logger, util, hdflib, fitlib |
---|
| 26 | |
---|
| 27 | TWOSTRAND = "two" |
---|
| 28 | |
---|
| 29 | commify = util.commify |
---|
| 30 | |
---|
| 31 | def output(stream, peaks, chrom, w=73, strand='+', ): |
---|
| 32 | "Outputs peaks to a stream" |
---|
| 33 | |
---|
| 34 | logger.debug('writing %s peaks on strand %s' % (commify(len(peaks)), strand)) |
---|
| 35 | for mid, value in peaks: |
---|
| 36 | start, end = mid - w, mid + w |
---|
| 37 | stream.write("%s\t%d\t%d\t.\t%f\t%s\n" % (chrom, start, end, value, strand)) |
---|
| 38 | |
---|
| 39 | def predict(inpname, outname, options): |
---|
| 40 | """ |
---|
| 41 | Generate the peak predictions on a genome wide scale |
---|
| 42 | """ |
---|
| 43 | if options.strand == TWOSTRAND: |
---|
| 44 | logger.info('operating in twostrand mode') |
---|
| 45 | |
---|
| 46 | if options.index: |
---|
| 47 | index = hdflib.PositionalData(fname='', index=inpname, nobuild=True, workdir=options.workdir) |
---|
| 48 | else: |
---|
| 49 | index = hdflib.PositionalData(fname=inpname, nobuild=True, workdir=options.workdir) |
---|
| 50 | |
---|
| 51 | fp = file(outname, 'wt') |
---|
| 52 | |
---|
| 53 | for label in index.labels: |
---|
| 54 | table = index.table(label) |
---|
| 55 | size = table.cols.idx[-1] |
---|
| 56 | info = util.commify(size) |
---|
| 57 | logger.info('predicting on %s of total size %s' % (label, info)) |
---|
| 58 | lo = 0 |
---|
| 59 | hi = min( (size, options.maxsize) ) |
---|
| 60 | |
---|
| 61 | while True: |
---|
| 62 | if lo >= size: |
---|
| 63 | break |
---|
| 64 | perc = '%.1f%%' % (100.0*lo/size) |
---|
| 65 | logger.info('processing %s %s:%s (%s)' % (label, lo, hi, perc)) |
---|
| 66 | |
---|
| 67 | # get the data |
---|
| 68 | res = index.query(start=lo, end=hi, label=label) |
---|
| 69 | |
---|
| 70 | |
---|
| 71 | # exclusion zone |
---|
| 72 | w = options.exclude/2 |
---|
| 73 | |
---|
| 74 | def predict(x, y): |
---|
| 75 | fx, fy = fitlib.gaussian_smoothing(x=x, y=y, sigma=options.sigma, epsilon=options.level ) |
---|
| 76 | peaks = fitlib.detect_peaks(x=fx, y=fy ) |
---|
| 77 | if options.mode != 'all': |
---|
| 78 | peaks = fitlib.select_peaks(peaks=peaks, exclusion=options.exclude, threshold=options.level) |
---|
| 79 | return peaks |
---|
| 80 | |
---|
| 81 | if options.strand == TWOSTRAND: |
---|
| 82 | # operates in two strand mode |
---|
| 83 | for yval, strand in [ (res.fwd, '+'), (res.rev, '-') ]: |
---|
| 84 | logger.debug('processing strand %s' % strand) |
---|
| 85 | peaks = predict(x=res.idx, y=yval) |
---|
| 86 | output(stream=fp, peaks=peaks, chrom=label, w=w, strand=strand) |
---|
| 87 | else: |
---|
| 88 | # combine strands |
---|
| 89 | peaks = predict(x=res.idx, y=res.val) |
---|
| 90 | output(stream=fp, peaks=peaks, chrom=label, w=w, strand='+') |
---|
| 91 | |
---|
| 92 | # switching to a higher interval |
---|
| 93 | lo = hi |
---|
| 94 | hi += options.maxsize |
---|
| 95 | |
---|
| 96 | fp.close() |
---|
| 97 | |
---|
| 98 | def option_parser(): |
---|
| 99 | "The option parser may be constructed in other tools invoking this script" |
---|
| 100 | import optparse |
---|
| 101 | |
---|
| 102 | usage = "usage: %prog -i inputfile -o outputfile" |
---|
| 103 | |
---|
| 104 | parser = optparse.OptionParser(usage=usage) |
---|
| 105 | |
---|
| 106 | # setting the input file name |
---|
| 107 | parser.add_option( |
---|
| 108 | '-i', '--input', action="store", |
---|
| 109 | dest="inpname", type='str', default=None, |
---|
| 110 | help="the input hdf file name (required)" |
---|
| 111 | ) |
---|
| 112 | |
---|
| 113 | # setting the output file name |
---|
| 114 | parser.add_option( |
---|
| 115 | '-o', '--output', action="store", |
---|
| 116 | dest="outname", type='str', default=None, |
---|
| 117 | help="the output file name (required)" |
---|
| 118 | ) |
---|
| 119 | |
---|
| 120 | # verbosity can be 0,1 and 2 (increasing verbosity) |
---|
| 121 | parser.add_option( |
---|
| 122 | '-v', '--verbosity', action="store", |
---|
| 123 | dest="verbosity", type="int", default=2, |
---|
| 124 | help="sets the verbosity (0, 1, 2) (default=1)", |
---|
| 125 | ) |
---|
| 126 | |
---|
| 127 | # sigma correction |
---|
| 128 | parser.add_option( |
---|
| 129 | '--sigma', action="store", |
---|
| 130 | dest="sigma", type="float", default=20, |
---|
| 131 | help="the smoothing factor", |
---|
| 132 | ) |
---|
| 133 | |
---|
| 134 | # the exclusion zone |
---|
| 135 | parser.add_option( |
---|
| 136 | '--exclusion', action="store", |
---|
| 137 | dest="exclude", type="int", default=100, |
---|
| 138 | help="the exclusion zone", |
---|
| 139 | ) |
---|
| 140 | |
---|
| 141 | # the exclusion zone |
---|
| 142 | parser.add_option( |
---|
| 143 | '--strand', action="store", |
---|
| 144 | dest="strand", type="str", default="ALL", |
---|
| 145 | help="strand", |
---|
| 146 | ) |
---|
| 147 | |
---|
| 148 | # threshold |
---|
| 149 | parser.add_option( |
---|
| 150 | '--level', action="store", |
---|
| 151 | dest="level", type="float", default=1, |
---|
| 152 | help="the minimum signal necessary to call peaks", |
---|
| 153 | ) |
---|
| 154 | |
---|
| 155 | # currently not used |
---|
| 156 | parser.add_option( |
---|
| 157 | '--mode', action="store", |
---|
| 158 | dest="mode", type="str", default='maximal', |
---|
| 159 | help="the peak prediction method: 'maximal', 'threshold' or 'all'", |
---|
| 160 | ) |
---|
| 161 | |
---|
| 162 | parser.add_option( |
---|
| 163 | '--maxsize', action="store", |
---|
| 164 | dest="maxsize", type="int", default=10**7, |
---|
| 165 | help="the size of the largest internal array allocated (default=10 million)", |
---|
| 166 | ) |
---|
| 167 | |
---|
| 168 | parser.add_option( |
---|
| 169 | '--test', action="store_true", |
---|
| 170 | dest="test", default=False, |
---|
| 171 | help="demo mode used in testing", |
---|
| 172 | ) |
---|
| 173 | |
---|
| 174 | parser.add_option( |
---|
| 175 | '-w', '--workdir', action="store", |
---|
| 176 | dest="workdir", type='str', default=None, |
---|
| 177 | help="work directory (optional)" |
---|
| 178 | ) |
---|
| 179 | |
---|
| 180 | parser.add_option( |
---|
| 181 | '-x', '--index', action="store_true", |
---|
| 182 | dest="index", default=False, |
---|
| 183 | help="treat input file as binary index" |
---|
| 184 | ) |
---|
| 185 | |
---|
| 186 | return parser |
---|
| 187 | |
---|
| 188 | |
---|
| 189 | if __name__ == '__main__': |
---|
| 190 | import optparse |
---|
| 191 | |
---|
| 192 | parser = option_parser() |
---|
| 193 | |
---|
| 194 | options, args = parser.parse_args() |
---|
| 195 | |
---|
| 196 | logger.disable(options.verbosity) |
---|
| 197 | |
---|
| 198 | from genetrack import conf |
---|
| 199 | |
---|
| 200 | # trigger test mode |
---|
| 201 | if options.test: |
---|
| 202 | options.inpname = conf.testdata('test-hdflib-input.gtrack') |
---|
| 203 | options.outname = conf.testdata('predictions.bed') |
---|
| 204 | |
---|
| 205 | # missing input file name |
---|
| 206 | if not options.inpname and not options.outname: |
---|
| 207 | parser.print_help() |
---|
| 208 | else: |
---|
| 209 | print 'Sigma = %s' % options.sigma |
---|
| 210 | print 'Minimum peak = %s' % options.level |
---|
| 211 | print 'Peak-to-peak = %s' % options.exclude |
---|
| 212 | |
---|
| 213 | predict(options.inpname, options.outname, options) |
---|