root/galaxy-central/eggs/GeneTrack-2.0.0_beta_1_dev_48da9e998f0caf01c5be731e926f4b0481f658f0-py2.6.egg/genetrack/scripts/tabs2genetrack.py @ 3

リビジョン 3, 9.2 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1"""
2File transformer. Takes a tabuolar file (bed or gff) and
3transforms them to a file used as genetrack input.
4Optional parameters may be used to shift the 5' ends with
5a specified amount. This is useful if the file corresponds to data
6with fixed fragment widths you can move each fragment to the
7center location. The program may be invoked in multiple ways.
8As a standalone script::
9
10    python tabs2genetrack.py
11
12As a python module::
13
14    python -m genetrack.scripts.tabs2genetrack
15
16Or in other python scripts::
17
18>>>
19>>> from genetrack.scripts import tabs2genetrack
20>>> tabs2genetrack.transform( inpname, outname, format='bed', shift=0)
21>>>
22
23Run the script with no parameters to see the options that it takes.
24
25A genetrack input file format is a tab delimited text file
26described in the API documentation of the PositionalData
27class: `genetrack.hdflib.PositionalData`
28The transformation is a three step process, *transform*,
29*sort* and *consolidate*. It will create files in the
30genetrack temporary data directory and it will remove the intermediate files
31when the process is complete.
32
33**Observed runtime**: tranformation rate of 2 million lines per minute
34
35**Note1**: The script will invoke the system `sort` command to sort the file
36that is substantially faster under Unix than Windows.
37
38"""
39import os, sys, csv, shutil
40from itertools import *
41from genetrack import logger, conf, util, hdflib
42
43def consolidate( inpname, outname, format):
44    """
45    Consolidates an input file.
46    Merges multiple indentical indices into one line
47    """
48    fp = open(outname, 'wt')
49
50    # recover the original basename
51    basename = os.path.basename(inpname).replace('.sorted', '')
52
53    # create a few information headers
54    fp.write("#\n# created with tabs2genetrack\n")
55    fp.write("# source: %s, format %s\n#\n" % (basename, format) )
56    fp.write("chrom\tindex\tforward\treverse\tvalue\n")
57   
58    # the main reader
59    reader = csv.reader(open(inpname, 'rb'), delimiter='\t')
60
61    # will duplicate some code to avoid an extra conditional internally
62    chrom, index, fwd, rev, val = reader.next()
63    lastindex, fwd, rev, val = int(index), int(fwd), int(rev), int(val)
64    collect = [ chrom, lastindex, fwd, rev, val ]
65
66    for row in reader:
67        chrom, index, fwd, rev, val = row
68        index, fwd, rev, val = int(index), int(fwd), int(rev), int(val)
69        if index == lastindex:
70            collect[2] += fwd
71            collect[3] += rev
72            collect[4] += val
73        else:
74            # write out the collected data
75            fp.write( '%s\t%s\t%s\t%s\t%s\n' % tuple(collect) )
76            collect = [ chrom, index, fwd, rev, val ]
77            lastindex = index
78
79    fp.close()
80
81def transform(inpname, outname, format, shift=0, index=False, options=None):
82    """
83    Transforms reads stored in bedfile to a genetrack input file.
84    Requires at least 6 bed columns to access the strand.
85    """
86
87    # detect file formats
88    if format == "BED":
89        CHROM, START, END, STRAND = 0, 1, 2, 5
90    elif format == "GFF":
91        CHROM, START, END, STRAND = 0, 3, 4, 6
92    else:
93        raise Exception('Invalid file format' % format)
94
95    # two sanity checks, one day someone will thank me
96    if format == 'BED' and inpname.endswith('gff'):
97        raise Exception('BED format on a gff file?')
98    if format == 'GFF' and inpname.endswith('bed'):
99        raise Exception('GFF format on a bed file?')
100
101    # find the basename of the outputname
102    basename = os.path.basename(outname)
103   
104    # two files store intermediate results
105    flat = conf.tempdata( '%s.flat' % basename )
106    sorted  = conf.tempdata( '%s.sorted' % basename )
107
108    # check for track information on first line,
109    # much faster this way than conditional checking on each line
110    fp = file(inpname, 'rU')
111    first = fp.readline()
112    fp.close()
113
114    # create the reader
115    reader = csv.reader(file(inpname, 'rU'), delimiter='\t')
116
117    # skip if trackline exists
118    if first.startswith == 'track':
119        reader.next()
120
121    # unwind the comments
122    list(takewhile(lambda x: x[0].startswith('#'), reader))
123
124    # copious timing info for those who enjoy these
125    timer, full = util.Timer(), util.Timer()
126
127    logger.debug("parsing '%s'" % inpname)
128    logger.debug("output to '%s'" % outname)
129
130    # create the unsorted output file and apply corrections
131    logger.debug("unsorted flat file '%s'" % flat)
132
133    fp = file(flat, 'wt')
134    for linec, row in enumerate(reader):
135        try:
136            chrom, start, end, strand = row[CHROM], row[START], row[END], row[STRAND]
137        except Exception, exc:
138            first = row[0][0]
139            # may be hitting the end of the file with other comments
140            if  first == '>':
141                break # hit the sequence content of the gff file
142            elif first == '#':
143                continue # hit upon some comments
144            else:
145                logger.error(row)
146                raise Exception(exc)
147
148        if strand == '+':
149            # on forward strand, 5' is at start
150            idx = int(start) + shift
151            fwd, rev, val = 1, 0, 1
152        elif strand == '-':
153            # on reverse strand, 5' is at end
154            idx = int(end) - shift
155            fwd, rev, val = 0, 1, 1
156        else:
157            # no strand specified, generate interval centers
158            idx = (int(start)+int(end))/2
159            fwd, rev, val = 0, 0, 1
160
161        # it is essential be able to sort the index as a string!
162        fp.write('%s\t%012d\t%s\t%s\t%s\n' % (chrom, idx, fwd, rev, val))
163
164    fp.close()
165    linet = util.commify(linec)
166    logger.debug("parsing %s lines finished in %s" % (linet, timer.report()))
167
168    # if it is producing coverage then it will expand reads into full intervaals
169
170    # now let the sorting commence
171    cmd = "sort %s > %s" % (flat, sorted)
172    logger.debug("sorting into '%s'" % sorted)
173    os.system(cmd)
174    logger.debug("sorting finished in %s" % timer.report() )
175
176    logger.debug("consolidating into '%s'" % outname)
177    consolidate( sorted, outname, format=format)
178    logger.debug("consolidate finished in %s" % timer.report() )
179    logger.debug("output saved to '%s'" % outname)
180    logger.debug("full conversion finished in %s" % full.report() )
181
182    # attempting to cleanup the remaining files
183    for name in (flat, sorted):
184        logger.debug("removing temporary file '%s'" % name )
185        os.remove(name)
186
187    # also runs the indexing on it
188    if index:
189        logger.debug("loading the index from '%s'" % outname)
190        # create the work directory
191        if options.workdir and not os.path.isdir(options.workdir):
192            os.mkdir(options.workdir)
193        result = hdflib.PositionalData(fname=outname, update=True, workdir=options.workdir)
194        logger.debug("indexing finished in %s" % timer.report() )
195        result.close()
196
197        logger.debug("moving index to main output '%s'" % outname )
198
199        # remove the intermediate file
200        os.remove(outname)
201
202        # move the index as the output file
203        shutil.move(result.index, outname)
204
205def option_parser():
206    "The option parser may be constructed in other tools invoking this script"
207    import optparse
208
209    usage = "usage: %prog -i inputfile -o outputfile -f format"
210
211    parser = optparse.OptionParser(usage=usage)
212
213    # setting the input file name
214    parser.add_option(
215        '-i', '--input', action="store",
216        dest="inpname", type='str', default=None,
217        help="the input file name (required)"
218    )
219
220    # setting the output file name
221    parser.add_option(
222        '-o', '--output', action="store",
223        dest="outname", type='str', default=None,
224        help="output file name (required)"
225    )
226
227    # file formats
228    parser.add_option(
229        '-f', '--format', action="store",
230        dest="format", type="str", default='',
231        help="input file format, bed or gff (required)",
232    )
233   
234    # correction shift added in 5' direction for start/end coordinates
235    parser.add_option(
236        '-s', '--shift', action="store",
237        dest="shift", type="int", default=0,
238        help="shift for the 5' end on each strand (default=0)",
239    )
240
241    # verbosity can be 0,1 and 2 (increasing verbosity)
242    parser.add_option(
243        '-v', '--verbosity', action="store",
244        dest="verbosity", type="int", default=1,
245        help="sets the verbosity (0, 1) (default=1)",
246    )
247
248    parser.add_option("-x", "--index",
249        action="store_true", dest="index", default=False,
250        help="also creates an hdf index for the file")
251
252    parser.add_option(
253        '-w', '--workdir', action="store",
254        dest="workdir", type='str', default=None,
255        help="work directory (optional)"
256    )
257
258    return parser
259
260if __name__ == '__main__':
261   
262    parser = option_parser()
263
264    options, args = parser.parse_args()
265
266    # uppercase the format
267    options.format = options.format.upper()
268
269    if options.format not in ('BED', 'GFF'):
270        sys.stdout = sys.stderr
271        parser.print_help()
272        sys.exit(-1)
273
274    logger.disable(options.verbosity)
275
276    # missing file names
277    if not (options.inpname and options.outname and options.format):
278        parser.print_help()
279        sys.exit(-1)
280    else:
281        transform(inpname=options.inpname, outname=options.outname,\
282            format=options.format, shift=options.shift, index=options.index, options=options)
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。