Context Navigation

tabs2genetrack.py

リビジョン 3, 9.2 KB (コミッタ: kohda, 15 年前)
Install Unix tools http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号
1	"""
2	File transformer. Takes a tabuolar file (bed or gff) and
3	transforms them to a file used as genetrack input.
4	Optional parameters may be used to shift the 5' ends with
5	a specified amount. This is useful if the file corresponds to data
6	with fixed fragment widths you can move each fragment to the
7	center location. The program may be invoked in multiple ways.
8	As a standalone script::
9
10	python tabs2genetrack.py
11
12	As a python module::
13
14	python -m genetrack.scripts.tabs2genetrack
15
16	Or in other python scripts::
17
18	>>>
19	>>> from genetrack.scripts import tabs2genetrack
20	>>> tabs2genetrack.transform( inpname, outname, format='bed', shift=0)
21	>>>
22
23	Run the script with no parameters to see the options that it takes.
24
25	A genetrack input file format is a tab delimited text file
26	described in the API documentation of the PositionalData
27	class: `genetrack.hdflib.PositionalData`
28	The transformation is a three step process, transform,
29	sort and consolidate. It will create files in the
30	genetrack temporary data directory and it will remove the intermediate files
31	when the process is complete.
32
33	Observed runtime: tranformation rate of 2 million lines per minute
34
35	Note1: The script will invoke the system `sort` command to sort the file
36	that is substantially faster under Unix than Windows.
37
38	"""
39	import os, sys, csv, shutil
40	from itertools import *
41	from genetrack import logger, conf, util, hdflib
42
43	def consolidate( inpname, outname, format):
44	"""
45	Consolidates an input file.
46	Merges multiple indentical indices into one line
47	"""
48	fp = open(outname, 'wt')
49
50	# recover the original basename
51	basename = os.path.basename(inpname).replace('.sorted', '')
52
53	# create a few information headers
54	fp.write("#\n# created with tabs2genetrack\n")
55	fp.write("# source: %s, format %s\n#\n" % (basename, format) )
56	fp.write("chrom\tindex\tforward\treverse\tvalue\n")
57
58	# the main reader
59	reader = csv.reader(open(inpname, 'rb'), delimiter='\t')
60
61	# will duplicate some code to avoid an extra conditional internally
62	chrom, index, fwd, rev, val = reader.next()
63	lastindex, fwd, rev, val = int(index), int(fwd), int(rev), int(val)
64	collect = [ chrom, lastindex, fwd, rev, val ]
65
66	for row in reader:
67	chrom, index, fwd, rev, val = row
68	index, fwd, rev, val = int(index), int(fwd), int(rev), int(val)
69	if index == lastindex:
70	collect[2] += fwd
71	collect[3] += rev
72	collect[4] += val
73	else:
74	# write out the collected data
75	fp.write( '%s\t%s\t%s\t%s\t%s\n' % tuple(collect) )
76	collect = [ chrom, index, fwd, rev, val ]
77	lastindex = index
78
79	fp.close()
80
81	def transform(inpname, outname, format, shift=0, index=False, options=None):
82	"""
83	Transforms reads stored in bedfile to a genetrack input file.
84	Requires at least 6 bed columns to access the strand.
85	"""
86
87	# detect file formats
88	if format == "BED":
89	CHROM, START, END, STRAND = 0, 1, 2, 5
90	elif format == "GFF":
91	CHROM, START, END, STRAND = 0, 3, 4, 6
92	else:
93	raise Exception('Invalid file format' % format)
94
95	# two sanity checks, one day someone will thank me
96	if format == 'BED' and inpname.endswith('gff'):
97	raise Exception('BED format on a gff file?')
98	if format == 'GFF' and inpname.endswith('bed'):
99	raise Exception('GFF format on a bed file?')
100
101	# find the basename of the outputname
102	basename = os.path.basename(outname)
103
104	# two files store intermediate results
105	flat = conf.tempdata( '%s.flat' % basename )
106	sorted = conf.tempdata( '%s.sorted' % basename )
107
108	# check for track information on first line,
109	# much faster this way than conditional checking on each line
110	fp = file(inpname, 'rU')
111	first = fp.readline()
112	fp.close()
113
114	# create the reader
115	reader = csv.reader(file(inpname, 'rU'), delimiter='\t')
116
117	# skip if trackline exists
118	if first.startswith == 'track':
119	reader.next()
120
121	# unwind the comments
122	list(takewhile(lambda x: x[0].startswith('#'), reader))
123
124	# copious timing info for those who enjoy these
125	timer, full = util.Timer(), util.Timer()
126
127	logger.debug("parsing '%s'" % inpname)
128	logger.debug("output to '%s'" % outname)
129
130	# create the unsorted output file and apply corrections
131	logger.debug("unsorted flat file '%s'" % flat)
132
133	fp = file(flat, 'wt')
134	for linec, row in enumerate(reader):
135	try:
136	chrom, start, end, strand = row[CHROM], row[START], row[END], row[STRAND]
137	except Exception, exc:
138	first = row[0][0]
139	# may be hitting the end of the file with other comments
140	if first == '>':
141	break # hit the sequence content of the gff file
142	elif first == '#':
143	continue # hit upon some comments
144	else:
145	logger.error(row)
146	raise Exception(exc)
147
148	if strand == '+':
149	# on forward strand, 5' is at start
150	idx = int(start) + shift
151	fwd, rev, val = 1, 0, 1
152	elif strand == '-':
153	# on reverse strand, 5' is at end
154	idx = int(end) - shift
155	fwd, rev, val = 0, 1, 1
156	else:
157	# no strand specified, generate interval centers
158	idx = (int(start)+int(end))/2
159	fwd, rev, val = 0, 0, 1
160
161	# it is essential be able to sort the index as a string!
162	fp.write('%s\t%012d\t%s\t%s\t%s\n' % (chrom, idx, fwd, rev, val))
163
164	fp.close()
165	linet = util.commify(linec)
166	logger.debug("parsing %s lines finished in %s" % (linet, timer.report()))
167
168	# if it is producing coverage then it will expand reads into full intervaals
169
170	# now let the sorting commence
171	cmd = "sort %s > %s" % (flat, sorted)
172	logger.debug("sorting into '%s'" % sorted)
173	os.system(cmd)
174	logger.debug("sorting finished in %s" % timer.report() )
175
176	logger.debug("consolidating into '%s'" % outname)
177	consolidate( sorted, outname, format=format)
178	logger.debug("consolidate finished in %s" % timer.report() )
179	logger.debug("output saved to '%s'" % outname)
180	logger.debug("full conversion finished in %s" % full.report() )
181
182	# attempting to cleanup the remaining files
183	for name in (flat, sorted):
184	logger.debug("removing temporary file '%s'" % name )
185	os.remove(name)
186
187	# also runs the indexing on it
188	if index:
189	logger.debug("loading the index from '%s'" % outname)
190	# create the work directory
191	if options.workdir and not os.path.isdir(options.workdir):
192	os.mkdir(options.workdir)
193	result = hdflib.PositionalData(fname=outname, update=True, workdir=options.workdir)
194	logger.debug("indexing finished in %s" % timer.report() )
195	result.close()
196
197	logger.debug("moving index to main output '%s'" % outname )
198
199	# remove the intermediate file
200	os.remove(outname)
201
202	# move the index as the output file
203	shutil.move(result.index, outname)
204
205	def option_parser():
206	"The option parser may be constructed in other tools invoking this script"
207	import optparse
208
209	usage = "usage: %prog -i inputfile -o outputfile -f format"
210
211	parser = optparse.OptionParser(usage=usage)
212
213	# setting the input file name
214	parser.add_option(
215	'-i', '--input', action="store",
216	dest="inpname", type='str', default=None,
217	help="the input file name (required)"
218	)
219
220	# setting the output file name
221	parser.add_option(
222	'-o', '--output', action="store",
223	dest="outname", type='str', default=None,
224	help="output file name (required)"
225	)
226
227	# file formats
228	parser.add_option(
229	'-f', '--format', action="store",
230	dest="format", type="str", default='',
231	help="input file format, bed or gff (required)",
232	)
233
234	# correction shift added in 5' direction for start/end coordinates
235	parser.add_option(
236	'-s', '--shift', action="store",
237	dest="shift", type="int", default=0,
238	help="shift for the 5' end on each strand (default=0)",
239	)
240
241	# verbosity can be 0,1 and 2 (increasing verbosity)
242	parser.add_option(
243	'-v', '--verbosity', action="store",
244	dest="verbosity", type="int", default=1,
245	help="sets the verbosity (0, 1) (default=1)",
246	)
247
248	parser.add_option("-x", "--index",
249	action="store_true", dest="index", default=False,
250	help="also creates an hdf index for the file")
251
252	parser.add_option(
253	'-w', '--workdir', action="store",
254	dest="workdir", type='str', default=None,
255	help="work directory (optional)"
256	)
257
258	return parser
259
260	if __name__ == '__main__':
261
262	parser = option_parser()
263
264	options, args = parser.parse_args()
265
266	# uppercase the format
267	options.format = options.format.upper()
268
269	if options.format not in ('BED', 'GFF'):
270	sys.stdout = sys.stderr
271	parser.print_help()
272	sys.exit(-1)
273
274	logger.disable(options.verbosity)
275
276	# missing file names
277	if not (options.inpname and options.outname and options.format):
278	parser.print_help()
279	sys.exit(-1)
280	else:
281	transform(inpname=options.inpname, outname=options.outname,\
282	format=options.format, shift=options.shift, index=options.index, options=options)

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/eggs/GeneTrack-2.0.0_beta_1_dev_48da9e998f0caf01c5be731e926f4b0481f658f0-py2.6.egg/genetrack/scripts/tabs2genetrack.py

異なるフォーマットでダウンロード: