Context Navigation

bfast_wrapper.py

リビジョン 2, 17.7 KB (コミッタ: hatakeyama, 15 年前)
import galaxy-central

行番号
1	#!/usr/bin/env python
2
3	"""
4	Runs BFAST on single-end or paired-end data.
5	TODO: more documentation
6
7	TODO:
8	- auto-detect gzip or bz2
9	- split options (?)
10	- queue lengths (?)
11	- assumes reference always has been indexed
12	- main and secondary indexes
13	- scoring matrix file ?
14	- read group file ?
15
16	usage: bfast_wrapper.py [options]
17	-r, --ref=r: The reference genome to use or index
18	-f, --fastq=f: The fastq file to use for the mapping
19	-F, --output=u: The file to save the output (SAM format)
20	-s, --fileSource=s: Whether to use a previously indexed reference sequence or one from history (indexed or history)
21	-p, --params=p: Parameter setting to use (pre_set or full)
22	-n, --numThreads=n: The number of threads to use
23	-A, --space=A: The encoding space (0: base 1: color)
24	-o, --offsets=o: The offsets for 'match'
25	-l, --loadAllIndexes=l: Load all indexes into memory
26	-k, --keySize=k: truncate key size in 'match'
27	-K, --maxKeyMatches=K: the maximum number of matches to allow before a key is ignored
28	-M, --maxNumMatches=M: the maximum number of matches to allow before the read is discarded
29	-w, --whichStrand=w: the strands to consider (0: both 1: forward 2: reverse)
30	-t, --timing=t: output timing information to stderr
31	-u, --ungapped=u: performed ungapped local alignment
32	-U, --unconstrained=U: performed local alignment without mask constraints
33	-O, --offset=O: the number of bases before and after each hit to consider in local alignment
34	-q, --avgMismatchQuality=q: average mismatch quality
35	-a, --algorithm=a: post processing algorithm (0: no filtering, 1: all passing filters, 2: unique, 3: best scoring unique, 4: best score all)
36	-P, --disallowPairing=P: do not choose alignments based on pairing
37	-R, --reverse=R: paired end reads are given on reverse strands
38	-z, --random=z: output a random best scoring alignment
39	-D, --dbkey=D: Dbkey for reference genome
40	"""
41
42	import optparse, os, shutil, subprocess, sys, tempfile
43
44	def stop_err( msg ):
45	sys.stderr.write( '%s\n' % msg )
46	sys.exit()
47
48	def __main__():
49	#Parse Command Line
50	parser = optparse.OptionParser()
51	parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to index and use' )
52	parser.add_option( '-f', '--fastq', dest='fastq', help='The fastq file to use for the mapping' )
53	parser.add_option( '-F', '--output', dest='output', help='The file to save the output (SAM format)' )
54	parser.add_option( '-A', '--space', dest='space', type="choice", default='0', choices=('0','1' ), help='The encoding space (0: base 1: color)' )
55	parser.add_option( '-H', '--suppressHeader', action="store_true", dest='suppressHeader', default=False, help='Suppress header' )
56	parser.add_option( '-n', '--numThreads', dest='numThreads', type="int", default="1", help='The number of threads to use' )
57	parser.add_option( '-t', '--timing', action="store_true", default=False, dest='timing', help='output timming information to stderr' )
58	parser.add_option( '-l', '--loadAllIndexes', action="store_true", default=False, dest='loadAllIndexes', help='Load all indexes into memory' )
59	parser.add_option( '-m', '--indexMask', dest='indexMask', help='String containing info on how to build custom indexes' )
60	parser.add_option( "-b", "--buildIndex", action="store_true", dest="buildIndex", default=False, help='String containing info on how to build custom indexes' )
61	parser.add_option( "--indexRepeatMasker", action="store_true", dest="indexRepeatMasker", default=False, help='Do not index lower case sequences. Such as those created by RepeatMasker' )
62	parser.add_option( '--indexContigOptions', dest='indexContigOptions', default="", help='The contig range options to use for the indexing' )
63	parser.add_option( '--indexExonsFileName', dest='indexExonsFileName', default="", help='The exons file to use for the indexing' )
64
65	parser.add_option( '-o', '--offsets', dest='offsets', default="", help='The offsets for \'match\'' )
66	parser.add_option( '-k', '--keySize', dest='keySize', type="int", default="-1", help='truncate key size in \'match\'' )
67	parser.add_option( '-K', '--maxKeyMatches', dest='maxKeyMatches', type="int", default="-1", help='the maximum number of matches to allow before a key is ignored' )
68	parser.add_option( '-M', '--maxNumMatches', dest='maxNumMatches', type="int", default="-1", help='the maximum number of matches to allow bfore the read is discarded' )
69	parser.add_option( '-w', '--whichStrand', dest='whichStrand', type="choice", default='0', choices=('0','1','2'), help='the strands to consider (0: both 1: forward 2: reverse)' )
70
71	parser.add_option( '--scoringMatrixFileName', dest='scoringMatrixFileName', help='Scoring Matrix file used to score the alignments' )
72	parser.add_option( '-u', '--ungapped', dest='ungapped', action="store_true", default=False, help='performed ungapped local alignment' )
73	parser.add_option( '-U', '--unconstrained', dest='unconstrained', action="store_true", default=False, help='performed local alignment without mask constraints' )
74	parser.add_option( '-O', '--offset', dest='offset', type="int", default="0", help='the number of bases before and after each hit to consider in local alignment' )
75	parser.add_option( '-q', '--avgMismatchQuality', type="int", default="-1", dest='avgMismatchQuality', help='average mismatch quality' )
76
77	parser.add_option( '-a', '--algorithm', dest='algorithm', default='0', type="choice", choices=('0','1','2','3','4' ), help='post processing algorithm (0: no filtering, 1: all passing filters, 2: unique, 3: best scoring unique, 4: best score all' )
78	parser.add_option( '--unpaired', dest='unpaired', action="store_true", default=False, help='do not choose alignments based on pairing' )
79	parser.add_option( '--reverseStrand', dest='reverseStrand', action="store_true", default=False, help='paired end reads are given on reverse strands' )
80	parser.add_option( '--pairedEndInfer', dest='pairedEndInfer', action="store_true", default=False, help='break ties when one end of a paired end read by estimating the insert size distribution' )
81	parser.add_option( '--randomBest', dest='randomBest', action="store_true", default=False, help='output a random best scoring alignment' )
82
83	(options, args) = parser.parse_args()
84
85	buffsize = 1048576
86
87	# make temp directory for bfast, requires trailing slash
88	tmp_dir = '%s/' % tempfile.mkdtemp()
89
90	#'generic' options used in all bfast commands here
91	if options.timing:
92	all_cmd_options = "-t"
93	else:
94	all_cmd_options = ""
95
96	try:
97	if options.buildIndex:
98	reference_filepath = tempfile.NamedTemporaryFile( dir=tmp_dir, suffix='.fa' ).name
99	#build bfast indexes
100	os.symlink( options.ref, reference_filepath )
101
102	#bfast fast2brg
103	try:
104	nuc_space = [ "0" ]
105	if options.space == "1":
106	#color space localalign appears to require nuc space brg
107	nuc_space.append( "1" )
108	for space in nuc_space:
109	cmd = 'bfast fasta2brg -f "%s" -A "%s" %s' % ( reference_filepath, space, all_cmd_options )
110	tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
111	tmp_stderr = open( tmp, 'wb' )
112	proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
113	returncode = proc.wait()
114	tmp_stderr.close()
115	# get stderr, allowing for case where it's very large
116	tmp_stderr = open( tmp, 'rb' )
117	stderr = ''
118	try:
119	while True:
120	stderr += tmp_stderr.read( buffsize )
121	if not stderr or len( stderr ) % buffsize != 0:
122	break
123	except OverflowError:
124	pass
125	tmp_stderr.close()
126	if returncode != 0:
127	raise Exception, stderr
128	except Exception, e:
129	raise Exception, 'Error in \'bfast fasta2brg\'.\n' + str( e )
130
131	#bfast index
132	try:
133	all_index_cmds = 'bfast index %s -f "%s" -A "%s" -n "%s"' % ( all_cmd_options, reference_filepath, options.space, options.numThreads )
134
135	if options.indexRepeatMasker:
136	all_index_cmds += " -R"
137
138	if options.indexContigOptions:
139	index_contig_options = map( int, options.indexContigOptions.split( ',' ) )
140	if index_contig_options[0] >= 0:
141	all_index_cmds += ' -s "%s"' % index_contig_options[0]
142	if index_contig_options[1] >= 0:
143	all_index_cmds += ' -S "%s"' % index_contig_options[1]
144	if index_contig_options[2] >= 0:
145	all_index_cmds += ' -e "%s"' % index_contig_options[2]
146	if index_contig_options[3] >= 0:
147	all_index_cmds += ' -E "%s"' % index_contig_options[3]
148	elif options.indexExonsFileName:
149	all_index_cmds += ' -x "%s"' % options.indexExonsFileName
150
151	index_count = 1
152	for mask, hash_width in [ mask.split( ':' ) for mask in options.indexMask.split( ',' ) ]:
153	cmd = '%s -m "%s" -w "%s" -i "%i"' % ( all_index_cmds, mask, hash_width, index_count )
154	tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
155	tmp_stderr = open( tmp, 'wb' )
156	proc = subprocess.Popen( args=cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
157	returncode = proc.wait()
158	tmp_stderr.close()
159	# get stderr, allowing for case where it's very large
160	tmp_stderr = open( tmp, 'rb' )
161	stderr = ''
162	try:
163	while True:
164	stderr += tmp_stderr.read( buffsize )
165	if not stderr or len( stderr ) % buffsize != 0:
166	break
167	except OverflowError:
168	pass
169	tmp_stderr.close()
170	if returncode != 0:
171	raise Exception, stderr
172	index_count += 1
173	except Exception, e:
174	raise Exception, 'Error in \'bfast index\'.\n' + str( e )
175
176	else:
177	reference_filepath = options.ref
178	assert reference_filepath and os.path.exists( reference_filepath ), 'A valid genome reference was not provided.'
179
180	# set up aligning and generate aligning command options
181	# set up temp output files
182	tmp_bmf = tempfile.NamedTemporaryFile( dir=tmp_dir )
183	tmp_bmf_name = tmp_bmf.name
184	tmp_bmf.close()
185	tmp_baf = tempfile.NamedTemporaryFile( dir=tmp_dir )
186	tmp_baf_name = tmp_baf.name
187	tmp_baf.close()
188
189	bfast_match_cmd = 'bfast match -f "%s" -r "%s" -n "%s" -A "%s" -T "%s" -w "%s" %s' % ( reference_filepath, options.fastq, options.numThreads, options.space, tmp_dir, options.whichStrand, all_cmd_options )
190	bfast_localalign_cmd = 'bfast localalign -f "%s" -m "%s" -n "%s" -A "%s" -o "%s" %s' % ( reference_filepath, tmp_bmf_name, options.numThreads, options.space, options.offset, all_cmd_options )
191	bfast_postprocess_cmd = 'bfast postprocess -O 1 -f "%s" -i "%s" -n "%s" -A "%s" -a "%s" %s' % ( reference_filepath, tmp_baf_name, options.numThreads, options.space, options.algorithm, all_cmd_options )
192
193	if options.offsets:
194	bfast_match_cmd += ' -o "%s"' % options.offsets
195	if options.keySize >= 0:
196	bfast_match_cmd += ' -k "%s"' % options.keySize
197	if options.maxKeyMatches >= 0:
198	bfast_match_cmd += ' -K "%s"' % options.maxKeyMatches
199	if options.maxNumMatches >= 0:
200	bfast_match_cmd += ' -M "%s"' % options.maxNumMatches
201	bfast_localalign_cmd += ' -M "%s"' % options.maxNumMatches
202	if options.scoringMatrixFileName:
203	bfast_localalign_cmd += ' -x "%s"' % options.scoringMatrixFileName
204	bfast_postprocess_cmd += ' -x "%s"' % options.scoringMatrixFileName
205	if options.ungapped:
206	bfast_localalign_cmd += ' -u'
207	if options.unconstrained:
208	bfast_localalign_cmd += ' -U'
209	if options.avgMismatchQuality >= 0:
210	bfast_localalign_cmd += ' -q "%s"' % options.avgMismatchQuality
211	bfast_postprocess_cmd += ' -q "%s"' % options.avgMismatchQuality
212	if options.algorithm == 3:
213	if options.pairedEndInfer:
214	bfast_postprocess_cmd += ' -P'
215	if options.randomBest:
216	bfast_postprocess_cmd += ' -z'
217	if options.unpaired:
218	bfast_postprocess_cmd += ' -U'
219	if options.reverseStrand:
220	bfast_postprocess_cmd += ' -R'
221
222	#instead of using temp files, should we stream through pipes?
223	bfast_match_cmd += " > %s" % tmp_bmf_name
224	bfast_localalign_cmd += " > %s" % tmp_baf_name
225	bfast_postprocess_cmd += " > %s" % options.output
226
227	# need to nest try-except in try-finally to handle 2.4
228	try:
229	# bfast 'match'
230	try:
231	tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
232	tmp_stderr = open( tmp, 'wb' )
233	proc = subprocess.Popen( args=bfast_match_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
234	returncode = proc.wait()
235	tmp_stderr.close()
236	# get stderr, allowing for case where it's very large
237	tmp_stderr = open( tmp, 'rb' )
238	stderr = ''
239	try:
240	while True:
241	stderr += tmp_stderr.read( buffsize )
242	if not stderr or len( stderr ) % buffsize != 0:
243	break
244	except OverflowError:
245	pass
246	tmp_stderr.close()
247	if returncode != 0:
248	raise Exception, stderr
249	except Exception, e:
250	raise Exception, 'Error in \'bfast match\'. \n' + str( e )
251	# bfast 'localalign'
252	try:
253	tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
254	tmp_stderr = open( tmp, 'wb' )
255	proc = subprocess.Popen( args=bfast_localalign_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
256	returncode = proc.wait()
257	tmp_stderr.close()
258	# get stderr, allowing for case where it's very large
259	tmp_stderr = open( tmp, 'rb' )
260	stderr = ''
261	try:
262	while True:
263	stderr += tmp_stderr.read( buffsize )
264	if not stderr or len( stderr ) % buffsize != 0:
265	break
266	except OverflowError:
267	pass
268	tmp_stderr.close()
269	if returncode != 0:
270	raise Exception, stderr
271	except Exception, e:
272	raise Exception, 'Error in \'bfast localalign\'. \n' + str( e )
273	# bfast 'postprocess'
274	try:
275	tmp = tempfile.NamedTemporaryFile( dir=tmp_dir ).name
276	tmp_stderr = open( tmp, 'wb' )
277	proc = subprocess.Popen( args=bfast_postprocess_cmd, shell=True, cwd=tmp_dir, stderr=tmp_stderr.fileno() )
278	returncode = proc.wait()
279	tmp_stderr.close()
280	# get stderr, allowing for case where it's very large
281	tmp_stderr = open( tmp, 'rb' )
282	stderr = ''
283	try:
284	while True:
285	stderr += tmp_stderr.read( buffsize )
286	if not stderr or len( stderr ) % buffsize != 0:
287	break
288	except OverflowError:
289	pass
290	tmp_stderr.close()
291	if returncode != 0:
292	raise Exception, stderr
293	except Exception, e:
294	raise Exception, 'Error in \'bfast postprocess\'. \n' + str( e )
295	# remove header if necessary
296	if options.suppressHeader:
297	tmp_out = tempfile.NamedTemporaryFile( dir=tmp_dir)
298	tmp_out_name = tmp_out.name
299	tmp_out.close()
300	try:
301	shutil.move( options.output, tmp_out_name )
302	except Exception, e:
303	raise Exception, 'Error moving output file before removing headers. \n' + str( e )
304	fout = file( options.output, 'w' )
305	for line in file( tmp_out.name, 'r' ):
306	if len( line ) < 3 or line[0:3] not in [ '@HD', '@SQ', '@RG', '@PG', '@CO' ]:
307	fout.write( line )
308	fout.close()
309	# check that there are results in the output file
310	if os.path.getsize( options.output ) > 0:
311	if "0" == options.space:
312	sys.stdout.write( 'BFAST run on Base Space data' )
313	else:
314	sys.stdout.write( 'BFAST run on Color Space data' )
315	else:
316	raise Exception, 'The output file is empty. You may simply have no matches, or there may be an error with your input file or settings.'
317	except Exception, e:
318	stop_err( 'The alignment failed.\n' + str( e ) )
319	finally:
320	# clean up temp dir
321	if os.path.exists( tmp_dir ):
322	shutil.rmtree( tmp_dir )
323
324	if __name__=="__main__": __main__()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/sr_mapping/bfast_wrapper.py

異なるフォーマットでダウンロード: