Context Navigation

bowtie_wrapper.py

リビジョン 2, 23.5 KB (コミッタ: hatakeyama, 15 年前)
import galaxy-central

行番号
1	#!/usr/bin/env python
2
3	"""
4	Runs Bowtie on single-end or paired-end data.
5	For use with Bowtie v. 0.12.3
6
7	usage: bowtie_wrapper.py [options]
8	-t, --threads=t: The number of threads to run
9	-i, --input1=i: The (forward or single-end) reads file in Sanger FASTQ format
10	-I, --input2=I: The reverse reads file in Sanger FASTQ format
11	-o, --output=o: The output file
12	-4, --dataType=4: The type of data (SOLiD or Solexa)
13	-2, --paired=2: Whether the data is single- or paired-end
14	-g, --genomeSource=g: The type of reference provided
15	-r, --ref=r: The reference genome to use or index
16	-s, --skip=s: Skip the first n reads
17	-a, --alignLimit=a: Only align the first n reads
18	-T, --trimH=T: Trim n bases from high-quality (left) end of each read before alignment
19	-L, --trimL=L: Trim n bases from low-quality (right) end of each read before alignment
20	-m, --mismatchSeed=m: Maximum number of mismatches permitted in the seed
21	-M, --mismatchQual=M: Maximum permitted total of quality values at mismatched read positions
22	-l, --seedLen=l: Seed length
23	-n, --rounding=n: Whether or not to round to the nearest 10 and saturating at 30
24	-P, --maqSoapAlign=P: Choose MAQ- or SOAP-like alignment policy
25	-w, --tryHard=: Whether or not to try as hard as possible to find valid alignments when they exist
26	-v, --valAlign=v: Report up to n valid arguments per read
27	-V, --allValAligns=V: Whether or not to report all valid alignments per read
28	-G, --suppressAlign=G: Suppress all alignments for a read if more than n reportable alignments exist
29	-b, --best=b: Whether or not to make Bowtie guarantee that reported singleton alignments are 'best' in terms of stratum and in terms of the quality values at the mismatched positions
30	-B, --maxBacktracks=B: Maximum number of backtracks permitted when aligning a read
31	-R, --strata=R: Whether or not to report only those alignments that fall in the best stratum if many valid alignments exist and are reportable
32	-j, --minInsert=j: Minimum insert size for valid paired-end alignments
33	-J, --maxInsert=J: Maximum insert size for valid paired-end alignments
34	-O, --mateOrient=O: The upstream/downstream mate orientation for valid paired-end alignment against the forward reference strand
35	-A, --maxAlignAttempt=A: Maximum number of attempts Bowtie will make to match an alignment for one mate with an alignment for the opposite mate
36	-f, --forwardAlign=f: Whether or not to attempt to align the forward reference strand
37	-E, --reverseAlign=E: Whether or not to attempt to align the reverse-complement reference strand
38	-F, --offrate=F: Override the offrate of the index to n
39	-8, --snpphred=8: SNP penalty on Phred scale
40	-6, --snpfrac=6: Fraction of sites expected to be SNP sites
41	-7, --keepends=7: Keep extreme-end nucleotides and qualities
42	-S, --seed=S: Seed for pseudo-random number generator
43	-C, --params=C: Whether to use default or specified parameters
44	-u, --iautoB=u: Automatic or specified behavior
45	-K, --ipacked=K: Whether or not to use a packed representation for DNA strings
46	-Q, --ibmax=Q: Maximum number of suffixes allowed in a block
47	-Y, --ibmaxdivn=Y: Maximum number of suffixes allowed in a block as a fraction of the length of the reference
48	-D, --idcv=D: The period for the difference-cover sample
49	-U, --inodc=U: Whether or not to disable the use of the difference-cover sample
50	-y, --inoref=y: Whether or not to build the part of the reference index used only in paired-end alignment
51	-z, --ioffrate=z: How many rows get marked during annotation of some or all of the Burrows-Wheeler rows
52	-W, --iftab=W: The size of the lookup table used to calculate an initial Burrows-Wheeler range with respect to the first n characters of the query
53	-X, --intoa=X: Whether or not to convert Ns in the reference sequence to As
54	-N, --iendian=N: Endianness to use when serializing integers to the index file
55	-Z, --iseed=Z: Seed for the pseudorandom number generator
56	-c, --icutoff=c: Number of first bases of the reference sequence to index
57	-x, --indexSettings=x: Whether or not indexing options are to be set
58	-H, --suppressHeader=H: Suppress header
59	--do_not_build_index: Flag to specify that provided file is already indexed and to just use 'as is'
60	"""
61
62	import optparse, os, shutil, subprocess, sys, tempfile
63
64	def stop_err( msg ):
65	sys.stderr.write( '%s\n' % msg )
66	sys.exit()
67
68	def __main__():
69	#Parse Command Line
70	parser = optparse.OptionParser()
71	parser.add_option( '-t', '--threads', dest='threads', help='The number of threads to run' )
72	parser.add_option( '-4', '--dataType', dest='dataType', help='The type of data (SOLiD or Solexa)' )
73	parser.add_option( '-i', '--input1', dest='input1', help='The (forward or single-end) reads file in Sanger FASTQ format' )
74	parser.add_option( '-I', '--input2', dest='input2', help='The reverse reads file in Sanger FASTQ format' )
75	parser.add_option( '-o', '--output', dest='output', help='The output file' )
76	parser.add_option( '-2', '--paired', dest='paired', help='Whether the data is single- or paired-end' )
77	parser.add_option( '-g', '--genomeSource', dest='genomeSource', help='The type of reference provided' )
78	parser.add_option( '-r', '--ref', dest='ref', help='The reference genome to use or index' )
79	parser.add_option( '-s', '--skip', dest='skip', help='Skip the first n reads' )
80	parser.add_option( '-a', '--alignLimit', dest='alignLimit', help='Only align the first n reads' )
81	parser.add_option( '-T', '--trimH', dest='trimH', help='Trim n bases from high-quality (left) end of each read before alignment' )
82	parser.add_option( '-L', '--trimL', dest='trimL', help='Trim n bases from low-quality (right) end of each read before alignment' )
83	parser.add_option( '-m', '--mismatchSeed', dest='mismatchSeed', help='Maximum number of mismatches permitted in the seed' )
84	parser.add_option( '-M', '--mismatchQual', dest='mismatchQual', help='Maximum permitted total of quality values at mismatched read positions' )
85	parser.add_option( '-l', '--seedLen', dest='seedLen', help='Seed length' )
86	parser.add_option( '-n', '--rounding', dest='rounding', help='Whether or not to round to the nearest 10 and saturating at 30' )
87	parser.add_option( '-P', '--maqSoapAlign', dest='maqSoapAlign', help='Choose MAQ- or SOAP-like alignment policy' )
88	parser.add_option( '-w', '--tryHard', dest='tryHard', help='Whether or not to try as hard as possible to find valid alignments when they exist' )
89	parser.add_option( '-v', '--valAlign', dest='valAlign', help='Report up to n valid arguments per read' )
90	parser.add_option( '-V', '--allValAligns', dest='allValAligns', help='Whether or not to report all valid alignments per read' )
91	parser.add_option( '-G', '--suppressAlign', dest='suppressAlign', help='Suppress all alignments for a read if more than n reportable alignments exist' )
92	parser.add_option( '-b', '--best', dest='best', help="Whether or not to make Bowtie guarantee that reported singleton alignments are 'best' in terms of stratum and in terms of the quality values at the mismatched positions" )
93	parser.add_option( '-B', '--maxBacktracks', dest='maxBacktracks', help='Maximum number of backtracks permitted when aligning a read' )
94	parser.add_option( '-R', '--strata', dest='strata', help='Whether or not to report only those alignments that fall in the best stratum if many valid alignments exist and are reportable' )
95	parser.add_option( '-j', '--minInsert', dest='minInsert', help='Minimum insert size for valid paired-end alignments' )
96	parser.add_option( '-J', '--maxInsert', dest='maxInsert', help='Maximum insert size for valid paired-end alignments' )
97	parser.add_option( '-O', '--mateOrient', dest='mateOrient', help='The upstream/downstream mate orientation for valid paired-end alignment against the forward reference strand' )
98	parser.add_option( '-A', '--maxAlignAttempt', dest='maxAlignAttempt', help='Maximum number of attempts Bowtie will make to match an alignment for one mate with an alignment for the opposite mate' )
99	parser.add_option( '-f', '--forwardAlign', dest='forwardAlign', help='Whether or not to attempt to align the forward reference strand' )
100	parser.add_option( '-E', '--reverseAlign', dest='reverseAlign', help='Whether or not to attempt to align the reverse-complement reference strand' )
101	parser.add_option( '-F', '--offrate', dest='offrate', help='Override the offrate of the index to n' )
102	parser.add_option( '-S', '--seed', dest='seed', help='Seed for pseudo-random number generator' )
103	parser.add_option( '-8', '--snpphred', dest='snpphred', help='SNP penalty on Phred scale' )
104	parser.add_option( '-6', '--snpfrac', dest='snpfrac', help='Fraction of sites expected to be SNP sites' )
105	parser.add_option( '-7', '--keepends', dest='keepends', help='Keep extreme-end nucleotides and qualities' )
106	parser.add_option( '-C', '--params', dest='params', help='Whether to use default or specified parameters' )
107	parser.add_option( '-u', '--iautoB', dest='iautoB', help='Automatic or specified behavior' )
108	parser.add_option( '-K', '--ipacked', dest='ipacked', help='Whether or not to use a packed representation for DNA strings' )
109	parser.add_option( '-Q', '--ibmax', dest='ibmax', help='Maximum number of suffixes allowed in a block' )
110	parser.add_option( '-Y', '--ibmaxdivn', dest='ibmaxdivn', help='Maximum number of suffixes allowed in a block as a fraction of the length of the reference' )
111	parser.add_option( '-D', '--idcv', dest='idcv', help='The period for the difference-cover sample' )
112	parser.add_option( '-U', '--inodc', dest='inodc', help='Whether or not to disable the use of the difference-cover sample' )
113	parser.add_option( '-y', '--inoref', dest='inoref', help='Whether or not to build the part of the reference index used only in paired-end alignment' )
114	parser.add_option( '-z', '--ioffrate', dest='ioffrate', help='How many rows get marked during annotation of some or all of the Burrows-Wheeler rows' )
115	parser.add_option( '-W', '--iftab', dest='iftab', help='The size of the lookup table used to calculate an initial Burrows-Wheeler range with respect to the first n characters of the query' )
116	parser.add_option( '-X', '--intoa', dest='intoa', help='Whether or not to convert Ns in the reference sequence to As' )
117	parser.add_option( '-N', '--iendian', dest='iendian', help='Endianness to use when serializing integers to the index file' )
118	parser.add_option( '-Z', '--iseed', dest='iseed', help='Seed for the pseudorandom number generator' )
119	parser.add_option( '-c', '--icutoff', dest='icutoff', help='Number of first bases of the reference sequence to index' )
120	parser.add_option( '-x', '--indexSettings', dest='index_settings', help='Whether or not indexing options are to be set' )
121	parser.add_option( '-H', '--suppressHeader', dest='suppressHeader', help='Suppress header' )
122	parser.add_option( '--do_not_build_index', dest='do_not_build_index', action="store_true", default=False, help='Flag to specify that provided file is already indexed, use as is' )
123	(options, args) = parser.parse_args()
124	stdout = ''
125
126	# make temp directory for placement of indices and copy reference file there if necessary
127	tmp_index_dir = tempfile.mkdtemp()
128	# get type of data (solid or solexa)
129	if options.dataType == 'solid':
130	colorspace = '-C'
131	else:
132	colorspace = ''
133	# index if necessary
134	if options.genomeSource == 'history' and not options.do_not_build_index:
135	# set up commands
136	if options.index_settings =='indexPreSet':
137	indexing_cmds = '%s' % colorspace
138	else:
139	try:
140	if options.iautoB != 'None' and options.iautoB == 'set':
141	iautoB = '--noauto'
142	else:
143	iautoB = ''
144	if options. ipacked != 'None' and options.ipacked == 'packed':
145	ipacked = '--packed'
146	else:
147	ipacked = ''
148	if options.ibmax != 'None' and int( options.ibmax ) >= 1:
149	ibmax = '--bmax %s' % options.ibmax
150	else:
151	ibmax = ''
152	if options.ibmaxdivn != 'None' and int( options.ibmaxdivn ) >= 0:
153	ibmaxdivn = '--bmaxdivn %s' % options.ibmaxdivn
154	else:
155	ibmaxdivn = ''
156	if options.idcv != 'None' and int( options.idcv ) > 0:
157	idcv = '--dcv %s' % options.idcv
158	else:
159	idcv = ''
160	if options.inodc != 'None' and options.inodc == 'nodc':
161	inodc = '--nodc'
162	else:
163	inodc = ''
164	if options.inoref != 'None' and options.inoref == 'noref':
165	inoref = '--noref'
166	else:
167	inoref = ''
168	if options.iftab != 'None' and int( options.iftab ) >= 0:
169	iftab = '--ftabchars %s' % options.iftab
170	else:
171	iftab = ''
172	if options.intoa != 'None' and options.intoa == 'yes':
173	intoa = '--ntoa'
174	else:
175	intoa = ''
176	if options.iendian != 'None' and options.iendian == 'big':
177	iendian = '--big'
178	else:
179	iendian = '--little'
180	if options.iseed != 'None' and int( options.iseed ) > 0:
181	iseed = '--seed %s' % options.iseed
182	else:
183	iseed = ''
184	if options.icutoff != 'None' and int( options.icutoff ) > 0:
185	icutoff = '--cutoff %s' % options.icutoff
186	else:
187	icutoff = ''
188	indexing_cmds = '%s %s %s %s %s %s %s --offrate %s %s %s %s %s %s %s' % \
189	( iautoB, ipacked, ibmax, ibmaxdivn, idcv, inodc,
190	inoref, options.ioffrate, iftab, intoa, iendian,
191	iseed, icutoff, colorspace )
192	except ValueError, e:
193	# clean up temp dir
194	if os.path.exists( tmp_index_dir ):
195	shutil.rmtree( tmp_index_dir )
196	stop_err( "Something is wrong with the indexing parameters and the indexing and alignment could not be run. Make sure you don't have any non-numeric values where they should be numeric.\n" + str( e ) )
197	ref_file = tempfile.NamedTemporaryFile( dir=tmp_index_dir )
198	ref_file_name = ref_file.name
199	ref_file.close()
200	os.symlink( options.ref, ref_file_name )
201	cmd1 = 'bowtie-build %s -f %s %s' % ( indexing_cmds, ref_file_name, ref_file_name )
202	try:
203	tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
204	tmp_stderr = open( tmp, 'wb' )
205	proc = subprocess.Popen( args=cmd1, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
206	returncode = proc.wait()
207	tmp_stderr.close()
208	# get stderr, allowing for case where it's very large
209	tmp_stderr = open( tmp, 'rb' )
210	stderr = ''
211	buffsize = 1048576
212	try:
213	while True:
214	stderr += tmp_stderr.read( buffsize )
215	if not stderr or len( stderr ) % buffsize != 0:
216	break
217	except OverflowError:
218	pass
219	tmp_stderr.close()
220	if returncode != 0:
221	raise Exception, stderr
222	except Exception, e:
223	# clean up temp dir
224	if os.path.exists( tmp_index_dir ):
225	shutil.rmtree( tmp_index_dir )
226	stop_err( 'Error indexing reference sequence\n' + str( e ) )
227	stdout += 'File indexed. '
228	else:
229	ref_file_name = options.ref
230	# set up aligning and generate aligning command options
231	# automatically set threads in both cases
232	if options.suppressHeader == 'true':
233	suppressHeader = '--sam-nohead'
234	else:
235	suppressHeader = ''
236	if options.maxInsert != 'None' and int( options.maxInsert ) > 0:
237	maxInsert = '-X %s' % options.maxInsert
238	else:
239	maxInsert = ''
240	if options.mateOrient != 'None':
241	mateOrient = '--%s' % options.mateOrient
242	else:
243	mateOrient = ''
244	if options.params == 'preSet':
245	aligning_cmds = '%s %s -p %s -S %s -q %s ' % \
246	( maxInsert, mateOrient, options.threads, suppressHeader, colorspace )
247	else:
248	try:
249	if options.skip != 'None' and int( options.skip ) > 0:
250	skip = '-s %s' % options.skip
251	else:
252	skip = ''
253	if options.alignLimit != 'None' and int( options.alignLimit ) >= 0:
254	alignLimit = '-u %s' % options.alignLimit
255	else:
256	alignLimit = ''
257	if options.trimH != 'None' and int( options.trimH ) > 0:
258	trimH = '-5 %s' % options.trimH
259	else:
260	trimH = ''
261	if options.trimL != 'None' and int( options.trimL ) > 0:
262	trimL = '-3 %s' % options.trimL
263	else:
264	trimL = ''
265	if options.mismatchSeed != 'None' and (options.mismatchSeed == '0' or options.mismatchSeed == '1' \
266	or options.mismatchSeed == '2' or options.mismatchSeed == '3'):
267	mismatchSeed = '-n %s' % options.mismatchSeed
268	else:
269	mismatchSeed = ''
270	if options.mismatchQual != 'None' and int( options.mismatchQual ) >= 0:
271	mismatchQual = '-e %s' % options.mismatchQual
272	else:
273	mismatchQual = ''
274	if options.seedLen != 'None' and int( options.seedLen ) >= 5:
275	seedLen = '-l %s' % options.seedLen
276	else:
277	seedLen = ''
278	if options.rounding == 'noRound':
279	rounding = '--nomaqround'
280	else:
281	rounding = ''
282	if options.maqSoapAlign != '-1' and int( options.maqSoapAlign ) >= 0:
283	maqSoapAlign = '-v %s' % options.maqSoapAlign
284	else:
285	maqSoapAlign = ''
286	if options.minInsert != 'None' and int( options.minInsert ) > 0:
287	minInsert = '-I %s' % options.minInsert
288	else:
289	minInsert = ''
290	if options.maxAlignAttempt != 'None' and int( options.maxAlignAttempt ) >= 0:
291	maxAlignAttempt = '--pairtries %s' % options.maxAlignAttempt
292	else:
293	maxAlignAttempt = ''
294	if options.forwardAlign == 'noForward':
295	forwardAlign = '--nofw'
296	else:
297	forwardAlign = ''
298	if options.reverseAlign == 'noReverse':
299	reverseAlign = '--norc'
300	else:
301	reverseAlign = ''
302	if options.maxBacktracks != 'None' and int( options.maxBacktracks ) > 0 and \
303	( options.mismatchSeed == '2' or options.mismatchSeed == '3' ):
304	maxBacktracks = '--maxbts %s' % options.maxBacktracks
305	else:
306	maxBacktracks = ''
307	if options.tryHard == 'doTryHard':
308	tryHard = '-y'
309	else:
310	tryHard = ''
311	if options.valAlign != 'None' and int( options.valAlign ) >= 0:
312	valAlign = '-k %s' % options.valAlign
313	else:
314	valAlign = ''
315	if options.allValAligns == 'doAllValAligns':
316	allValAligns = '-a'
317	else:
318	allValAligns = ''
319	if options.suppressAlign != 'None' and int( options.suppressAlign ) >= 0:
320	suppressAlign = '-m %s' % options.suppressAlign
321	else:
322	suppressAlign = ''
323	if options.best == 'doBest':
324	best = '--best'
325	else:
326	best = ''
327	if options.strata == 'doStrata':
328	strata = '--strata'
329	else:
330	strata = ''
331	if options.offrate != 'None' and int( options.offrate ) >= 0:
332	offrate = '-o %s' % options.offrate
333	else:
334	offrate = ''
335	if options.seed != 'None' and int( options.seed ) >= 0:
336	seed = '--seed %s' % options.seed
337	else:
338	seed = ''
339	if options.snpphred != 'None' and int( options.snpphred ) >= 0:
340	snpphred = '--snpphred %s' % options.snpphred
341	else:
342	snpphred = ''
343	if options.snpfrac != 'None' and float( options.snpfrac ) >= 0:
344	snpfrac = '--snpfrac %s' % options.snpfrac
345	else:
346	snpfrac = ''
347	if options.keepends != 'None' and options.keepends == 'doKeepends':
348	keepends = '--col-keepends'
349	else:
350	keepends = ''
351	aligning_cmds = '%s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s ' \
352	'%s %s %s %s %s %s %s %s %s %s %s -p %s -S %s -q' % \
353	( skip, alignLimit, trimH, trimL, mismatchSeed, mismatchQual,
354	seedLen, rounding, maqSoapAlign, minInsert, maxInsert,
355	mateOrient, maxAlignAttempt, forwardAlign, reverseAlign,
356	maxBacktracks, tryHard, valAlign, allValAligns, suppressAlign,
357	best, strata, offrate, seed, colorspace, snpphred, snpfrac,
358	keepends, options.threads, suppressHeader )
359	except ValueError, e:
360	# clean up temp dir
361	if os.path.exists( tmp_index_dir ):
362	shutil.rmtree( tmp_index_dir )
363	stop_err( 'Something is wrong with the alignment parameters and the alignment could not be run\n' + str( e ) )
364	try:
365	# have to nest try-except in try-finally to handle 2.4
366	try:
367	# prepare actual aligning commands
368	if options.paired == 'paired':
369	cmd2 = 'bowtie %s %s -1 %s -2 %s > %s' % ( aligning_cmds, ref_file_name, options.input1, options.input2, options.output )
370	else:
371	cmd2 = 'bowtie %s %s %s > %s' % ( aligning_cmds, ref_file_name, options.input1, options.output )
372	# align
373	tmp = tempfile.NamedTemporaryFile( dir=tmp_index_dir ).name
374	tmp_stderr = open( tmp, 'wb' )
375	proc = subprocess.Popen( args=cmd2, shell=True, cwd=tmp_index_dir, stderr=tmp_stderr.fileno() )
376	returncode = proc.wait()
377	tmp_stderr.close()
378	# get stderr, allowing for case where it's very large
379	tmp_stderr = open( tmp, 'rb' )
380	stderr = ''
381	buffsize = 1048576
382	try:
383	while True:
384	stderr += tmp_stderr.read( buffsize )
385	if not stderr or len( stderr ) % buffsize != 0:
386	break
387	except OverflowError:
388	pass
389	tmp_stderr.close()
390	if returncode != 0:
391	raise Exception, stderr
392	# check that there are results in the output file
393	if os.path.getsize( options.output ) == 0:
394	raise Exception, 'The output file is empty, there may be an error with your input file or settings.' + '\nextra: ' + str(extra)
395	except Exception, e:
396	stop_err( 'Error aligning sequence. ' + str( e ) )
397	finally:
398	# clean up temp dir
399	if os.path.exists( tmp_index_dir ):
400	shutil.rmtree( tmp_index_dir )
401	stdout += 'Sequence file aligned.\n'
402	sys.stdout.write( stdout )
403
404	if __name__=="__main__": __main__()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/sr_mapping/bowtie_wrapper.py

異なるフォーマットでダウンロード: