1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | """ |
---|
4 | TODO |
---|
5 | 1. decrease memory usage |
---|
6 | 2. multi-fasta fastq file, ex. 454 |
---|
7 | 3. split reads into small chuncks? |
---|
8 | |
---|
9 | SHRiMP wrapper |
---|
10 | |
---|
11 | Inputs: |
---|
12 | 1. reference seq |
---|
13 | 2. reads |
---|
14 | |
---|
15 | Outputs: |
---|
16 | 1. table of 8 columns: |
---|
17 | chrom ref_loc read_id read_loc ref_nuc read_nuc quality coverage |
---|
18 | 2. SHRiMP output |
---|
19 | |
---|
20 | Parameters: |
---|
21 | -s Spaced Seed (default: 111111011111) |
---|
22 | -n Seed Matches per Window (default: 2) |
---|
23 | -t Seed Hit Taboo Length (default: 4) |
---|
24 | -9 Seed Generation Taboo Length (default: 0) |
---|
25 | -w Seed Window Length (default: 115.00%) |
---|
26 | -o Maximum Hits per Read (default: 100) |
---|
27 | -r Maximum Read Length (default: 1000) |
---|
28 | -d Kmer Std. Deviation Limit (default: -1 [None]) |
---|
29 | |
---|
30 | -m S-W Match Value (default: 100) |
---|
31 | -i S-W Mismatch Value (default: -150) |
---|
32 | -g S-W Gap Open Penalty (Reference) (default: -400) |
---|
33 | -q S-W Gap Open Penalty (Query) (default: -400) |
---|
34 | -e S-W Gap Extend Penalty (Reference) (default: -70) |
---|
35 | -f S-W Gap Extend Penalty (Query) (default: -70) |
---|
36 | -h S-W Hit Threshold (default: 68.00%) |
---|
37 | |
---|
38 | Command: |
---|
39 | %rmapper -s spaced_seed -n seed_matches_per_window -t seed_hit_taboo_length -9 seed_generation_taboo_length -w seed_window_length -o max_hits_per_read -r max_read_length -d kmer -m sw_match_value -i sw_mismatch_value -g sw_gap_open_ref -q sw_gap_open_query -e sw_gap_ext_ref -f sw_gap_ext_query -h sw_hit_threshold <query> <target> > <output> 2> <log> |
---|
40 | |
---|
41 | SHRiMP output: |
---|
42 | >7:2:1147:982/1 chr3 + 36586562 36586595 2 35 36 2900 3G16G13 |
---|
43 | >7:2:1147:982/1 chr3 + 95338194 95338225 4 35 36 2700 9T7C14 |
---|
44 | >7:2:587:93/1 chr3 + 14913541 14913577 1 35 36 2960 19--16 |
---|
45 | |
---|
46 | """ |
---|
47 | |
---|
48 | import os, sys, tempfile, os.path, re |
---|
49 | |
---|
# Guard against running under a pre-2.4 interpreter.
# BUG FIX: the original compared against the float (2.4) instead of the tuple
# (2, 4); a tuple-vs-float comparison never performs a real version check.
assert sys.version_info[:2] >= (2, 4), 'Python 2.4 or newer is required'
---|
51 | |
---|
def stop_err( msg ):
    """Report a fatal error message on stderr and terminate the script."""
    sys.stderr.write(msg + "\n")
    sys.exit()
---|
56 | |
---|
def reverse_complement(s):
    """Return the reverse complement of DNA sequence *s*.

    Supports upper/lower-case A/C/G/T, N/n, '.' and '-' (gap); any other
    character raises KeyError, exactly like the original implementation.
    """
    pairs = {"A":"T", "T":"A", "C":"G", "G":"C",
             "a":"t", "t":"a", "c":"g", "g":"c",
             "N":"N", "n":"n", ".":".", "-":"-"}
    # Complementing each base of the reversed string is equivalent to
    # complementing first and reversing afterwards.
    return "".join([pairs[base] for base in s[::-1]])
---|
65 | |
---|
def generate_sub_table(result_file, ref_file, score_files, table_outfile, hit_per_read, insertion_size):
    """Merge SHRiMP mappings, the reference and per-read quality scores into a table.

    Parameters:
        result_file    -- SHRiMP output (10 tab-separated fields per mapping)
        ref_file       -- reference sequences, (multi-)FASTA
        score_files    -- comma-separated quality-score files, one per read end
        table_outfile  -- path of the 8-column table to write
        hit_per_read   -- 1 for single-end data, 2 for paired-end data
        insertion_size -- maximum allowed insert size for a valid read pair

    Writes one row per aligned base:
        chrom  ref_loc  read_id  read_loc  ref_nuc  read_nuc  quality  coverage
    Returns True on completion; calls stop_err() on a fatal input problem.
    """
    # count of editstring characters we could not interpret (reported at the end)
    invalid_editstring_char = 0

    all_score_file = score_files.split(',')

    # one quality-score file is expected per read end
    if len(all_score_file) != hit_per_read: stop_err('One or more query files is missing. Please check your dataset.')

    # first pass writes 7-column rows here; coverage is appended in a second pass
    temp_table_name = tempfile.NamedTemporaryFile().name
    temp_table = open(temp_table_name, 'w')

    outfile = open(table_outfile,'w')

    # reference seq: not a single fasta seq
    refseq = {}        # title -> full sequence string
    chrom_cov = {}     # title -> { 0-based position -> coverage count }
    seq = ''

    for i, line in enumerate(file(ref_file)):
        line = line.rstrip()
        if not line or line.startswith('#'): continue

        if line.startswith('>'):
            # header line: store the record collected so far (first title wins)
            if seq:
                if refseq.has_key(title):
                    pass
                else:
                    refseq[title] = seq
                    chrom_cov[title] = {}
                seq = ''
            title = line[1:]
        else:
            seq += line
    # store the final record
    if seq:
        if not refseq.has_key(title):
            refseq[title] = seq
            chrom_cov[title] = {}

    # find hits : one end and/or the other
    # hits: readname -> { endindex ('1'/'2') ->
    #                     [ [strand, editstring, chrom_start, chrom_end, read_start, chrom], ... ] }
    hits = {}
    for i, line in enumerate(file(result_file)):
        line = line.rstrip()
        if not line or line.startswith('#'): continue

        #FORMAT: readname contigname strand contigstart contigend readstart readend readlength score editstring
        fields = line.split('\t')
        readname = fields[0][1:]          # drop the leading '>'
        chrom = fields[1]
        strand = fields[2]
        chrom_start = int(fields[3]) - 1  # SHRiMP is 1-based; convert to 0-based
        chrom_end = int(fields[4])
        read_start = fields[5]
        read_end = fields[6]
        read_len = fields[7]
        score = fields[8]
        editstring = fields[9]

        if hit_per_read == 1:
            endindex = '1'
        else:
            # paired-end read names look like "name/1" and "name/2"
            readname, endindex = readname.split('/')

        if hits.has_key(readname):
            if hits[readname].has_key(endindex):
                hits[readname][endindex].append([strand, editstring, chrom_start, chrom_end, read_start, chrom])
            else:
                hits[readname][endindex] = [[strand, editstring, chrom_start, chrom_end, read_start, chrom]]
        else:
            hits[readname] = {}
            hits[readname][endindex] = [[strand, editstring, chrom_start, chrom_end, read_start, chrom]]

    # find score : one end and the other end
    # hits_score: readname -> { endindex -> whitespace-separated quality string }
    hits_score = {}
    readname = ''
    score = ''
    for num_score_file in range(len(all_score_file)):
        score_file = all_score_file[num_score_file]
        for i, line in enumerate(file(score_file)):
            line = line.rstrip()
            if not line or line.startswith('#'): continue

            if line.startswith('>'):
                # new record: flush the score collected for the previous one,
                # but only for reads that were mapped at every end
                if score:
                    if hits.has_key(readname):
                        if len(hits[readname]) == hit_per_read:
                            if hits_score.has_key(readname):
                                if hits_score[readname].has_key(endindex):
                                    pass
                                else:
                                    hits_score[readname][endindex] = score
                            else:
                                hits_score[readname] = {}
                                hits_score[readname][endindex] = score
                    score = ''
                if hit_per_read == 1:
                    readname = line[1:]
                    endindex = '1'
                else:
                    readname, endindex = line[1:].split('/')
            else:
                score = line

    if score: # the last one
        if hits.has_key(readname):
            if len(hits[readname]) == hit_per_read:
                if hits_score.has_key(readname):
                    if hits_score[readname].has_key(endindex):
                        pass
                    else:
                        hits_score[readname][endindex] = score
                else:
                    hits_score[readname] = {}
                    hits_score[readname][endindex] = score

    # call to all mappings
    for readkey in hits.keys():
        # skip reads that lack a mapping for one of the ends
        if len(hits[readkey]) != hit_per_read: continue

        matches = []
        match_count = 0

        if hit_per_read == 1:
            # single-end: only uniquely mapped reads are reported
            if len(hits[readkey]['1']) == 1:
                matches = [ hits[readkey]['1'] ]
                match_count = 1
        else:
            # paired-end: look for one pair on the same chrom, on opposite
            # strands, with insert size within the allowed limit
            end1_data = hits[readkey]['1']
            end2_data = hits[readkey]['2']

            for i, end1_hit in enumerate(end1_data):
                crin_strand = {'+': False, '-': False}
                crin_insertSize = {'+': False, '-': False}

                crin_strand[end1_hit[0]] = True
                crin_insertSize[end1_hit[0]] = int(end1_hit[2])

                for j, end2_hit in enumerate(end2_data):
                    crin_strand[end2_hit[0]] = True
                    crin_insertSize[end2_hit[0]] = int(end2_hit[2])

                    # both ends must map to the same chromosome (last field)
                    if end1_hit[-1] != end2_hit[-1] : continue

                    if crin_strand['+'] and crin_strand['-']:
                        if (crin_insertSize['-'] - crin_insertSize['+']) <= insertion_size:
                            matches.append([end1_hit, end2_hit])
                            match_count += 1

        if match_count == 1:
            # exactly one acceptable mapping: emit per-base rows for each end

            for x, end_data in enumerate(matches[0]):

                end_strand, end_editstring, end_chr_start, end_chr_end, end_read_start, end_chrom = end_data
                end_read_start = int(end_read_start) - 1    # to 0-based

                # take the reference segment in read orientation
                if end_strand == '-':
                    refsegment = reverse_complement(refseq[end_chrom][end_chr_start:end_chr_end])
                else:
                    refsegment = refseq[end_chrom][end_chr_start:end_chr_end]

                match_len = 0    # reference bases consumed so far
                editindex = 0    # cursor into the editstring
                gap_read = 0     # number of read-gap ('-') codes seen

                # parse the SHRiMP editstring, e.g. "3G16G13", "19--16", "4x15x6"
                while editindex < len(end_editstring):

                    editchr = end_editstring[editindex]
                    chrA = ''       # reference side of this chunk
                    chrB = ''       # read side of this chunk
                    locIndex = []

                    if editchr.isdigit():
                        # a run of N matching bases
                        editcode = ''

                        while editchr.isdigit() and editindex < len(end_editstring):
                            editcode += editchr
                            editindex += 1
                            if editindex < len(end_editstring): editchr = end_editstring[editindex]

                        for baseIndex in range(int(editcode)):
                            chrA += refsegment[match_len+baseIndex]
                        chrB = chrA

                        match_len += int(editcode)

                    elif editchr == 'x':
                        # crossover: inserted between the appropriate two bases
                        # Two sequencing errors: 4x15x6 (25 matches with 2 crossovers)
                        # Treated as errors in the reads; Do nothing.
                        editindex += 1

                    elif editchr.isalpha():
                        # substitution: read base differs from the reference
                        editcode = editchr
                        editindex += 1
                        chrA = refsegment[match_len]
                        chrB = editcode
                        match_len += len(editcode)

                    elif editchr == '-':
                        # deletion in the read (gap on the read side)
                        editcode = editchr
                        editindex += 1
                        chrA = refsegment[match_len]
                        chrB = editcode
                        match_len += len(editcode)
                        gap_read += 1

                    elif editchr == '(':
                        # insertion in the read, e.g. "(AC)": bases absent from the reference
                        editcode = ''

                        while editchr != ')' and editindex < len(end_editstring):
                            if editindex < len(end_editstring): editchr = end_editstring[editindex]
                            editcode += editchr
                            editindex += 1

                        editcode = editcode[1:-1]    # strip the parentheses
                        chrA = '-'*len(editcode)
                        chrB = editcode

                    else:
                        # unknown editstring character: count it and carry on
                        invalid_editstring_char += 1

                    if end_strand == '-':
                        # put this chunk back into forward-strand orientation
                        chrA = reverse_complement(chrA)
                        chrB = reverse_complement(chrB)

                    pos_line = ''    # rows for a forward-strand chunk
                    rev_line = ''    # rows for a reverse-strand chunk (prepended, so output stays in forward order)

                    for mappingIndex in range(len(chrA)):
                        # reference
                        chrAx = chrA[mappingIndex]
                        # read
                        chrBx = chrB[mappingIndex]

                        if chrAx and chrBx and chrBx.upper() != 'N':

                            if end_strand == '+':

                                chrom_loc = end_chr_start+match_len-len(chrA)+mappingIndex
                                read_loc = end_read_start+match_len-len(chrA)+mappingIndex-gap_read

                                # an inserted read base has no reference base of its own
                                if chrAx == '-': chrom_loc -= 1

                                if chrBx == '-':
                                    # read gap: no quality value exists
                                    scoreBx = '-1'
                                else:
                                    scoreBx = hits_score[readkey][str(x+1)].split()[read_loc]

                                # 1-based on chrom_loc and read_loc
                                pos_line = pos_line + '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) + '\n'

                            else:

                                chrom_loc = end_chr_end-match_len+mappingIndex
                                read_loc = end_read_start+match_len-1-mappingIndex-gap_read

                                if chrAx == '-': chrom_loc -= 1

                                if chrBx == '-':
                                    scoreBx = '-1'
                                else:
                                    scoreBx = hits_score[readkey][str(x+1)].split()[read_loc]

                                # 1-based on chrom_loc and read_loc
                                rev_line = '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) +'\n' + rev_line

                            # tally per-base coverage for the second pass
                            if chrom_cov.has_key(end_chrom):

                                if chrom_cov[end_chrom].has_key(chrom_loc):
                                    chrom_cov[end_chrom][chrom_loc] += 1
                                else:
                                    chrom_cov[end_chrom][chrom_loc] = 1

                            else:

                                chrom_cov[end_chrom] = {}
                                chrom_cov[end_chrom][chrom_loc] = 1

                    if pos_line: temp_table.write('%s\n' %(pos_line.rstrip('\r\n')))
                    if rev_line: temp_table.write('%s\n' %(rev_line.rstrip('\r\n')))

    temp_table.close()

    # chrom-wide coverage
    # second pass: re-read the 7-column rows and append the coverage column
    for i, line in enumerate(open(temp_table_name)):

        line = line.rstrip()
        if not line or line.startswith('#'): continue

        fields = line.split()
        chrom = fields[0]
        eachBp = int(fields[1])
        readname = fields[2]

        if hit_per_read == 1:
            # single-end reads keep their plain name (drop the '/1' suffix)
            fields[2] = readname.split('/')[0]

        if chrom_cov[chrom].has_key(eachBp):
            outfile.write('%s\t%d\n' %('\t'.join(fields), chrom_cov[chrom][eachBp]))
        else:
            outfile.write('%s\t%d\n' %('\t'.join(fields), 0))

    outfile.close()

    if os.path.exists(temp_table_name): os.remove(temp_table_name)

    if invalid_editstring_char:
        print 'Skip ', invalid_editstring_char, ' invalid characters in editstrings'

    return True
---|
376 | |
---|
def convert_fastqsolexa_to_fasta_qual(infile_name, query_fasta, query_qual):
    """Split a fastqsolexa file into a FASTA sequence file and a quality file.

    Parameters:
        infile_name -- input fastq (4 lines per record; blank/'#' lines skipped)
        query_fasta -- output path for the FASTA sequences
        query_qual  -- output path for the space-separated integer qualities

    Quality lines may already be space-separated integers (copied through) or
    an ascii-encoded string (decoded with the Solexa offset 64, or with the
    first character as the offset when the line is one char longer than the
    read).  Returns True on success; calls stop_err() on malformed input.

    Fixes vs. the original: the integer-probe used a bare `except:` (now
    `except ValueError:`), the Py2-only `file()` call is now `open()`, and the
    quality string is built with join instead of quadratic concatenation.
    """
    outfile_seq = open( query_fasta, 'w' )
    outfile_score = open( query_qual, 'w' )

    # first character of the title lines ('@' / '+'), learned from the input
    seq_title_startswith = ''
    qual_title_startswith = ''

    default_coding_value = 64  # Solexa ascii offset
    fastq_block_lines = 0      # position within the 4-line record (1..3, 0)

    for i, line in enumerate( open( infile_name ) ):
        line = line.rstrip()
        if not line or line.startswith( '#' ): continue

        fastq_block_lines = ( fastq_block_lines + 1 ) % 4
        line_startswith = line[0:1]

        if fastq_block_lines == 1:
            # first line is @title_of_seq
            if not seq_title_startswith:
                seq_title_startswith = line_startswith

            if line_startswith != seq_title_startswith:
                outfile_seq.close()
                outfile_score.close()
                stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )

            read_title = line[1:]
            outfile_seq.write( '>%s\n' % line[1:] )

        elif fastq_block_lines == 2:
            # second line is nucleotides
            read_length = len( line )
            outfile_seq.write( '%s\n' % line )

        elif fastq_block_lines == 3:
            # third line is +title_of_qualityscore ( might be skipped )
            if not qual_title_startswith:
                qual_title_startswith = line_startswith

            if line_startswith != qual_title_startswith:
                outfile_seq.close()
                outfile_score.close()
                stop_err( 'Invalid fastqsolexa format at line %d: %s.' % ( i + 1, line ) )

            quality_title = line[1:]
            if quality_title and read_title != quality_title:
                outfile_seq.close()
                outfile_score.close()
                stop_err( 'Invalid fastqsolexa format at line %d: sequence title "%s" differes from score title "%s".' % ( i + 1, read_title, quality_title ) )

            # a bare '+' inherits the sequence title
            if not quality_title:
                outfile_score.write( '>%s\n' % read_title )
            else:
                outfile_score.write( '>%s\n' % line[1:] )

        else:
            # fourth line is quality scores
            # peek at the first token: integer qualities or ascii-encoded?
            try:
                int( line.split()[0] )
                fastq_integer = True
            except ValueError:
                fastq_integer = False

            if fastq_integer:
                # already space-separated digits: copy through unchanged
                qual = line
            else:
                # ascii-encoded: determine the offset, then decode per character
                quality_score_length = len( line )
                if quality_score_length == read_length + 1:
                    # first char encodes the offset itself
                    qual_score_startswith = ord( line[0:1] )
                    line = line[1:]
                elif quality_score_length == read_length:
                    qual_score_startswith = default_coding_value
                else:
                    stop_err( 'Invalid fastqsolexa format at line %d: the number of quality scores ( %d ) is not the same as bases ( %d ).' % ( i + 1, quality_score_length, read_length ) )

                # trailing space kept for compatibility with the original output
                qual = ' '.join( [ str( ord( char ) - qual_score_startswith ) for char in line ] ) + ' '

            outfile_score.write( '%s\n' % qual )

    outfile_seq.close()
    outfile_score.close()

    return True
---|
471 | |
---|
def __main__():
    """Drive the pipeline: fastq -> fasta+qual, run SHRiMP, build the table.

    argv: target_fasta shrimp_outfile table_outfile reads_spec [15 SHRiMP params]
    where reads_spec is "fastq" (single-end) or "fastq1,fastq2,insert_size"
    (paired-end).
    """
    # SHRiMP path (resolved via $PATH)
    shrimp = 'rmapper-ls'

    # I/O
    input_target_file = sys.argv[1] # fasta
    shrimp_outfile = sys.argv[2] # shrimp output
    table_outfile = sys.argv[3] # table output
    single_or_paired = sys.argv[4].split(',')

    insertion_size = 600

    if len(single_or_paired) == 1: # single or paired
        type_of_reads = 'single'
        hit_per_read = 1
        input_query = single_or_paired[0]
        query_fasta = tempfile.NamedTemporaryFile().name
        query_qual = tempfile.NamedTemporaryFile().name

    else: # paired-end
        type_of_reads = 'paired'
        hit_per_read = 2
        input_query_end1 = single_or_paired[0]
        input_query_end2 = single_or_paired[1]
        insertion_size = int(single_or_paired[2])
        query_fasta_end1 = tempfile.NamedTemporaryFile().name
        query_fasta_end2 = tempfile.NamedTemporaryFile().name
        query_qual_end1 = tempfile.NamedTemporaryFile().name
        query_qual_end2 = tempfile.NamedTemporaryFile().name

    # SHRiMP parameters: total = 15, default values
    spaced_seed = '111111011111'
    seed_matches_per_window = '2'
    seed_hit_taboo_length = '4'
    seed_generation_taboo_length = '0'
    seed_window_length = '115.0'
    max_hits_per_read = '100'
    max_read_length = '1000'
    kmer = '-1'
    sw_match_value = '100'
    sw_mismatch_value = '-150'
    sw_gap_open_ref = '-400'
    sw_gap_open_query = '-400'
    sw_gap_ext_ref = '-70'
    sw_gap_ext_query = '-70'
    sw_hit_threshold = '68.0'

    # TODO: put the threshold on each of these parameters
    if len(sys.argv) > 5:

        # NOTE(review): isdigit() accepts any digits, not only 0/1, and cannot
        # raise here, so the except branch looks unreachable -- confirm intent.
        try:
            if sys.argv[5].isdigit():
                spaced_seed = sys.argv[5]
            else:
                stop_err('Error in assigning parameter: Spaced seed.')
        except:
            stop_err('Spaced seed must be a combination of 1s and 0s.')

        # remaining 14 parameters are taken verbatim from the command line
        seed_matches_per_window = sys.argv[6]
        seed_hit_taboo_length = sys.argv[7]
        seed_generation_taboo_length = sys.argv[8]
        seed_window_length = sys.argv[9]
        max_hits_per_read = sys.argv[10]
        max_read_length = sys.argv[11]
        kmer = sys.argv[12]
        sw_match_value = sys.argv[13]
        sw_mismatch_value = sys.argv[14]
        sw_gap_open_ref = sys.argv[15]
        sw_gap_open_query = sys.argv[16]
        sw_gap_ext_ref = sys.argv[17]
        sw_gap_ext_query = sys.argv[18]
        sw_hit_threshold = sys.argv[19]

    # temp file for shrimp log file
    shrimp_log = tempfile.NamedTemporaryFile().name

    # convert fastq to fasta and quality score files
    if type_of_reads == 'single':
        return_value = convert_fastqsolexa_to_fasta_qual(input_query, query_fasta, query_qual)
    else:
        return_value = convert_fastqsolexa_to_fasta_qual(input_query_end1, query_fasta_end1, query_qual_end1)
        return_value = convert_fastqsolexa_to_fasta_qual(input_query_end2, query_fasta_end2, query_qual_end2)

    # SHRiMP command (run through the shell; stdout -> shrimp_outfile, stderr -> shrimp_log)
    if type_of_reads == 'single':
        command = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta, input_target_file, '>', shrimp_outfile, '2>', shrimp_log])

        try:
            os.system(command)
        except Exception, e:
            # clean up temp files before aborting
            if os.path.exists(query_fasta): os.remove(query_fasta)
            if os.path.exists(query_qual): os.remove(query_qual)
            stop_err(str(e))

    else: # paired
        # the second end appends ('>>') to the same output and log files
        command_end1 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end1, input_target_file, '>', shrimp_outfile, '2>', shrimp_log])
        command_end2 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end2, input_target_file, '>>', shrimp_outfile, '2>>', shrimp_log])

        try:
            os.system(command_end1)
            os.system(command_end2)
        except Exception, e:
            if os.path.exists(query_fasta_end1): os.remove(query_fasta_end1)
            if os.path.exists(query_fasta_end2): os.remove(query_fasta_end2)
            if os.path.exists(query_qual_end1): os.remove(query_qual_end1)
            if os.path.exists(query_qual_end2): os.remove(query_qual_end2)
            stop_err(str(e))

    # check SHRiMP output: count number of lines
    num_hits = 0
    if shrimp_outfile:
        for i, line in enumerate(file(shrimp_outfile)):
            line = line.rstrip('\r\n')
            if not line or line.startswith('#'): continue
            try:
                fields = line.split()
                num_hits += 1
            except Exception, e:
                stop_err(str(e))

    if num_hits == 0: # no hits generated
        # mine the SHRiMP log for a reason before giving up
        err_msg = ''
        if shrimp_log:
            for i, line in enumerate(file(shrimp_log)):
                if line.startswith('error'): # deal with memory error:
                    err_msg += line # error: realloc failed: Cannot allocate memory
                if re.search('Reads Matched', line): # deal with zero hits
                    if int(line[8:].split()[2]) == 0:
                        err_msg = 'Zero hits found.\n'
        stop_err('SHRiMP Failed due to:\n' + err_msg)

    # convert to table
    if type_of_reads == 'single':
        return_value = generate_sub_table(shrimp_outfile, input_target_file, query_qual, table_outfile, hit_per_read, insertion_size)
    else:
        return_value = generate_sub_table(shrimp_outfile, input_target_file, query_qual_end1+','+query_qual_end2, table_outfile, hit_per_read, insertion_size)

    # remove temp. files
    if type_of_reads == 'single':
        if os.path.exists(query_fasta): os.remove(query_fasta)
        if os.path.exists(query_qual): os.remove(query_qual)
    else:
        if os.path.exists(query_fasta_end1): os.remove(query_fasta_end1)
        if os.path.exists(query_fasta_end2): os.remove(query_fasta_end2)
        if os.path.exists(query_qual_end1): os.remove(query_qual_end1)
        if os.path.exists(query_qual_end2): os.remove(query_qual_end2)

    if os.path.exists(shrimp_log): os.remove(shrimp_log)
---|
621 | |
---|
622 | |
---|
# Standard script entry point.
if __name__ == '__main__': __main__()
---|
624 | |
---|