Context Navigation

sff_extract.py @ 2

リビジョン 2, 51.1 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central

行番号
1	#!/usr/bin/python
2	'''This software extracts the seq, qual and ancillary information from an sff
3	file, like the ones used by the 454 sequencer.
4
5	Optionally, it can also split paired-end reads if given the linker sequence.
6	The splitting is done with maximum match, i.e., every occurrence of the linker
7	sequence will be removed, even if occurring multiple times.'''
8
9	#copyright Jose Blanca and Bastien Chevreux
10	#COMAV institute, Universidad Politecnica de Valencia (UPV)
11	#Valencia, Spain
12
13	# additions to handle paired end reads by Bastien Chevreux
14	# bugfixes for linker specific lengths: Lionel Guy
15
16	#This program is free software: you can redistribute it and/or modify
17	#it under the terms of the GNU General Public License as published by
18	#the Free Software Foundation, either version 3 of the License, or
19	#(at your option) any later version.
20	#This program is distributed in the hope that it will be useful,
21	#but WITHOUT ANY WARRANTY; without even the implied warranty of
22	#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23	#GNU General Public License for more details.
24	#You should have received a copy of the GNU General Public License
25	#along with this program. If not, see <http://www.gnu.org/licenses/>.
26
27	__author__ = 'Jose Blanca and Bastien Chevreux'
28	__copyright__ = 'Copyright 2008, Jose Blanca, COMAV, and Bastien Chevreux'
29	__license__ = 'GPLv3 or later'
30	__version__ = '0.2.8'
31	__email__ = 'jblanca@btc.upv.es'
32	__status__ = 'beta'
33
34	import struct
35	import sys
36	import os
37	import subprocess
38	import tempfile
39
40
41	fake_sff_name = 'fake_sff_name'
42
43
44	# readname as key: lines with matches from SSAHA, one best match
45	ssahapematches = {}
46	# linker readname as key: length of linker sequence
47	linkerlengths = {}
48
49	# set to true if something really fishy is going on with the sequences
50	stern_warning = True
51
def read_bin_fragment(struct_def, fileh, offset=0, data=None,
                      byte_padding=None):
    '''It reads a chunk of big-endian fields from a binary file.

    struct_def   -- sequence of (name, format) items; format is a struct
                    module format string without byte-order prefix.
    fileh        -- seekable file object opened in binary mode.
    offset       -- absolute file position of the first field.
    data         -- optional dict that will be populated with the extracted
                    values; a new dict is created when it is not given.
    byte_padding -- if given, the number of bytes reported will be rounded
                    up to the next multiple of that number.

    It returns the tuple (bytes_read, data). A field that unpacks to a single
    value is stored as that value, any other field as a tuple.
    struct.error is raised if the file ends before a field is complete.
    '''
    if data is None:
        data = {}

    bytes_read = 0
    for item in struct_def:
        fmt = '>' + item[1]
        #the size has to be calculated with the same byte-order prefix that
        #is used to unpack, native sizes and alignment may be different
        n_bytes = struct.calcsize(fmt)
        fileh.seek(offset + bytes_read)
        values = struct.unpack(fmt, fileh.read(n_bytes))
        if len(values) == 1:
            values = values[0]
        data[item[0]] = values
        bytes_read += n_bytes

    #with a byte_padding the bytes read should be a multiple of it
    if byte_padding is not None:
        n_blocks = (bytes_read + byte_padding - 1) // byte_padding
        bytes_read = n_blocks * byte_padding

    return (bytes_read, data)
87
88
def check_magic(magic):
    '''It raises a RuntimeError unless magic is the sff magic number.'''
    #779314790 is 0x2E736666, the bytes '.sff' read as a big-endian integer
    sff_magic = 779314790
    if magic == sff_magic:
        return
    raise RuntimeError('This file does not seems to be an sff file.')
93
def check_version(version):
    '''It checks that the version is supported, otherwise it raises an error.

    version -- the four version characters of the sff header, given as a
               sequence of one-character strings (str or bytes) or as one
               string.

    A RuntimeError is raised unless the version is exactly 0, 0, 0, 1; this
    includes versions with too few or too many characters.
    '''
    supported = b'\x00\x00\x00\x01'
    if isinstance(version, bytes):
        found = version
    else:
        try:
            found = b''.join([
                char if isinstance(char, bytes) else char.encode('latin-1')
                for char in version])
        except (AttributeError, TypeError, UnicodeError):
            #items that are not characters can not be a supported version
            found = None
    if found != supported:
        raise RuntimeError('SFF version not supported. Please contact the author of the software.')
102
def read_header(fileh):
    '''It reads the common header found at the start of an sff file.

    fileh -- file object of the sff file.

    It returns a dict with the header fields. The magic number and the
    version are checked, so a RuntimeError is raised for files that do not
    pass check_magic() or check_version().
    '''
    #fixed size part, it tells us how long the remaining fields are
    fixed_part = [
        ('magic_number', 'I'),
        ('version', 'cccc'),
        ('index_offset', 'Q'),
        ('index_length', 'I'),
        ('number_of_reads', 'I'),
        ('header_length', 'H'),
        ('key_length', 'H'),
        ('number_of_flows_per_read', 'H'),
        ('flowgram_format_code', 'B'),
    ]
    header = {}
    fixed_size, header = read_bin_fragment(struct_def=fixed_part,
                                           fileh=fileh, offset=0,
                                           data=header)
    check_magic(header['magic_number'])
    check_version(header['version'])

    #variable size part, it starts right behind the fixed one
    n_flows = header['number_of_flows_per_read']
    key_length = header['key_length']
    variable_part = [
        ('flow_chars', str(n_flows) + 'c'),
        ('key_sequence', str(key_length) + 'c'),
    ]
    read_bin_fragment(struct_def=variable_part, fileh=fileh,
                      offset=fixed_size, data=header)
    return header
132
133
def read_sequence(header, fileh, fposition):
    '''It reads one read from the sff file located at the fposition.

    header    -- dict with the common header as returned by read_header().
    fileh     -- file object of the sff file.
    fposition -- file position at which the read starts.

    It returns the tuple (bytes used by the read, dict with the read data).
    The clip points in the dict are already normalised, and if the global
    config asks for clipping the bases and qualities are already clipped.
    A RuntimeError is raised for an unknown flowgram format code.
    '''
    #the only flowgram format known here is 1, read as unsigned shorts
    if header['flowgram_format_code'] != 1:
        raise RuntimeError('file version not supported')
    flow_type = 'H'

    #first part of the read header
    read_header_struct = [
        ('read_header_length', 'H'),
        ('name_length', 'H'),
        ('number_of_bases', 'I'),
        ('clip_qual_left', 'H'),
        ('clip_qual_right', 'H'),
        ('clip_adapter_left', 'H'),
        ('clip_adapter_right', 'H'),
    ]
    data = {}
    bytes_read, data = read_bin_fragment(struct_def=read_header_struct,
                                         fileh=fileh, offset=fposition,
                                         data=data)

    #second part of the read header: the name, we join its letters
    name_struct = [('name', str(data['name_length']) + 'c')]
    read_bin_fragment(struct_def=name_struct, fileh=fileh,
                      offset=fposition + bytes_read, data=data)
    data['name'] = ''.join(data['name'])

    #the read data section starts behind the (padded) read header
    number_of_bases = str(data['number_of_bases'])
    read_data_struct = [
        ('flowgram_values',
         str(header['number_of_flows_per_read']) + flow_type),
        ('flow_index_per_base', number_of_bases + 'B'),
        ('bases', number_of_bases + 'c'),
        ('quality_scores', number_of_bases + 'B'),
    ]
    offset = data['read_header_length']
    bytes_read, data = read_bin_fragment(struct_def=read_data_struct,
                                         fileh=fileh,
                                         offset=fposition + offset,
                                         data=data, byte_padding=8)
    #we join the bases
    data['bases'] = ''.join(data['bases'])

    # correct for the case the right clip is <= than the left clip:
    # in this case both clips are set to 0 (right clip == 0 means
    # "whole sequence")
    if data['clip_qual_right'] <= data['clip_qual_left']:
        data['clip_qual_right'] = 0
        data['clip_qual_left'] = 0
    if data['clip_adapter_right'] <= data['clip_adapter_left']:
        data['clip_adapter_right'] = 0
        data['clip_adapter_left'] = 0

    #the clipping section follows the NCBI's guidelines Trace Archive RFC
    #http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=rfc&m=doc&s=rfc
    #if there's no adapter clip: qual -> vector
    #else:  qual-> qual
    #       adapter -> vector
    if not data['clip_adapter_left']:
        data['clip_adapter_left'], data['clip_qual_left'] = \
            data['clip_qual_left'], data['clip_adapter_left']
    if not data['clip_adapter_right']:
        data['clip_adapter_right'], data['clip_qual_right'] = \
            data['clip_qual_right'], data['clip_adapter_right']

    # see whether we have to override the minimum left clips
    min_leftclip = config['min_leftclip']
    if min_leftclip > 0:
        if 0 < data['clip_adapter_left'] < min_leftclip:
            data['clip_adapter_left'] = min_leftclip
        if 0 < data['clip_qual_left'] < min_leftclip:
            data['clip_qual_left'] = min_leftclip

    # for handling the -c (clip) option gently, we already clip here
    # and set all clip points to the sequence end points
    if config['clip']:
        data['bases'], data['quality_scores'] = clip_read(data)

        data['number_of_bases'] = len(data['bases'])
        data['clip_qual_right'] = data['number_of_bases']
        data['clip_adapter_right'] = data['number_of_bases']
        data['clip_qual_left'] = 0
        data['clip_adapter_left'] = 0

    return data['read_header_length'] + bytes_read, data
241
242
def sequences(fileh, header):
    '''It returns a generator with the data for each read.

    fileh  -- file object of the sff file.
    header -- dict with the common header as returned by read_header().

    The reads are taken one after the other starting right behind the
    common header. If the index section is found between two reads it is
    skipped. Nothing is read when the header announces zero reads.
    '''
    fposition = header['header_length']    #position in the file
    reads_read = 0
    while reads_read < header['number_of_reads']:
        if fposition == header['index_offset'] and header['index_length']:
            #we have to skip the index section
            fposition += header['index_length']
            continue
        bytes_read, seq_data = read_sequence(header=header, fileh=fileh,
                                             fposition=fposition)
        yield seq_data
        fposition += bytes_read
        reads_read += 1
261
262
def remove_last_xmltag_in_file(fname, tag=None):
    '''Given an xml file name and a tag, it removes the last tag of the
    file if it matches the given tag. Tag removal is performed via file
    truncation.

    fname -- name of the xml file, it is modified in place.
    tag   -- expected name of the last tag; with None any last tag is
             removed.

    It returns the name of the removed tag. Blanks and tabs right in front
    of the tag are removed too.

    If the given tag is not the last in the file, or if the file has no tag
    at all, a RuntimeError will be raised and the file is left untouched.

    The resulting xml file will be not xml valid. This function is a hack
    that allows to append records to xml files in a quick and dirty way.
    '''
    #binary mode: seeking to arbitrary positions and truncating is only
    #reliable when no newline translation or decoding is involved
    fh = open(fname, 'rb+')
    try:
        fh.seek(0, 2)
        pos = fh.tell()

        #we read from the end towards the start of the file and keep the
        #non white space characters up to the first '<' we find
        chars = []
        while True:
            pos -= 1
            if pos < 0:
                raise RuntimeError("No xml tag found in the file")
            fh.seek(pos)
            char = fh.read(1)
            if not char.isspace():
                chars.append(char)
            if char == b'<':
                break

        #we have read the last tag backwards
        last_tag = b''.join(chars[::-1])
        if not isinstance(last_tag, str):
            last_tag = last_tag.decode('utf-8', 'replace')
        #we remove the </ and >
        last_tag = last_tag.rstrip('>').lstrip('</')

        #we check that we're removing the asked tag
        if tag is not None and tag != last_tag:
            raise RuntimeError("The given xml tag wasn't the last one in the file")

        # while we are at it: also remove the blanks in front of the tag
        while pos > 0:
            fh.seek(pos - 1)
            if fh.read(1) not in (b' ', b'\t'):
                break
            pos -= 1

        fh.seek(pos)
        fh.truncate()
    finally:
        #the file is closed even if one of the checks above fails
        fh.close()
    return last_tag
318
319
def create_basic_xml_info(readname, fname):
    '''It opens a <trace> record for one read and returns it as a string.

    readname -- name written into the <trace_name> element.
    fname    -- name of the sff file, used to look up the extra tags given
                for that file in config['xml_info'].

    The record holds the trace name plus one element per extra tag. The
    tags stored for fname are used; if there are none, the tags stored
    under fake_sff_name are used. The <trace> element is not closed here.
    '''
    pieces = [' <trace>\n', ' <trace_name>', readname, '</trace_name>\n']

    #extra information: for this file or, failing that, for any file
    extra = None
    xml_info = config['xml_info']
    if xml_info:
        if fname in xml_info:
            extra = xml_info[fname]
        elif fake_sff_name in xml_info:
            extra = xml_info[fake_sff_name]

    if extra:
        for tag in extra:
            pieces.append(' <' + tag + '>' + extra[tag] + '</' + tag + '>\n')

    return ''.join(pieces)
349
350
def create_clip_xml_info(readlen, adapl, adapr, quall, qualr):
    '''It formats the clip values of a read as XML elements.

    readlen      -- length of the read.
    adapl, adapr -- left and right adapter (vector) clip.
    quall, qualr -- left and right quality clip.

    Values that carry no information are not written: right clips that are
    greater or equal than the read length, negative values (they may show
    up after splitting paired-end reads) and zeros.
    It returns the elements as one string, it may be empty.
    '''
    def effective(value, is_right_clip):
        '''It returns the value to print, 0 means "do not print".'''
        if is_right_clip and value >= readlen:
            return 0
        if value < 0:
            return 0
        return value

    #the order of this tuple is the order of the elements in the output
    elements = (
        ('clip_quality_left', effective(quall, False)),
        ('clip_quality_right', effective(qualr, True)),
        ('clip_vector_left', effective(adapl, False)),
        ('clip_vector_right', effective(adapr, True)),
    )

    pieces = [""]
    for tag, value in elements:
        if value:
            pieces.extend((' <' + tag + '>', str(value), '</' + tag + '>\n'))
    return ''.join(pieces)
400
401
def create_xml_for_unpaired_read(data, fname):
    '''It returns the complete <trace> record of one read as a string.

    data  -- dict with the read data, the name, the number of bases and
             the four clip values are used.
    fname -- name of the sff file the read comes from.
    '''
    xml = create_basic_xml_info(data['name'], fname)
    #the clip points go into the XML only if the read was not hard clipped
    if not config['clip']:
        xml += create_clip_xml_info(data['number_of_bases'],
                                    data['clip_adapter_left'],
                                    data['clip_adapter_right'],
                                    data['clip_qual_left'],
                                    data['clip_qual_right'])
    return xml + ' </trace>\n'
411
412
def format_as_fasta(name,seq,qual):
    '''It returns the read as two strings: a fasta sequence record and a
    fasta quality record with the scores separated by blanks.'''
    header = '>' + name + '\n'
    scores = ' '.join(map(str, qual))
    return header + seq + '\n', header + scores + '\n'
419
def format_as_fastq(name,seq,qual):
    '''It returns the read as one fastq record, every quality score is
    written as the character with the code score + 33.'''
    encoded = ''.join(map(chr, [score + 33 for score in qual]))
    #the name is not repeated in the '+' line
    return '@' + name + '\n' + seq + '\n+\n' + encoded + '\n'
425
426
def get_read_data(data):
    '''Given the data for one read it returns the sequence and the quality
    scores.

    With config['mix_case'] set the sequence is the one returned by
    sequence_case(), otherwise the bases are returned as they are.
    '''
    if config['mix_case']:
        seq = sequence_case(data)
    else:
        seq = data['bases']
    return seq, data['quality_scores']
439
def extract_read_info(data, fname):
    '''Given the data for one read it returns 3 strs: the fasta sequence
    record, the fasta quality record and the xml ancillary data.

    data  -- dict with the read data.
    fname -- name of the sff file the read comes from.
    '''
    bases, scores = get_read_data(data)
    fasta, fasta_qual = format_as_fasta(data['name'], bases, scores)
    ancillary = create_xml_for_unpaired_read(data, fname)
    return fasta, fasta_qual, ancillary
455
def write_sequence(name,seq,qual,seq_fh,qual_fh):
    '''It writes one sequence and its qualities.

    With a qual_fh the sequence goes as fasta into seq_fh and the qualities
    as fasta qual into qual_fh. With qual_fh being None one fastq record is
    written into seq_fh.
    Sequences of length 0 are not written.
    '''
    if len(seq) == 0:
        return

    if qual_fh is not None:
        fasta, fasta_qual = format_as_fasta(name, seq, qual)
        seq_fh.write(fasta)
        qual_fh.write(fasta_qual)
        return

    seq_fh.write(format_as_fastq(name, seq, qual))
470
def write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh):
    '''It writes an unpaired read: sequence and qualities via
    write_sequence() and the ancillary data into the XML filehandle.

    data   -- dict with the read data.
    sff_fh -- filehandle of the sff file, only its name is used.
    Reads with a sequence of length 0 are not written at all.
    '''
    bases, scores = get_read_data(data)
    if len(bases) != 0:
        write_sequence(data['name'], bases, scores, seq_fh, qual_fh)

        ancillary = create_xml_for_unpaired_read(data, sff_fh.name)
        if ancillary is not None:
            xml_fh.write(ancillary)
    return
485
486
def reverse_complement(seq):
    '''Returns the reverse complement of a DNA sequence as string.

    Upper and lower case letters and the ambiguity codes are supported, u
    is complemented to t. Any other character raises a KeyError.
    '''
    #the letter at position i of bases is complemented by the letter at
    #position i of complements
    bases = 'acgtumrwsykvhdbxn'
    complements = 'tgcatkywsrmbdhvxn'
    table = dict(zip(bases + bases.upper(),
                     complements + complements.upper()))
    table[''] = ''

    complemented = ''.join(map(table.__getitem__, seq))
    return complemented[::-1]
532
533
def mask_sequence(seq, maskchar, fpos, tpos):
    '''It returns seq with the positions from fpos (included) to tpos
    (excluded) replaced by maskchar.

    A negative fpos is taken as 0 and a tpos behind the end of the sequence
    as the end of the sequence. A RuntimeError is raised if maskchar has
    more than one character.
    '''
    if len(maskchar) > 1:
        raise RuntimeError("Internal error: more than one character given to mask_sequence")

    first = max(fpos, 0)
    last = min(tpos, len(seq))
    return seq[:first] + maskchar * (last - first) + seq[last:]
549
550
def fragment_sequences(sequence, qualities, splitchar):
    '''Works like split() on strings, except it does this on a sequence
    and the corresponding quality values, and empty fragments are dropped.

    sequence  -- the sequence to split.
    qualities -- one quality value per position of the sequence.
    splitchar -- the character that separates the fragments.

    Returns a list with one tuple per fragment: the fragment sequence as
    string first and the list with the fragment qualities second.
    A RuntimeError is raised if sequence and qualities differ in length.
    '''
    if len(sequence) != len(qualities):
        raise RuntimeError("Internal error: length of sequence and qualities don't match???")

    fragments = []
    start = None    #where the fragment we are in began, None between them
    for pos, char in enumerate(sequence):
        if char == splitchar:
            if start is not None:
                fragments.append((''.join(sequence[start:pos]),
                                  list(qualities[start:pos])))
                start = None
        elif start is None:
            start = pos

    #a fragment may reach up to the end of the sequence
    if start is not None:
        fragments.append((''.join(sequence[start:]),
                          list(qualities[start:])))

    return fragments
594
595
def calc_subseq_boundaries(maskedseq, maskchar):
    '''It returns the [start, end] positions of the alternating masked and
    unmasked stretches of the sequence, end is not part of the stretch.

    E.g.:
    ........xxxxxxxx..........xxxxxxxxxxxxxxxxxxxxx.........
    to
    (0,8),(8,16),(16,26),(26,47),(47,56)

    An empty sequence gives an empty list.
    '''
    total = len(maskedseq)
    if total == 0:
        return []

    stretches = []
    stretch_start = 0
    in_mask = maskedseq[0] == maskchar
    for pos in range(1, total):
        now_in_mask = maskedseq[pos] == maskchar
        if now_in_mask != in_mask:
            #the kind of stretch changes here: we close the old one
            stretches.append([stretch_start, pos])
            stretch_start = pos
            in_mask = now_in_mask

    stretches.append([stretch_start, total])
    return stretches
625
626
def correct_for_smallhits(maskedseq, maskchar, linkername):
    '''If partial hits were found, take preventive measure: grow
    the masked areas by 20 bases in each direction.

    maskedseq  -- the sequence with the masked areas.
    maskchar   -- the character used to mask.
    linkername -- name of the linker, its length is looked up in the
                  global linkerlengths.

    A partial hit is a masked area that touches neither end of the
    sequence and that is shorter than the linker, a discrepancy of 10% is
    allowed.
    Returns either unchanged "maskedseq" or a new sequence of the same
    length with some more characters masked.
    '''
    global linkerlengths

    if len(maskedseq) == 0:
        return maskedseq

    #integer arithmetic on purpose, the value is used to slice and repeat
    grow_by = 40 // 2
    seqlen = len(maskedseq)
    boundaries = calc_subseq_boundaries(maskedseq, maskchar)

    foundpartial = False
    for left, right in boundaries:
        if left == 0 or right == seqlen:
            continue
        if maskedseq[left] != maskchar:
            continue
        # allow 10% discrepancy
        # that's a kind of safety net if there are slight sequencing
        # errors in the linker itself
        linkerlen = linkerlengths[linkername]
        if right - left < linkerlen - linkerlen // 10:
            foundpartial = True
            break

    if not foundpartial:
        return maskedseq

    # grow: the masked areas stay as they are, the clear ones shrink on
    # every side that borders a masked area
    pieces = []
    for left, right in boundaries:
        if maskedseq[left] == maskchar:
            pieces.append(maskedseq[left:right])
            continue

        clearstart = 0
        if left > 0:
            clearstart = left + grow_by
        clearstop = seqlen
        if right < seqlen:
            clearstop = right - grow_by

        if clearstop <= clearstart:
            #nothing is left of this clear area
            pieces.append(maskchar * (right - left))
        else:
            if clearstart != left:
                pieces.append(maskchar * grow_by)
            pieces.append(maskedseq[clearstart:clearstop])
            if clearstop != right:
                pieces.append(maskchar * grow_by)

    return ''.join(pieces)
696
697
def split_paired_end(data, sff_fh, seq_fh, qual_fh, xml_fh):
    '''Splits a paired end read and writes sequences into FASTA, FASTA qual
    and XML traceinfo file. Returns the number of sequences created.

    data    -- dict with the read data, its clip points and its name may
               be changed when only one fragment is left.
    sff_fh  -- filehandle of the sff file, only its name is used.
    seq_fh, qual_fh, xml_fh -- output filehandles, see write_sequence().

    As the linker sequence may be anywhere in the read, including the ends
    and overlapping with bad quality sequence, we need to perform some
    computing and eventually set new clip points.

    If the resulting split yields only one sequence (because linker
    was not present or overlapping with left or right clip), only one
    sequence will be written with ".fn" appended to the name.

    If the read can be split, two reads will be written. The side left of
    the linker will be named ".r" and will be written in reverse complement
    into the file to conform with what approximately all assemblers expect
    when reading paired-end data: reads in forward direction in file. The
    side right of the linker will be named ".f"

    If SSAHA found partial linker (linker sequences < length of linker),
    the sequences will get a "_pl" furthermore be cut back thoroughly.

    If more than two fragments are left after masking, the names will get
    an additional "_mlc" within the name to show that there was "multiple
    linker contamination".

    For multiple or partial linker, the "good" parts of the reads are
    stored with a ".part<number>" name, additionally they will not get
    template information in the XML. Parts shorter than 20 bases are
    not written.
    '''
    global ssahapematches

    maskchar = "#"

    numseqs = 0
    readname = data['name']
    readlen = data['number_of_bases']

    leftclip, rightclip = return_merged_clips(data)
    seq, qual = get_read_data(data)

    #we mask what lies outside the clip points ...
    maskedseq = seq
    if leftclip > 0:
        maskedseq = mask_sequence(maskedseq, maskchar, 0, leftclip-1)
    if rightclip < len(maskedseq):
        maskedseq = mask_sequence(maskedseq, maskchar, rightclip,
                                  len(maskedseq))

    #... and every linker match found for this read
    for match in ssahapematches[data['name']]:
        linkername = match[2]
        leftreadhit = int(match[3])
        rightreadhit = int(match[4])
        maskedseq = mask_sequence(maskedseq, maskchar, leftreadhit-1,
                                  rightreadhit)

    correctedseq = correct_for_smallhits(maskedseq, maskchar, linkername)

    if len(maskedseq) != len(correctedseq):
        raise RuntimeError("Internal error: maskedseq != correctedseq")

    partialhits = False
    if correctedseq != maskedseq:
        partialhits = True
        readname += "_pl"
        maskedseq = correctedseq

    fragments = fragment_sequences(maskedseq, qual, maskchar)

    mlcflag = False
    if len(fragments) > 2:
        #multi linker contamination
        mlcflag = True
        readname += "_mlc"

    if mlcflag or partialhits:
        fragcounter = 1
        readname += ".part"
        for frag in fragments:
            actseq = frag[0]
            if len(actseq) >= 20:
                actqual = frag[1]
                oname = readname + str(fragcounter)
                write_sequence(oname, actseq, actqual, seq_fh, qual_fh)
                to_print = [create_basic_xml_info(oname, sff_fh.name)]
                # No clipping in XML ... the multiple and partial fragments
                # are clipped "hard"
                # No template ID and trace_end: we don't know the
                # orientation of the fragments. Even if it were
                # only two, the fact we had multiple linkers
                # says something went wrong, so simply do not
                # write any paired-end information for all these fragments
                to_print.append(' </trace>\n')
                xml_fh.write(''.join(to_print))
                numseqs += 1
                fragcounter += 1
    else:
        # here we have at most two fragments,
        # nothing will happen for 0 fragments
        if len(fragments) == 1:
            boundaries = calc_subseq_boundaries(maskedseq, maskchar)
            if len(boundaries) < 1 or len(boundaries) > 3:
                raise RuntimeError("Unexpected case: " + str(len(boundaries))
                                   + " boundaries for 1 fragment of "
                                   + readname)
            if len(boundaries) == 3:
                # case: mask char on both sides of sequence
                data['clip_adapter_left'] = 1 + boundaries[0][1]
                data['clip_adapter_right'] = boundaries[2][0]
            elif len(boundaries) == 2:
                # case: mask char left or right of sequence
                if maskedseq[0] == maskchar:
                    # case: mask char left
                    data['clip_adapter_left'] = 1 + boundaries[0][1]
                else:
                    # case: mask char right
                    data['clip_adapter_right'] = boundaries[1][0]
            data['name'] = data['name'] + ".fn"
            write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh)
            numseqs = 1
        elif len(fragments) == 2:
            #the left side: everything up to the first masked position
            #that follows an unmasked one, written as reverse complement
            oname = readname + ".r"

            startsearch = False
            for spos in range(len(maskedseq)):
                if maskedseq[spos] != maskchar:
                    startsearch = True
                elif startsearch:
                    break

            actseq = reverse_complement(seq[:spos])
            lreadlen = len(actseq)
            lqual = qual[:spos]
            lqual = lqual[::-1]

            write_sequence(oname, actseq, lqual, seq_fh, qual_fh)

            to_print = [create_basic_xml_info(oname, sff_fh.name)]
            to_print.append(create_clip_xml_info(
                lreadlen,
                0, lreadlen + 1 - data['clip_adapter_left'],
                0, lreadlen + 1 - data['clip_qual_left']))
            to_print.append(' <template_id>')
            to_print.append(readname)
            to_print.append('</template_id>\n')
            to_print.append(' <trace_end>r</trace_end>\n')
            to_print.append(' </trace>\n')
            xml_fh.write(''.join(to_print))

            #the right side: everything behind the last masked position
            #that precedes an unmasked one
            oname = readname + ".f"
            startsearch = False
            for spos in range(len(maskedseq)-1, -1, -1):
                if maskedseq[spos] != maskchar:
                    startsearch = True
                elif startsearch:
                    break

            actseq = seq[spos+1:]
            actqual = qual[spos+1:]

            write_sequence(oname, actseq, actqual, seq_fh, qual_fh)

            rreadlen = len(actseq)
            to_print = [create_basic_xml_info(oname, sff_fh.name)]
            to_print.append(create_clip_xml_info(
                rreadlen,
                0, rreadlen - (readlen - data['clip_adapter_right']),
                0, rreadlen - (readlen - data['clip_qual_right'])))
            to_print.append(' <template_id>')
            to_print.append(readname)
            to_print.append('</template_id>\n')
            to_print.append(' <trace_end>f</trace_end>\n')
            to_print.append(' </trace>\n')
            xml_fh.write(''.join(to_print))
            numseqs = 2

    return numseqs
935
936
937
def _collect_linker_matches(linker_fname, sff_file):
    '''Search every read of sff_file for the paired-end linker(s).

    The reads are written as FASTA (via get_read_data() and
    format_as_fasta()) into a temporary file, SSAHA2 is launched on it
    with launch_ssaha() and its output is parsed by read_ssaha_data(),
    which stores the matches in the module level ssahapematches dict.
    '''
    sys.stdout.flush()
    tmpfasta_fh = tempfile.NamedTemporaryFile(prefix='sffeseqs_',
                                              suffix='.fasta')
    try:
        sff_fh = open(sff_file, 'rb')
        try:
            header_data = read_header(fileh=sff_fh)
            for seq_data in sequences(fileh=sff_fh, header=header_data):
                seq, qual = get_read_data(seq_data)
                seqstring, qualstring = format_as_fasta(seq_data['name'],
                                                        seq, qual)
                tmpfasta_fh.write(seqstring)
        finally:
            # the original never closed this handle
            sff_fh.close()

        # kept from the original; presumably this also pushes the buffered
        # FASTA data to disk before SSAHA2 reads the file by name
        tmpfasta_fh.seek(0)

        tmpssaha_fh = tempfile.NamedTemporaryFile(prefix='sffealig_',
                                                  suffix='.ssaha2')
        try:
            launch_ssaha(linker_fname, tmpfasta_fh.name, tmpssaha_fh)
            tmpssaha_fh.seek(0)
            read_ssaha_data(tmpssaha_fh)
        finally:
            tmpssaha_fh.close()
    finally:
        tmpfasta_fh.close()


def extract_reads_from_sff(config, sff_files):
    '''Given the configuration and the list of sff_files it writes the seqs,
    qualities and ancillary data into the output file(s).

    If file for paired-end linker was given, first extracts all sequences
    of an SFF and searches these against the linker(s) with SSAHA2 to
    create needed information to split reads.

    config is the option dict; the keys read here are 'append',
    'seq_fname', 'xml_fname', 'qual_fname', 'want_fastq' and
    'pelinker_fname'.

    Raises RuntimeError when no SFF file is given or when one of them is
    empty. The output files are closed even when an error interrupts the
    conversion.
    '''
    global ssahapematches

    if not sff_files:
        raise RuntimeError("No SFF file given?")

    # check all inputs before any output file is created or truncated
    for sff_file in sff_files:
        if not os.path.getsize(sff_file):
            raise RuntimeError('Empty file? : ' + sff_file)
        # opening the file makes an unreadable input fail right here
        open(sff_file, 'r').close()

    if config['append']:
        openmode = 'a'
    else:
        openmode = 'w'

    seq_fh = None
    xml_fh = None
    qual_fh = None
    try:
        seq_fh = open(config['seq_fname'], openmode)
        xml_fh = open(config['xml_fname'], openmode)
        if config['want_fastq']:
            # no separate quality file in FASTQ mode; a left-over one is
            # removed on a best-effort basis
            try:
                os.remove(config['qual_fname'])
            except OSError:
                pass
        else:
            qual_fh = open(config['qual_fname'], openmode)

        if not config['append']:
            xml_fh.write('<?xml version="1.0"?>\n<trace_volume>\n')
        else:
            remove_last_xmltag_in_file(config['xml_fname'], "trace_volume")

        #we go through all input files
        for sff_file in sff_files:
            ssahapematches.clear()
            if config['pelinker_fname']:
                _collect_linker_matches(config['pelinker_fname'], sff_file)

            sys.stdout.flush()
            seqcheckstore = []
            last_seq_data = None
            sff_fh = open(sff_file, 'rb')
            try:
                header_data = read_header(fileh=sff_fh)
                #now convert all reads
                for seq_data in sequences(fileh=sff_fh, header=header_data):
                    seq = clip_read(seq_data)[0]
                    seqcheckstore.append(seq[0:50])

                    if seq_data['name'] in ssahapematches:
                        split_paired_end(seq_data, sff_fh, seq_fh, qual_fh,
                                         xml_fh)
                    else:
                        if config['pelinker_fname']:
                            seq_data['name'] = seq_data['name'] + ".fn"
                        write_unpaired_read(seq_data, sff_fh, seq_fh, qual_fh,
                                            xml_fh)
                    last_seq_data = seq_data
            finally:
                sff_fh.close()

            # without reads there is nothing to check (and no read data to
            # hand over)
            if seqcheckstore:
                check_for_dubious_startseq(seqcheckstore, sff_file,
                                           last_seq_data)

        xml_fh.write('</trace_volume>\n')
    finally:
        for out_fh in (xml_fh, seq_fh, qual_fh):
            if out_fh is not None:
                out_fh.close()

    return
1075
def check_for_dubious_startseq(seqcheckstore, sffname, seqdata):
    '''Warns when at least half of the reads start with the same bases.

    seqcheckstore is a list with the first bases of every (clipped) read,
    sffname the name of the SFF file used in the message and seqdata the
    data of one read, from which the current left clip is taken to suggest
    a --min_left_clip value.

    Prefixes of growing length are tested, up to the length of the first
    stored sequence minus one; the search stops at the first length for
    which no prefix is shared by 50% or more of the reads. If a shared
    prefix was found the warning is printed and the module level
    stern_warning flag is set.
    '''
    global stern_warning

    # nothing to compare, e.g. an SFF file without reads
    if not seqcheckstore:
        return

    nseqs = len(seqcheckstore)
    foundproblem = ""
    for checklen in range(1, len(seqcheckstore[0])):
        foundinloop = False
        seqdict = {}
        for seq in seqcheckstore:
            shortseq = seq[0:checklen]
            seqdict[shortseq] = seqdict.get(shortseq, 0) + 1

        for shortseq, count in seqdict.items():
            if float(count) / nseqs >= 0.5:
                foundinloop = True
                stern_warning = True
                # a longer prefix replaces the message of a shorter one
                foundproblem = "\n" + "*" * 80
                foundproblem += "\nWARNING: "
                foundproblem += "weird sequences in file " + sffname + "\n\n"
                foundproblem += "After applying left clips, " + str(count) + " sequences (="
                foundproblem += '%.0f' % (100.0 * float(count) / nseqs)
                foundproblem += "%) start with these bases:\n" + shortseq
                foundproblem += "\n\nThis does not look sane.\n\n"
                foundproblem += "Countermeasures you probably must take:\n"
                foundproblem += "1) Make your sequence provider aware of that problem and ask whether this can be\n corrected in the SFF.\n"
                foundproblem += "2) If you decide that this is not normal and your sequence provider does not\n react, use the --min_left_clip of sff_extract.\n"
                left = return_merged_clips(seqdata)[0]
                foundproblem += " (Probably '--min_left_clip=" + str(left + len(shortseq)) + "' but you should cross-check that)\n"
                foundproblem += "*" * 80 + "\n"
        if not foundinloop:
            break
    if len(foundproblem):
        print(foundproblem)
1112
1113
def parse_extra_info(info):
    '''It parses the information that will go in the xml file.

    There are two formats accepted for the extra information:
    key1:value1, key2:value2
    or:
    file1.sff{key1:value1, key2:value2};file2.sff{key3:value3}

    It returns a dict of dicts: file name -> {key: value}. Information
    given without a file name is stored under fake_sff_name. An empty or
    None info is returned unchanged.

    Only the first ':' of an item separates key and value, so values may
    contain colons. Empty sections (for instance after a trailing ';')
    are ignored. An item without any ':' raises ValueError.
    '''
    if not info:
        return info
    data_for_files = {}
    for finfo in info.split(';'):   #information for each file
        if not finfo.strip():
            continue
        #we split the file name from the rest
        items = finfo.split('{')
        if len(items) == 1:
            fname = fake_sff_name
            pairs = items[0]
        else:
            fname = items[0].strip()
            pairs = items[1]
        #now we get each key,value pair in the info
        pairs = pairs.replace('}', '')
        data = {}
        for item in pairs.split(','):
            key, value = item.strip().split(':', 1)
            data[key.strip()] = value.strip()
        data_for_files[fname] = data
    return data_for_files
1145
1146
def return_merged_clips(data):
    '''It returns the left and right positions to clip.

    The left position is the larger of the adapter and quality left clips,
    the right position the smaller of the adapter and quality right clips.
    A clip of zero is not taken into account. When both left clips are
    zero the left position is 1, when both right clips are zero the right
    position is the number of bases of the read.
    '''
    def pick(chooser, first, second):
        '''It applies chooser to the non-zero values.

        It returns None when both values are zero.
        '''
        nonzero = [value for value in (first, second) if value]
        if not nonzero:
            return None
        return chooser(nonzero)

    left = pick(max, data['clip_adapter_left'], data['clip_qual_left'])
    right = pick(min, data['clip_adapter_right'], data['clip_qual_right'])
    #maybe both clips were zero
    if left is None:
        left = 1
    if right is None:
        right = data['number_of_bases']
    return left, right
1187
def sequence_case(data):
    '''It returns the bases of one read with the clipped ends in lower case.

    The clip positions come from return_merged_clips(); everything in
    front of the left position and behind the right position is lowered,
    the part in between is returned as stored in data['bases'].
    '''
    left, right = return_merged_clips(data)
    bases = data['bases']
    head = bases[:left - 1].lower()
    kept = bases[left - 1:right]
    tail = bases[right:].lower()
    return head + kept + tail
1197
def clip_read(data):
    '''It returns the seq and qual of one read without the clipped ends.

    Both data['bases'] and data['quality_scores'] are cut to the region
    between the left and right positions given by return_merged_clips().
    '''
    qual = data['quality_scores']
    left, right = return_merged_clips(data)
    # the same window is applied to bases and qualities
    window = slice(left - 1, right)
    return data['bases'][window], qual[window]
1209
1210
1211
1212	def tests_for_ssaha(linker_fname):
1213	'''Tests whether SSAHA2 can be successfully called.'''
1214
1215	try:
1216	print "Testing whether SSAHA2 is installed and can be launched ... ",
1217	sys.stdout.flush()
1218	fh = open('/dev/null', 'w')
1219	retcode = subprocess.call(["ssaha2", "-v"], stdout = fh)
1220	fh.close()
1221	print "ok."
1222	except :
1223	print "nope? Uh oh ...\n\n"
1224	raise RuntimeError('Could not launch ssaha2. Have you installed it? Is it in your path?')
1225
1226
def load_linker_sequences(linker_fname):
    '''Loads all linker sequences into memory, storing only the length
    of each linker.

    The lengths are stored in the module level linkerlengths dict, with
    the linker name as key.

    Raises RuntimeError if the file is empty, holds no sequence or holds
    a sequence name that is already in linkerlengths.
    '''
    global linkerlengths

    if not os.path.getsize(linker_fname):
        raise RuntimeError("File empty? '" + linker_fname + "'")
    fh = open(linker_fname, 'r')
    try:
        linkerseqs = read_fasta(fh)
    finally:
        # closed also when read_fasta() complains about the file
        fh.close()
    if len(linkerseqs) == 0:
        raise RuntimeError(linker_fname + ": no sequence found?")
    for linker in linkerseqs:
        if linker.name in linkerlengths:
            raise RuntimeError(linker_fname + ": sequence '" + linker.name + "' present multiple times. Aborting.")
        linkerlengths[linker.name] = len(linker.sequence)
1244
1245
def launch_ssaha(linker_fname, query_fname, output_fh):
    '''Launches SSAHA2 on the linker and query file, storing SSAHA2 output
    into the output filehandle.

    Raises RuntimeError if ssaha2 can not be started or if it ends with a
    non-zero exit status.
    '''
    sys.stdout.write("Searching linker sequences with SSAHA2 (this may take a while) ... ")
    sys.stdout.flush()
    command = ["ssaha2", "-output", "ssaha2", "-solexa", "-kmer", "4",
               "-skip", "1", linker_fname, query_fname]
    try:
        failed = subprocess.call(command, stdout=output_fh) != 0
    except (OSError, ValueError):
        # the program could not be launched at all
        failed = True
    if failed:
        print("\n")
        raise RuntimeError('An error occured during the SSAHA2 execution, aborting.')
    print("ok.")
1261
def read_ssaha_data(ssahadata_fh):
    '''Given file handle, reads file generated with SSAHA2 (with default
    output format) and stores all matches in the ssahapematches
    (ssaha paired-end matches) dictionary.

    Only the lines that start with ALIGNMENT are used. Each one is split
    on whitespace and stored, without its first and last element, in the
    list kept under its third element (the read name).

    Raises RuntimeError if an ALIGNMENT line does not have 12 elements.
    '''
    global ssahapematches

    sys.stdout.write("Parsing SSAHA2 result file ... ")
    sys.stdout.flush()

    for line in ssahadata_fh:
        if not line.startswith('ALIGNMENT'):
            continue
        ml = line.split()
        if len(ml) != 12:
            sys.stdout.write("\n" + line)
            raise RuntimeError('Expected 12 elements in the SSAHA2 line with ALIGMENT keyword, but found ' + str(len(ml)))
        if ml[8] != 'F':
            # presumably a match on the reverse strand: the two position
            # fields are swapped (TODO confirm against the SSAHA2 docs)
            ml[4], ml[5] = ml[5], ml[4]
        # store everything except the first element (output
        # format name (ALIGNMENT)) and the last element
        # (length)
        ssahapematches.setdefault(ml[2], []).append(ml[1:-1])

    print("done.")
1294
1295
1296	##########################################################################
1297	#
1298	# BaCh: This block was shamelessly copied from
1299	# http://python.genedrift.org/2007/07/04/reading-fasta-files-conclusion/
1300	# and then subsequently modified to read fasta correctly
1301	# It's still not fool proof, but should be good enough
1302	#
1303	##########################################################################
1304
class Fasta:
    '''One FASTA record: the name and the sequence, both as strings.'''
    def __init__(self, name, sequence):
        self.name = name
        self.sequence = sequence


def read_fasta(file):
    '''Reads FASTA records from an open file (or any iterable of lines).

    It returns a list of Fasta objects. The name of a record is the text
    between the ">" and the first whitespace of its header line, the
    sequence is made of the following lines joined without whitespace at
    their ends. Blank lines are ignored and records without sequence are
    not returned.

    Raises RuntimeError if a header line has no name or if sequence data
    shows up before the first header line.
    '''
    # handles without a name attribute (e.g. StringIO) must not break the
    # error messages
    source = getattr(file, 'name', '<unknown>')
    items = []
    aninstance = Fasta('', '')
    for linenum, line in enumerate(file, 1):
        if not line.strip():
            continue
        if line.startswith(">"):
            if len(aninstance.sequence):
                items.append(aninstance)
            # name == all characters until the first whitespace
            # (split()[0]) but without the starting ">" ([1:])
            aninstance = Fasta(line.split()[0][1:], '')
            if len(aninstance.name) == 0:
                raise RuntimeError(source + ': no name in line ' + str(linenum) + '?')
        else:
            if len(aninstance.name) == 0:
                raise RuntimeError(source + ': no sequence header at line ' + str(linenum) + '?')
            aninstance.sequence += line.strip()

    if len(aninstance.name) and len(aninstance.sequence):
        items.append(aninstance)

    return items
1336	##########################################################################
1337
def version_string():
    '''It returns the program name followed by the module __version__.'''
    return " ".join(("sff_extract", __version__))
1340
def read_config():
    '''It reads the configuration options from the command line arguments and
    it returns a dict with them, together with the remaining arguments.

    The dict is also stored in the module level name config. Output file
    names that were not given explicitly are derived from the base name
    (-o), which itself defaults to the last command line argument without
    its .sff extension, or to 'reads'. The xml_info option is replaced by
    the result of parse_extra_info().
    '''
    global config
    from optparse import OptionParser, OptionGroup

    parser = OptionParser(
        usage="usage: %prog [options] sff1 sff2 ...",
        version=version_string(),
        description=(
            "Extract sequences from 454 SFF files into FASTA, FASTA quality"
            " and XML traceinfo format. When a paired-end linker sequence"
            " is given (-l), use SSAHA2 to scan the sequences for the linker,"
            " then split the sequences, removing the linker."))

    # (option strings, keyword arguments) in the order they are registered
    general_options = (
        (('-a', '--append'),
         dict(action="store_true", dest='append',
              help='append output to existing files', default=False)),
        (('-i', '--xml_info'),
         dict(dest='xml_info',
              help='extra info to write in the xml file')),
        (("-l", "--linker_file"),
         dict(dest="pelinker_fname",
              help="FASTA file with paired-end linker sequences",
              metavar="FILE")),
    )
    sequence_options = (
        (('-c', '--clip'),
         dict(action="store_true", dest='clip',
              help='clip (completely remove) ends with low qual and/or adaptor sequence',
              default=False)),
        (('-u', '--upper_case'),
         dict(action="store_false", dest='mix_case',
              help='all bases in upper case, including clipped ends',
              default=True)),
        (('', '--min_left_clip'),
         dict(dest='min_leftclip', metavar="INTEGER", type="int",
              help='if the left clip coming from the SFF is smaller than this value, override it',
              default=0)),
        (('-Q', '--fastq'),
         dict(action="store_true", dest='want_fastq',
              help='store as FASTQ file instead of FASTA + FASTA quality file',
              default=False)),
    )
    fname_options = (
        (("-o", "--out_basename"),
         dict(dest="basename", help="base name for all output files")),
        (("-s", "--seq_file"),
         dict(dest="seq_fname", help="output sequence file name",
              metavar="FILE")),
        (("-q", "--qual_file"),
         dict(dest="qual_fname", help="output quality file name",
              metavar="FILE")),
        (("-x", "--xml_file"),
         dict(dest="xml_fname", help="output ancillary xml file name",
              metavar="FILE")),
    )

    for flags, settings in general_options:
        parser.add_option(*flags, **settings)
    # both groups carry the same title, exactly as before
    for option_specs in (sequence_options, fname_options):
        group = OptionGroup(parser, "File name options", "")
        for flags, settings in option_specs:
            group.add_option(*flags, **settings)
        parser.add_option_group(group)

    #default fnames
    #is there an sff file?
    basename = 'reads'
    last_arg = sys.argv[-1]
    if last_arg[-4:].lower() == '.sff':
        basename = last_arg[:-4]
    def_seq_fname = basename + '.fasta'
    def_qual_fname = basename + '.fasta.qual'
    def_xml_fname = basename + '.xml'
    parser.set_defaults(seq_fname=def_seq_fname,
                        qual_fname=def_qual_fname,
                        xml_fname=def_xml_fname,
                        pelinker_fname='')

    #we parse the cmd line
    options, args = parser.parse_args()

    #we put the result in a dict
    not_options = ('ensure_value', 'read_file', 'read_module')
    config = {}
    for attr in dir(options):
        if attr.startswith('_') or attr in not_options:
            continue
        config[attr] = getattr(options, attr)

    if config['basename'] is None:
        config['basename'] = basename
    out_base = config['basename']

    #if we have not set a file name with -s, -q or -x we set the basename
    #based file name
    seq_is_default = config['seq_fname'] == def_seq_fname
    if config['want_fastq']:
        config['qual_fname'] = ''
        if seq_is_default:
            config['seq_fname'] = out_base + '.fastq'
    else:
        if seq_is_default:
            config['seq_fname'] = out_base + '.fasta'
        if config['qual_fname'] == def_qual_fname:
            config['qual_fname'] = out_base + '.fasta.qual'

    if config['xml_fname'] == def_xml_fname:
        config['xml_fname'] = out_base + '.xml'

    #we parse the extra info for the xml file
    config['xml_info'] = parse_extra_info(config['xml_info'])
    return config, args
1428
1429
1430
1431	##########################################################################
1432
1433
1434	def testsome():
1435	sys.exit()
1436	return
1437
1438
def debug():
    '''Scratch routine, apparently for development only: it runs the
    SSAHA2 search and the parsing of its output on hard coded files.

    It reads the configuration, loads the linker sequences, launches
    SSAHA2 on 'FLVI58L05.fa' writing into 'sffe.tmp.15603.ssaha2', parses
    that file and ends the program. OSError, IOError and RuntimeError are
    printed before the program ends.
    '''
    fasta_fname = 'FLVI58L05.fa'
    ssaha_fname = 'sffe.tmp.' + str(15603) + '.ssaha2'
    try:
        config, args = read_config()
        linker_fname = config['pelinker_fname']
        load_linker_sequences(linker_fname)

        # the FASTA file is only opened to make sure it is there
        tmpfasta_fh = open(fasta_fname, 'r')
        tmpssaha_fh = open(ssaha_fname, 'w')
        launch_ssaha(linker_fname, tmpfasta_fh.name, tmpssaha_fh)

        tmpssaha_fh = open(ssaha_fname, 'r')
        read_ssaha_data(tmpssaha_fh)
    except (OSError, IOError, RuntimeError) as errval:
        print(errval)
    sys.exit()
1473
1474
def main():
    '''Command line entry point; it returns the exit status.

    Without any argument the help is shown and the program ends. Otherwise
    the options are read, the linker sequences are loaded if a linker file
    was given and the SFF files are converted.

    It returns 1 if an OSError, IOError or RuntimeError was raised (the
    error is printed) or if stern_warning is set, 0 otherwise.
    '''
    if len(sys.argv) == 1:
        # optparse prints the help for -h
        sys.argv.append('-h')
        read_config()
        sys.exit()

    try:
        config, args = read_config()
        linker_fname = config['pelinker_fname']
        if linker_fname:
            load_linker_sequences(linker_fname)
        if not args:
            raise RuntimeError("No SFF file given?")
        extract_reads_from_sff(config, args)
    except (OSError, IOError, RuntimeError) as errval:
        print(errval)
        return 1

    return 1 if stern_warning else 0
1501
1502
1503
if __name__ == "__main__":
    # the return value of main() becomes the exit status of the script
    raise SystemExit(main())

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/filters/sff_extract.py @ 2

異なるフォーマットでダウンロード: