#!/usr/bin/python
'''This software extracts the seq, qual and ancillary information from an sff
file, like the ones used by the 454 sequencer.

Optionally, it can also split paired-end reads if given the linker sequence.
The splitting is done with maximum match, i.e., every occurrence of the linker
sequence will be removed, even if occurring multiple times.'''
---|
| 8 | |
---|
| 9 | #copyright Jose Blanca and Bastien Chevreux |
---|
| 10 | #COMAV institute, Universidad Politecnica de Valencia (UPV) |
---|
| 11 | #Valencia, Spain |
---|
| 12 | |
---|
| 13 | # additions to handle paired end reads by Bastien Chevreux |
---|
| 14 | # bugfixes for linker specific lengths: Lionel Guy |
---|
| 15 | |
---|
| 16 | #This program is free software: you can redistribute it and/or modify |
---|
| 17 | #it under the terms of the GNU General Public License as published by |
---|
| 18 | #the Free Software Foundation, either version 3 of the License, or |
---|
| 19 | #(at your option) any later version. |
---|
| 20 | #This program is distributed in the hope that it will be useful, |
---|
| 21 | #but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
| 22 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
| 23 | #GNU General Public License for more details. |
---|
| 24 | #You should have received a copy of the GNU General Public License |
---|
| 25 | #along with this program. If not, see <http://www.gnu.org/licenses/>. |
---|
| 26 | |
---|
| 27 | __author__ = 'Jose Blanca and Bastien Chevreux' |
---|
| 28 | __copyright__ = 'Copyright 2008, Jose Blanca, COMAV, and Bastien Chevreux' |
---|
| 29 | __license__ = 'GPLv3 or later' |
---|
| 30 | __version__ = '0.2.8' |
---|
| 31 | __email__ = 'jblanca@btc.upv.es' |
---|
| 32 | __status__ = 'beta' |
---|
| 33 | |
---|
| 34 | import struct |
---|
| 35 | import sys |
---|
| 36 | import os |
---|
| 37 | import subprocess |
---|
| 38 | import tempfile |
---|
| 39 | |
---|
| 40 | |
---|
# Placeholder key used for xml_info entries that were given without an
# explicit sff file name.
fake_sff_name = 'fake_sff_name'


# readname as key: lines with matches from SSAHA, one best match
ssahapematches = {}
# linker readname as key: length of linker sequence
linkerlengths = {}

# set to true if something really fishy is going on with the sequences
# NOTE(review): despite the comment above, this is initialised to True,
# not False -- confirm whether the initial value is intentional.
stern_warning = True
---|
| 51 | |
---|
def read_bin_fragment(struct_def, fileh, offset=0, data=None,
                      byte_padding=None):
    '''Read a chunk of a binary file according to struct_def.

    struct_def is a sequence of (field_name, struct_format) pairs; every
    field is read big-endian, starting at offset. An optional dict may be
    supplied and will be populated with the extracted values; fields that
    unpack to a single value are stored unwrapped, not as 1-tuples.
    When byte_padding is given, the reported byte count is rounded up to
    a multiple of it (to skip the pad at the end of the fragment).

    Returns a (bytes_read, data) tuple.
    '''
    if data is None:
        data = {}

    consumed = 0
    for name, fmt in struct_def:
        # position the file on the next field and read it
        fileh.seek(offset + consumed)
        size = struct.calcsize(fmt)
        values = struct.unpack('>' + fmt, fileh.read(size))
        # single-value fields are stored as scalars
        data[name] = values[0] if len(values) == 1 else values
        consumed += size

    if byte_padding is not None:
        # round up to the next multiple of the padding
        pad = byte_padding
        consumed = ((consumed + pad - 1) // pad) * pad

    return (consumed, data)
---|
| 87 | |
---|
| 88 | |
---|
def check_magic(magic):
    '''Verify that magic is the sff magic number; raise RuntimeError if not.'''
    SFF_MAGIC = 779314790  # 0x2E736666, the ascii bytes ".sff"
    if magic == SFF_MAGIC:
        return
    raise RuntimeError('This file does not seems to be an sff file.')
---|
| 93 | |
---|
def check_version(version):
    '''Check that the sff version is the supported one (0.0.0.1).

    Raises RuntimeError for any other version.
    '''
    supported = ('\x00', '\x00', '\x00', '\x01')
    for i, byte in enumerate(version):
        if byte != supported[i]:
            raise RuntimeError('SFF version not supported. Please contact the author of the software.')
---|
| 102 | |
---|
def read_header(fileh):
    '''Read the sff file header and return its fields as a dict.

    Validates the magic number and the file version before reading the
    variable-length part (flow characters and key sequence).
    '''
    # fixed-size part of the header
    fixed_part = [
        ('magic_number', 'I'),
        ('version', 'cccc'),
        ('index_offset', 'Q'),
        ('index_length', 'I'),
        ('number_of_reads', 'I'),
        ('header_length', 'H'),
        ('key_length', 'H'),
        ('number_of_flows_per_read', 'H'),
        ('flowgram_format_code', 'B'),
    ]
    data = {}
    first_bytes, data = read_bin_fragment(struct_def=fixed_part, fileh=fileh,
                                          offset=0, data=data)
    check_magic(data['magic_number'])
    check_version(data['version'])
    # the size of the variable part depends on values just read
    variable_part = [
        ('flow_chars', str(data['number_of_flows_per_read']) + 'c'),
        ('key_sequence', str(data['key_length']) + 'c'),
    ]
    read_bin_fragment(struct_def=variable_part, fileh=fileh,
                      offset=first_bytes, data=data)
    return data
---|
| 132 | |
---|
| 133 | |
---|
def read_sequence(header, fileh, fposition):
    '''Read one read from the sff file located at fposition.

    Returns (bytes_consumed, data) where data maps field names (name,
    bases, quality_scores, the four clip points, flowgram values, ...)
    to their values. The clip points are normalised following the NCBI
    Trace Archive RFC. Honours the global config dict: 'min_leftclip'
    overrides too-small left clips, and 'clip' triggers hard clipping
    via clip_read().

    Raises RuntimeError for unsupported flowgram format codes.
    '''
    # struct for the fixed part of the read header
    read_header_1 = [
        ('read_header_length', 'H'),
        ('name_length', 'H'),
        ('number_of_bases', 'I'),
        ('clip_qual_left', 'H'),
        ('clip_qual_right', 'H'),
        ('clip_adapter_left', 'H'),
        ('clip_adapter_right', 'H'),
    ]

    def read_header_2(name_length):
        '''It returns the struct definition for the second part of the
        header (the read name).'''
        return [('name', str(name_length) + 'c')]

    def read_data(number_of_bases):
        '''It returns the struct definition for the read data section.'''
        if header['flowgram_format_code'] == 1:
            flow_type = 'H'
        else:
            # bug fix: this used to raise the undefined name "Error",
            # which itself was a NameError at runtime
            raise RuntimeError('file version not supported')
        number_of_bases = str(number_of_bases)
        return [
            ('flowgram_values', str(header['number_of_flows_per_read']) +
                                                                flow_type),
            ('flow_index_per_base', number_of_bases + 'B'),
            ('bases', number_of_bases + 'c'),
            ('quality_scores', number_of_bases + 'B'),
        ]

    data = {}
    # the fixed part of the read header
    bytes_read, data = read_bin_fragment(struct_def=read_header_1,
                                         fileh=fileh, offset=fposition,
                                         data=data)
    # the variable part: the read name
    read_bin_fragment(struct_def=read_header_2(data['name_length']),
                      fileh=fileh, offset=fposition + bytes_read, data=data)
    # join the individual letters of the name
    data['name'] = ''.join(data['name'])
    offset = data['read_header_length']
    # the sequence, qualities and flowgram live in an 8-byte-padded section
    read_data_st = read_data(data['number_of_bases'])
    bytes_read, data = read_bin_fragment(struct_def=read_data_st,
                                         fileh=fileh,
                                         offset=fposition + offset,
                                         data=data, byte_padding=8)
    # join the bases
    data['bases'] = ''.join(data['bases'])

    # a right clip <= the left clip means "whole sequence":
    # reset both to 0
    if data['clip_qual_right'] <= data['clip_qual_left']:
        data['clip_qual_right'] = 0
        data['clip_qual_left'] = 0
    if data['clip_adapter_right'] <= data['clip_adapter_left']:
        data['clip_adapter_right'] = 0
        data['clip_adapter_left'] = 0

    # the clipping section follows the NCBI Trace Archive RFC
    # http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=rfc&m=doc&s=rfc
    # if there's no adapter clip: qual -> vector
    # else: qual -> qual, adapter -> vector
    if not data['clip_adapter_left']:
        data['clip_adapter_left'], data['clip_qual_left'] = data['clip_qual_left'], data['clip_adapter_left']
    if not data['clip_adapter_right']:
        data['clip_adapter_right'], data['clip_qual_right'] = data['clip_qual_right'], data['clip_adapter_right']

    # see whether we have to override the minimum left clips
    if config['min_leftclip'] > 0:
        if 0 < data['clip_adapter_left'] < config['min_leftclip']:
            data['clip_adapter_left'] = config['min_leftclip']
        if 0 < data['clip_qual_left'] < config['min_leftclip']:
            data['clip_qual_left'] = config['min_leftclip']

    # for handling the -c (clip) option gently, we already clip here
    # and set all clip points to the sequence end points
    if config['clip']:
        data['bases'], data['quality_scores'] = clip_read(data)

        data['number_of_bases'] = len(data['bases'])
        data['clip_qual_right'] = data['number_of_bases']
        data['clip_adapter_right'] = data['number_of_bases']
        data['clip_qual_left'] = 0
        data['clip_adapter_left'] = 0

    return data['read_header_length'] + bytes_read, data
---|
| 241 | |
---|
| 242 | |
---|
def sequences(fileh, header):
    '''Generator that yields the data dict of every read in the sff file.

    Walks the file from the end of the header, skipping over the index
    section if it is embedded between reads, and stops after
    header['number_of_reads'] reads.
    '''
    fposition = header['header_length']  # position in the file
    reads_read = 0
    while True:
        if fposition == header['index_offset']:
            # we have to skip the index section
            # bug fix: this used to read the undefined local name
            # "index_length" (a NameError whenever the index was hit)
            fposition += header['index_length']
            continue
        else:
            bytes_read, seq_data = read_sequence(header=header, fileh=fileh,
                                                 fposition=fposition)
            yield seq_data
            fposition += bytes_read
            reads_read += 1
            if reads_read >= header['number_of_reads']:
                break
---|
| 261 | |
---|
| 262 | |
---|
def remove_last_xmltag_in_file(fname, tag=None):
    '''Given an xml file name and a tag, it removes the last tag of the
    file if it matches the given tag. Tag removal is performed via file
    truncation.

    If the given tag is not the last in the file, a RuntimeError will be
    raised.

    The resulting xml file will be not xml valid. This function is a hack
    that allows to append records to xml files in a quick and dirty way.
    '''

    # NOTE(review): opened in text mode; the end-relative seeks below rely
    # on python 2 file semantics (py3 text files reject nonzero
    # end-relative seeks).
    fh = open(fname, 'r+')
    #we have to read from the end to the start of the file and keep the
    #string enclosed by </ >
    i = -1
    last_tag = [] #the chars that form the last tag
    start_offset = None #in which byte does the last tag starts?
    end_offset = None #in which byte does the last tag ends?
    while True:
        # i is a negative offset from the end of the file (whence=2)
        fh.seek(i, 2)
        char = fh.read(1)
        if not char.isspace():
            last_tag.append(char)
        if char == '>':
            end_offset = i
        if char == '<':
            start_offset = i
            break
        i -= 1

    #we have read the last tag backwards
    last_tag = ''.join(last_tag[::-1])
    #we remove the </ and >
    last_tag = last_tag.rstrip('>').lstrip('</')

    #we check that we're removing the asked tag
    if tag is not None and tag != last_tag:
        raise RuntimeError("The given xml tag wasn't the last one in the file")

    # while we are at it: also remove all white spaces in that line :-)
    i -= 1
    while True:
        fh.seek(i, 2)
        char = fh.read(1)
        if not char == ' ' and not char == '\t':
            break;
        if fh.tell() == 1:
            break;
        i -= 1

    # the file pointer now sits just before the closing tag (plus its
    # leading whitespace); truncating here drops the tag from the file
    fh.truncate();

    fh.close()
    return last_tag
---|
| 318 | |
---|
| 319 | |
---|
def create_basic_xml_info(readname, fname):
    '''Format a number of read-specific infos into XML.

    Currently formatted: the read name and the extra tags configured on
    the command line (global config['xml_info']), looked up first by the
    sff file name and then under the fake_sff_name fallback key.
    '''
    parts = [' <trace>\n', ' <trace_name>', readname, '</trace_name>\n']

    # do we have extra info configured, and for this file?
    extra = None
    if config['xml_info']:
        xml_info = config['xml_info']
        if fname in xml_info:
            extra = xml_info[fname]
        else:
            # fall back to the entry given without a file name, if any
            extra = xml_info.get(fake_sff_name)

    # append whatever extra info we found
    if extra:
        for key in extra:
            parts.append(' <' + key + '>' + extra[key] +
                         '</' + key + '>\n')

    return ''.join(parts)
---|
| 349 | |
---|
| 350 | |
---|
def create_clip_xml_info(readlen, adapl, adapr, quall, qualr):
    '''Format the clip values of a read into XML.

    Corrects "wrong" values that might have resulted from simplified
    calculations earlier in the conversion (especially during the
    splitting of paired-end reads): right clips at or past the read end
    mean "no clip" and are dropped, and negative values (possible after
    split_paired_end() arithmetic) are clamped to 0. Zero values produce
    no tag at all.
    '''
    # right borders >= read length don't need to be printed
    if adapr >= readlen:
        adapr = 0
    if qualr >= readlen:
        qualr = 0

    # clamp negatives to 0 in one central place instead of scattering
    # if-clauses over the different calculations upstream
    adapl = max(adapl, 0)
    adapr = max(adapr, 0)
    quall = max(quall, 0)
    qualr = max(qualr, 0)

    parts = []
    for value, tagname in ((quall, 'clip_quality_left'),
                           (qualr, 'clip_quality_right'),
                           (adapl, 'clip_vector_left'),
                           (adapr, 'clip_vector_right')):
        if value:
            parts.append(' <%s>%s</%s>\n' % (tagname, value, tagname))
    return ''.join(parts)
---|
| 400 | |
---|
| 401 | |
---|
def create_xml_for_unpaired_read(data, fname):
    '''Return the XML ancillary data block for one unpaired read as a str.'''
    pieces = [create_basic_xml_info(data['name'], fname)]
    # clip points go into the XML only when we do not hard clip
    if not config['clip']:
        pieces.append(create_clip_xml_info(
            data['number_of_bases'],
            data['clip_adapter_left'], data['clip_adapter_right'],
            data['clip_qual_left'], data['clip_qual_right']))
    pieces.append(' </trace>\n')
    return ''.join(pieces)
---|
| 411 | |
---|
| 412 | |
---|
def format_as_fasta(name, seq, qual):
    '''Return (sequence_record, quality_record) as FASTA-formatted strings.'''
    header = '>%s\n' % name
    seqstring = header + seq + '\n'
    qualstring = header + ' '.join(str(q) for q in qual) + '\n'
    return seqstring, qualstring
---|
| 419 | |
---|
def format_as_fastq(name, seq, qual):
    '''Return one FASTQ record (Sanger +33 quality encoding) as a string.'''
    encoded_qual = ''.join(chr(q + 33) for q in qual)
    return '@%s\n%s\n+\n%s\n' % (name, seq, encoded_qual)
---|
| 425 | |
---|
| 426 | |
---|
def get_read_data(data):
    '''Return (seq, qual) for one read.

    The sequence is mixed-case (via sequence_case) when the global
    config['mix_case'] is set, otherwise the raw bases.
    '''
    qual = data['quality_scores']
    if config['mix_case']:
        return sequence_case(data), qual
    return data['bases'], qual
---|
| 439 | |
---|
def extract_read_info(data, fname):
    '''Given the data for one read, return 3 strs: the fasta seq, the
    fasta qual and the xml ancillary data.

    (Cleanup: removed the block of dead, commented-out formatting code
    that duplicated format_as_fasta.)
    '''
    seq, qual = get_read_data(data)
    seqstring, qualstring = format_as_fasta(data['name'], seq, qual)
    xmlstring = create_xml_for_unpaired_read(data, fname)
    return seqstring, qualstring, xmlstring
---|
| 455 | |
---|
def write_sequence(name, seq, qual, seq_fh, qual_fh):
    '''Write one sequence to the output filehandles.

    When qual_fh is None the record is written as FASTQ to seq_fh alone;
    otherwise as FASTA to seq_fh plus FASTA qual to qual_fh.
    Zero-length sequences are silently skipped.
    '''
    if not seq:
        return
    if qual_fh is None:
        seq_fh.write(format_as_fastq(name, seq, qual))
        return
    seqstring, qualstring = format_as_fasta(name, seq, qual)
    seq_fh.write(seqstring)
    qual_fh.write(qualstring)
---|
| 470 | |
---|
def write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh):
    '''Write one unpaired read into the FASTA/FASTA-qual (or FASTQ)
    filehandles plus its ancillary data into the XML filehandle.

    Zero-length sequences are silently skipped.
    '''
    seq, qual = get_read_data(data)
    if not seq:
        return

    write_sequence(data['name'], seq, qual, seq_fh, qual_fh)

    anci = create_xml_for_unpaired_read(data, sff_fh.name)
    if anci is not None:
        xml_fh.write(anci)
---|
| 485 | |
---|
| 486 | |
---|
# Complement lookup for IUPAC nucleotide codes (both cases) plus '*'.
# Hoisted to module level so the table is built once, not on every call.
_COMPLEMENTS = {
    'a': 't', 'c': 'g', 'g': 'c', 't': 'a', 'u': 't',
    'm': 'k', 'r': 'y', 'w': 'w', 's': 's', 'y': 'r', 'k': 'm',
    'v': 'b', 'h': 'd', 'd': 'h', 'b': 'v',
    'x': 'x', 'n': 'n',
    'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'U': 'T',
    'M': 'K', 'R': 'Y', 'W': 'W', 'S': 'S', 'Y': 'R', 'K': 'M',
    'V': 'B', 'H': 'D', 'D': 'H', 'B': 'V',
    'X': 'X', 'N': 'N',
    '*': '*',
}


def reverse_complement(seq):
    '''Return the reverse complement of a DNA sequence as a string.

    Raises KeyError if seq contains a character that has no defined
    complement (same behaviour as before, just with a shared table).
    '''
    complseq = ''.join([_COMPLEMENTS[base] for base in seq])
    # python hack to reverse a list/string/etc
    return complseq[::-1]
---|
| 532 | |
---|
| 533 | |
---|
def mask_sequence(seq, maskchar, fpos, tpos):
    '''Return seq with the positions [fpos, tpos) replaced by maskchar.

    Out-of-range bounds are clamped to the sequence; maskchar must be a
    single character.
    '''
    if len(maskchar) > 1:
        raise RuntimeError("Internal error: more than one character given to mask_sequence")
    fpos = max(fpos, 0)
    tpos = min(tpos, len(seq))
    return seq[:fpos] + maskchar * (tpos - fpos) + seq[tpos:]
---|
| 549 | |
---|
| 550 | |
---|
def fragment_sequences(sequence, qualities, splitchar):
    '''Split a sequence (and its parallel quality list) on runs of splitchar.

    Works like split() on strings, except it operates on the sequence and
    the corresponding quality values at the same time. Returns a list of
    (fragment_sequence, fragment_qualities) tuples; runs of splitchar and
    empty fragments are dropped.

    Raises RuntimeError when sequence and qualities differ in length.
    (Fix: the old code dumped both values to stdout with a bare py2
    print before raising; the exception alone is the right channel.)
    '''
    if len(sequence) != len(qualities):
        raise RuntimeError("Internal error: length of sequence and qualities don't match???")

    fragments = []
    actseq = []
    actqual = []
    for char, qual in zip(sequence, qualities):
        if char != splitchar:
            # inside a fragment: keep accumulating
            actseq.append(char)
            actqual.append(qual)
        elif actseq:
            # hit a splitchar with a non-empty fragment: flush it
            fragments.append((''.join(actseq), actqual))
            actseq = []
            actqual = []

    # flush a trailing fragment, if any
    if actseq:
        fragments.append((''.join(actseq), actqual))

    return fragments
---|
| 594 | |
---|
| 595 | |
---|
def calc_subseq_boundaries(maskedseq, maskchar):
    '''Return the [start, stop) boundaries of alternating masked and
    unmasked stretches of maskedseq.

    E.g.:
    ........xxxxxxxx..........xxxxxxxxxxxxxxxxxxxxx.........
    to
    (0,8),(8,16),(16,26),(26,47),(47,56)
    '''
    bounds = []
    if not maskedseq:
        return bounds

    inmask = maskedseq[0] == maskchar
    start = 0
    for pos, char in enumerate(maskedseq):
        # a transition happens whenever the masked-state of the current
        # character differs from the state of the running stretch
        if inmask != (char == maskchar):
            bounds.append([start, pos])
            start = pos
            inmask = not inmask

    # close the final stretch
    bounds.append([start, len(maskedseq)])

    return bounds
---|
| 625 | |
---|
| 626 | |
---|
def correct_for_smallhits(maskedseq, maskchar, linkername):
    '''If partial linker hits were found, take a preventive measure: grow
    the masked areas by 20 bases in each direction.

    Returns either the unchanged maskedseq or a new sequence with more
    characters masked. Reads the global linkerlengths dict to know how
    long a full hit for linkername would be.

    (Cleanup: removed the dead CEBUG debug machinery, which was py2-only
    print statements behind a constant 0 flag; integer divisions now use
    // so the values stay ints under py2 and py3 alike.)
    '''
    global linkerlengths

    if len(maskedseq) == 0:
        return maskedseq

    growl = 40
    growl2 = growl // 2

    boundaries = calc_subseq_boundaries(maskedseq, maskchar)

    # a masked stretch clearly shorter than the linker counts as a
    # partial hit; a 10% discrepancy is allowed as a safety net for
    # slight sequencing errors in the linker itself. Stretches touching
    # either end of the read are not considered.
    foundpartial = False
    for left, right in boundaries:
        if left == 0 or right == len(maskedseq):
            continue
        if maskedseq[left] == maskchar:
            if right - left < linkerlengths[linkername] - linkerlengths[linkername] // 10:
                foundpartial = True

    if not foundpartial:
        return maskedseq

    # grow each masked stretch by growl2 into the neighbouring clear
    # sequence; clear stretches that would shrink to nothing are masked
    # completely
    newseq = ""
    for left, right in boundaries:
        if maskedseq[left] == maskchar:
            newseq += maskedseq[left:right]
        else:
            clearstart = 0
            if left > 0:
                clearstart = left + growl2
            clearstop = len(maskedseq)
            if right < len(maskedseq):
                clearstop = right - growl2

            if clearstop <= clearstart:
                newseq += maskchar * (right - left)
            else:
                if clearstart != left:
                    newseq += maskchar * growl2
                newseq += maskedseq[clearstart:clearstop]
                if clearstop != right:
                    newseq += maskchar * growl2

    return newseq
---|
| 696 | |
---|
| 697 | |
---|
| 698 | def split_paired_end(data, sff_fh, seq_fh, qual_fh, xml_fh): |
---|
| 699 | '''Splits a paired end read and writes sequences into FASTA, FASTA qual |
---|
| 700 | and XML traceinfo file. Returns the number of sequences created. |
---|
| 701 | |
---|
| 702 | As the linker sequence may be anywhere in the read, including the ends |
---|
| 703 | and overlapping with bad quality sequence, we need to perform some |
---|
| 704 | computing and eventually set new clip points. |
---|
| 705 | |
---|
| 706 | If the resulting split yields only one sequence (because linker |
---|
| 707 | was not present or overlapping with left or right clip), only one |
---|
| 708 | sequence will be written with ".fn" appended to the name. |
---|
| 709 | |
---|
| 710 | If the read can be split, two reads will be written. The side left of |
---|
| 711 | the linker will be named ".r" and will be written in reverse complement |
---|
| 712 | into the file to conform with what approximately all assemblers expect |
---|
| 713 | when reading paired-end data: reads in forward direction in file. The side |
---|
| 714 | right of the linker will be named ".f" |
---|
| 715 | |
---|
| 716 | If SSAHA found partial linker (linker sequences < length of linker), |
---|
| 717 | the sequences will get a "_pl" furthermore be cut back thoroughly. |
---|
| 718 | |
---|
| 719 | If SSAHA found multiple occurences of the linker, the names will get an |
---|
| 720 | additional "_mlc" within the name to show that there was "multiple |
---|
| 721 | linker contamination". |
---|
| 722 | |
---|
| 723 | For multiple or partial linker, the "good" parts of the reads are |
---|
| 724 | stored with a ".part<number>" name, additionally they will not get |
---|
| 725 | template information in the XML |
---|
| 726 | ''' |
---|
| 727 | |
---|
| 728 | global ssahapematches |
---|
| 729 | |
---|
| 730 | CEBUG = 0 |
---|
| 731 | |
---|
| 732 | maskchar = "#" |
---|
| 733 | |
---|
| 734 | if CEBUG : print "Need to split: " + data['name'] |
---|
| 735 | |
---|
| 736 | numseqs = 0; |
---|
| 737 | readname = data['name'] |
---|
| 738 | readlen = data['number_of_bases'] |
---|
| 739 | |
---|
| 740 | leftclip, rightclip = return_merged_clips(data) |
---|
| 741 | seq, qual = get_read_data(data) |
---|
| 742 | |
---|
| 743 | if CEBUG : print "Original read:\n",seq |
---|
| 744 | |
---|
| 745 | maskedseq = seq |
---|
| 746 | if leftclip > 0: |
---|
| 747 | maskedseq = mask_sequence(maskedseq, maskchar, 0, leftclip-1) |
---|
| 748 | if rightclip < len(maskedseq): |
---|
| 749 | maskedseq = mask_sequence(maskedseq, maskchar, rightclip, len(maskedseq)) |
---|
| 750 | |
---|
| 751 | leftclip, rightclip = return_merged_clips(data) |
---|
| 752 | readlen = data['number_of_bases'] |
---|
| 753 | |
---|
| 754 | if CEBUG : print "Readname:", readname |
---|
| 755 | if CEBUG : print "Readlen:", readlen |
---|
| 756 | if CEBUG : print "Num matches:", str(len(ssahapematches[data['name']])) |
---|
| 757 | if CEBUG : print "matches:", ssahapematches[data['name']] |
---|
| 758 | |
---|
| 759 | for match in ssahapematches[data['name']]: |
---|
| 760 | score = int(match[0]) |
---|
| 761 | linkername = match[2] |
---|
| 762 | leftreadhit = int(match[3]) |
---|
| 763 | rightreadhit = int(match[4]) |
---|
| 764 | #leftlinkerhit = int(match[5]) |
---|
| 765 | #rightlinkerhit = int(match[6]) |
---|
| 766 | #direction = match[7] |
---|
| 767 | #hitlen = int(match[8]) |
---|
| 768 | #hitidentity = float(match[9]) |
---|
| 769 | |
---|
| 770 | if CEBUG : print match |
---|
| 771 | if CEBUG : print "Match with score:", score |
---|
| 772 | if CEBUG : print "Read before:\n", maskedseq |
---|
| 773 | maskedseq = mask_sequence(maskedseq, maskchar, leftreadhit-1, rightreadhit) |
---|
| 774 | if CEBUG : print "Masked seq:\n", maskedseq |
---|
| 775 | |
---|
| 776 | correctedseq = correct_for_smallhits(maskedseq, maskchar, linkername) |
---|
| 777 | |
---|
| 778 | if len(maskedseq) != len(correctedseq): |
---|
| 779 | raise RuntimeError("Internal error: maskedseq != correctedseq") |
---|
| 780 | |
---|
| 781 | partialhits = False |
---|
| 782 | if correctedseq != maskedseq: |
---|
| 783 | if CEBUG : print "Partial hits in", readname |
---|
| 784 | if CEBUG : print "Original seq:\n", seq |
---|
| 785 | if CEBUG : print "Masked seq:\n", maskedseq |
---|
| 786 | if CEBUG : print "Corrected seq\n", correctedseq |
---|
| 787 | partialhits = True |
---|
| 788 | readname += "_pl" |
---|
| 789 | maskedseq = correctedseq |
---|
| 790 | |
---|
| 791 | fragments = fragment_sequences(maskedseq, qual, maskchar) |
---|
| 792 | |
---|
| 793 | if CEBUG : print "Fragments (", len(fragments), "): ", fragments |
---|
| 794 | |
---|
| 795 | mlcflag = False |
---|
| 796 | #if len(ssahapematches[data['name']]) > 1: |
---|
| 797 | # #print "Multi linker contamination" |
---|
| 798 | # mlcflag = True |
---|
| 799 | # readname += "_mlc" |
---|
| 800 | |
---|
| 801 | if len(fragments) > 2: |
---|
| 802 | if CEBUG : print "Multi linker contamination" |
---|
| 803 | mlcflag = True |
---|
| 804 | readname += "_mlc" |
---|
| 805 | |
---|
| 806 | |
---|
| 807 | #print fragments |
---|
| 808 | if mlcflag or partialhits: |
---|
| 809 | fragcounter = 1 |
---|
| 810 | readname += ".part" |
---|
| 811 | for frag in fragments: |
---|
| 812 | actseq = frag[0] |
---|
| 813 | if len(actseq) >= 20: |
---|
| 814 | actqual = frag[1] |
---|
| 815 | oname = readname + str(fragcounter) |
---|
| 816 | #seq_fh.write(">"+oname+"\n") |
---|
| 817 | #seq_fh.write(actseq+"\n") |
---|
| 818 | #qual_fh.write(">"+oname+"\n") |
---|
| 819 | #qual_fh.write(' '.join((str(q) for q in actqual))) |
---|
| 820 | #qual_fh.write("\n") |
---|
| 821 | write_sequence(oname,actseq,actqual,seq_fh,qual_fh) |
---|
| 822 | to_print = [create_basic_xml_info(oname,sff_fh.name)] |
---|
| 823 | # No clipping in XML ... the multiple and partial fragments |
---|
| 824 | # are clipped "hard" |
---|
| 825 | # No template ID and trace_end: we don't know the |
---|
| 826 | # orientation of the frahments. Even if it were |
---|
| 827 | # only two, the fact we had multiple linkers |
---|
| 828 | # says something went wrong, so simply do not |
---|
| 829 | # write any paired-end information for all these fragments |
---|
| 830 | to_print.append(' </trace>\n') |
---|
| 831 | xml_fh.write(''.join(to_print)) |
---|
| 832 | numseqs += 1 |
---|
| 833 | fragcounter += 1 |
---|
| 834 | else: |
---|
| 835 | if len(fragments) >2: |
---|
| 836 | raise RuntimeError("Unexpected: more than two fragments detected in " + readname + ". please contact the authors.") |
---|
| 837 | # nothing will happen for 0 fragments |
---|
| 838 | if len(fragments) == 1: |
---|
| 839 | #print "Tada1" |
---|
| 840 | boundaries = calc_subseq_boundaries(maskedseq,maskchar) |
---|
| 841 | if len(boundaries) < 1 or len(boundaries) >3: |
---|
| 842 | raise RuntimeError("Unexpected case: ", str(len(boundaries)), "boundaries for 1 fragment of ", readname) |
---|
| 843 | if len(boundaries) == 3: |
---|
| 844 | # case: mask char on both sides of sequence |
---|
| 845 | #print "bounds3" |
---|
| 846 | data['clip_adapter_left']=1+boundaries[0][1] |
---|
| 847 | data['clip_adapter_right']=boundaries[2][0] |
---|
| 848 | elif len(boundaries) == 2: |
---|
| 849 | # case: mask char left or right of sequence |
---|
| 850 | #print "bounds2", |
---|
| 851 | if maskedseq[0] == maskchar : |
---|
| 852 | # case: mask char left |
---|
| 853 | #print "left" |
---|
| 854 | data['clip_adapter_left']=1+boundaries[0][1] |
---|
| 855 | else: |
---|
| 856 | # case: mask char right |
---|
| 857 | #print "right" |
---|
| 858 | data['clip_adapter_right']=boundaries[1][0] |
---|
| 859 | data['name'] = data['name'] + ".fn" |
---|
| 860 | write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh) |
---|
| 861 | numseqs = 1 |
---|
| 862 | elif len(fragments) == 2: |
---|
| 863 | #print "Tada2" |
---|
| 864 | oname = readname + ".r" |
---|
| 865 | seq, qual = get_read_data(data) |
---|
| 866 | |
---|
| 867 | startsearch = False |
---|
| 868 | for spos in range(len(maskedseq)): |
---|
| 869 | if maskedseq[spos] != maskchar: |
---|
| 870 | startsearch = True; |
---|
| 871 | else: |
---|
| 872 | if startsearch: |
---|
| 873 | break |
---|
| 874 | |
---|
| 875 | #print "\nspos: ", spos |
---|
| 876 | lseq=seq[:spos] |
---|
| 877 | #print "lseq:", lseq |
---|
| 878 | actseq = reverse_complement(lseq) |
---|
| 879 | lreadlen = len(actseq) |
---|
| 880 | lqual = qual[:spos]; |
---|
| 881 | # python hack to reverse a list/string/etc |
---|
| 882 | lqual = lqual[::-1]; |
---|
| 883 | |
---|
| 884 | #seq_fh.write(">"+oname+"\n") |
---|
| 885 | #seq_fh.write(actseq+"\n") |
---|
| 886 | #qual_fh.write(">"+oname+"\n") |
---|
| 887 | #qual_fh.write(' '.join((str(q) for q in lqual))) |
---|
| 888 | #qual_fh.write("\n") |
---|
| 889 | |
---|
| 890 | write_sequence(oname,actseq,lqual,seq_fh,qual_fh) |
---|
| 891 | |
---|
| 892 | to_print = [create_basic_xml_info(oname,sff_fh.name)] |
---|
| 893 | to_print.append(create_clip_xml_info(lreadlen, 0, lreadlen+1-data['clip_adapter_left'], 0, lreadlen+1-data['clip_qual_left'])); |
---|
| 894 | to_print.append(' <template_id>') |
---|
| 895 | to_print.append(readname) |
---|
| 896 | to_print.append('</template_id>\n') |
---|
| 897 | to_print.append(' <trace_end>r</trace_end>\n') |
---|
| 898 | to_print.append(' </trace>\n') |
---|
| 899 | xml_fh.write(''.join(to_print)) |
---|
| 900 | |
---|
| 901 | oname = readname + ".f" |
---|
| 902 | startsearch = False |
---|
| 903 | for spos in range(len(maskedseq)-1,-1,-1): |
---|
| 904 | if maskedseq[spos] != maskchar: |
---|
| 905 | startsearch = True; |
---|
| 906 | else: |
---|
| 907 | if startsearch: |
---|
| 908 | break |
---|
| 909 | |
---|
| 910 | actseq = seq[spos+1:] |
---|
| 911 | actqual = qual[spos+1:]; |
---|
| 912 | |
---|
| 913 | #print "\nspos: ", spos |
---|
| 914 | #print "rseq:", actseq |
---|
| 915 | |
---|
| 916 | #seq_fh.write(">"+oname+"\n") |
---|
| 917 | #seq_fh.write(actseq+"\n") |
---|
| 918 | #qual_fh.write(">"+oname+"\n") |
---|
| 919 | #qual_fh.write(' '.join((str(q) for q in actqual))) |
---|
| 920 | #qual_fh.write("\n") |
---|
| 921 | write_sequence(oname,actseq,actqual,seq_fh,qual_fh) |
---|
| 922 | |
---|
| 923 | rreadlen = len(actseq) |
---|
| 924 | to_print = [create_basic_xml_info(oname,sff_fh.name)] |
---|
| 925 | to_print.append(create_clip_xml_info(rreadlen, 0, rreadlen-(readlen-data['clip_adapter_right']), 0, rreadlen-(readlen-data['clip_qual_right']))); |
---|
| 926 | to_print.append(' <template_id>') |
---|
| 927 | to_print.append(readname) |
---|
| 928 | to_print.append('</template_id>\n') |
---|
| 929 | to_print.append(' <trace_end>f</trace_end>\n') |
---|
| 930 | to_print.append(' </trace>\n') |
---|
| 931 | xml_fh.write(''.join(to_print)) |
---|
| 932 | numseqs = 2 |
---|
| 933 | |
---|
| 934 | return numseqs |
---|
| 935 | |
---|
| 936 | |
---|
| 937 | |
---|
def extract_reads_from_sff(config, sff_files):
    '''Given the configuration and the list of sff_files it writes the seqs,
    qualities and ancillary data into the output file(s).

    config: dict with the command line options (see read_config()).
    sff_files: list of SFF file names to process.

    If file for paired-end linker was given, first extracts all sequences
    of an SFF and searches these against the linker(s) with SSAHA2 to
    create needed information to split reads.

    Raises RuntimeError when no SFF file was given or one of them is empty.
    '''

    # filled by read_ssaha_data(): read name -> list of linker matches
    global ssahapematches


    if len(sff_files) == 0 :
        raise RuntimeError("No SFF file given?")

    #we go through all input files
    # fail early: make sure every input file is non-empty and readable
    # before any output file is created or truncated
    for sff_file in sff_files:
        if not os.path.getsize(sff_file):
            raise RuntimeError('Empty file? : ' + sff_file)
        fh = open(sff_file, 'r')
        fh.close()

    openmode = 'w'
    if config['append']:
        openmode = 'a'

    seq_fh = open(config['seq_fname'], openmode)
    xml_fh = open(config['xml_fname'], openmode)
    if config['want_fastq']:
        # FASTQ embeds the qualities, so no separate quality file is
        # written; remove a stale one if present (a missing file is fine)
        qual_fh = None
        try:
            os.remove(config['qual_fname'])
        except :
            python_formattingwithoutbracesisdumb_dummy = 1
    else:
        qual_fh = open(config['qual_fname'], openmode)

    if not config['append']:
        xml_fh.write('<?xml version="1.0"?>\n<trace_volume>\n')
    else:
        # appending: re-open the existing XML by dropping its closing tag
        remove_last_xmltag_in_file(config['xml_fname'], "trace_volume")

    #we go through all input files
    for sff_file in sff_files:
        #print "Working on '" + sff_file + "':"
        ssahapematches.clear()

        # first 50 clipped bases of every read, for the sanity check below
        seqcheckstore = ([])

        debug = 0

        if not debug and config['pelinker_fname']:
            # pass 1: dump all reads as FASTA and run SSAHA2 against the
            # linker file to locate paired-end linker hits per read
            #print "Creating temporary sequences from reads in '" + sff_file + "' ... ",
            sys.stdout.flush()

            if 0 :
                # for debugging
                pid = os.getpid()
                tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta'
                tmpfasta_fh = open(tmpfasta_fname, 'w')
            else:
                # NOTE(review): relies on the 'tempfile' module being
                # imported at file level -- not visible in this chunk,
                # confirm against the import block
                tmpfasta_fh = tempfile.NamedTemporaryFile(prefix = 'sffeseqs_',
                                                          suffix = '.fasta')

            sff_fh = open(sff_file, 'rb')
            header_data = read_header(fileh=sff_fh)
            for seq_data in sequences(fileh=sff_fh, header=header_data):
                seq,qual = get_read_data(seq_data)
                seqstring, qualstring = format_as_fasta(seq_data['name'],seq,qual)
                tmpfasta_fh.write(seqstring)
                #seq, qual, anci = extract_read_info(seq_data, sff_fh.name)
                #tmpfasta_fh.write(seq)
            #print "done."
            tmpfasta_fh.seek(0)

            if 0 :
                # for debugging
                tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2'
                tmpssaha_fh = open(tmpssaha_fname, 'w+')
            else:
                tmpssaha_fh = tempfile.NamedTemporaryFile(prefix = 'sffealig_',
                                                          suffix = '.ssaha2')

            launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh)
            tmpfasta_fh.close()

            # parse the SSAHA2 hits into the global ssahapematches dict
            tmpssaha_fh.seek(0)
            read_ssaha_data(tmpssaha_fh)
            tmpssaha_fh.close()

        if debug:
            tmpssaha_fh = open("sffe.tmp.10634.ssaha2", 'r')
            read_ssaha_data(tmpssaha_fh)

        # pass 2: convert all reads, splitting those that had linker hits
        #print "Converting '" + sff_file + "' ... ",
        sys.stdout.flush()
        sff_fh = open(sff_file, 'rb')
        #read_header(infile)
        header_data = read_header(fileh=sff_fh)

        #now convert all reads
        nseqs_sff = 0
        nseqs_out = 0
        for seq_data in sequences(fileh=sff_fh, header=header_data):
            nseqs_sff += 1

            seq, qual = clip_read(seq_data)
            seqcheckstore.append(seq[0:50])

            #if nseqs_sff >1000:
            #    check_for_dubious_startseq(seqcheckstore,sff_file,seq_data)
            #    sys.exit()

            if ssahapematches.has_key(seq_data['name']):
                # this read contains linker hits: split into paired ends
                #print "Paired end:",seq_data['name']
                nseqs_out += split_paired_end(seq_data, sff_fh, seq_fh, qual_fh, xml_fh)
            else:
                #print "Normal:",seq_data['name']
                if config['pelinker_fname']:
                    # ".fn" marks reads in which no linker was found
                    seq_data['name'] = seq_data['name'] + ".fn"
                write_unpaired_read(seq_data, sff_fh, seq_fh, qual_fh, xml_fh)
                nseqs_out += 1
        #print "done."
        #print 'Converted', str(nseqs_sff), 'reads into', str(nseqs_out), 'sequences.'
        sff_fh.close()

        # warn loudly if suspiciously many reads share the same start bases
        check_for_dubious_startseq(seqcheckstore,sff_file,seq_data)
        seqcheckstore = ([])

    xml_fh.write('</trace_volume>\n')

    xml_fh.close()
    seq_fh.close()
    if qual_fh is not None:
        qual_fh.close()

    return
---|
| 1075 | |
---|
| 1076 | def check_for_dubious_startseq(seqcheckstore, sffname,seqdata): |
---|
| 1077 | |
---|
| 1078 | global stern_warning |
---|
| 1079 | |
---|
| 1080 | foundproblem = "" |
---|
| 1081 | for checklen in range(1,len(seqcheckstore[0])): |
---|
| 1082 | foundinloop = False |
---|
| 1083 | seqdict = {} |
---|
| 1084 | for seq in seqcheckstore: |
---|
| 1085 | shortseq = seq[0:checklen] |
---|
| 1086 | if shortseq in seqdict: |
---|
| 1087 | seqdict[shortseq] += 1 |
---|
| 1088 | else: |
---|
| 1089 | seqdict[shortseq] = 1 |
---|
| 1090 | |
---|
| 1091 | for shortseq, count in seqdict.items(): |
---|
| 1092 | if float(count)/len(seqcheckstore) >= 0.5: |
---|
| 1093 | foundinloop = True |
---|
| 1094 | stern_warning |
---|
| 1095 | foundproblem = "\n"+"*" * 80 |
---|
| 1096 | foundproblem += "\nWARNING: " |
---|
| 1097 | foundproblem += "weird sequences in file " + sffname + "\n\n" |
---|
| 1098 | foundproblem += "After applying left clips, " + str(count) + " sequences (=" |
---|
| 1099 | foundproblem += '%.0f'%(100.0*float(count)/len(seqcheckstore)) |
---|
| 1100 | foundproblem += "%) start with these bases:\n" + shortseq |
---|
| 1101 | foundproblem += "\n\nThis does not look sane.\n\n" |
---|
| 1102 | foundproblem += "Countermeasures you *probably* must take:\n" |
---|
| 1103 | foundproblem += "1) Make your sequence provider aware of that problem and ask whether this can be\n corrected in the SFF.\n" |
---|
| 1104 | foundproblem += "2) If you decide that this is not normal and your sequence provider does not\n react, use the --min_left_clip of sff_extract.\n" |
---|
| 1105 | left,right = return_merged_clips(seqdata) |
---|
| 1106 | foundproblem += " (Probably '--min_left_clip="+ str(left+len(shortseq))+"' but you should cross-check that)\n" |
---|
| 1107 | foundproblem += "*" * 80 + "\n" |
---|
| 1108 | if not foundinloop : |
---|
| 1109 | break |
---|
| 1110 | if len(foundproblem): |
---|
| 1111 | print foundproblem |
---|
| 1112 | |
---|
| 1113 | |
---|
def parse_extra_info(info):
    '''It parses the information that will go in the xml file.

    There are two formats accepted for the extra information:
        key1:value1, key2:value2
    or:
        file1.sff{key1:value1, key2:value2};file2.sff{key3:value3}
    '''
    if not info:
        return info
    data_for_files = {}
    # every ';'-separated chunk holds the extra info for one file
    for chunk in info.split(';'):
        # split the optional file name off the key:value list
        pieces = chunk.split('{')
        if len(pieces) == 1:
            # no explicit file name given: use the generic placeholder
            fname, pairs = fake_sff_name, pieces[0]
        else:
            fname, pairs = pieces[0], pieces[1]
        # drop the closing brace and collect the key:value pairs
        pairs = pairs.replace('}', '')
        file_data = {}
        for pair in pairs.split(','):
            key, value = pair.strip().split(':')
            file_data[key.strip()] = value.strip()
        data_for_files[fname] = file_data
    return data_for_files
---|
| 1145 | |
---|
| 1146 | |
---|
def return_merged_clips(data):
    '''It returns the left and right positions to clip.

    data: read data dict with the four SFF clip values and
          'number_of_bases'.

    The merged left clip is the larger of the adapter and quality left
    clips, the merged right clip the smaller of the two right clips --
    always ignoring values of zero, which mean "no clip stored" in SFF.
    When no clip is stored at all, the whole read (1, number_of_bases)
    is returned. Positions are 1-based and inclusive.
    '''
    # the helpers were renamed from max/min: shadowing the builtins made
    # the non-standard "ignore zeros" semantics easy to misread
    def _max_nonzero(a, b):
        '''Max of a and b, treating 0/None as "no value given".'''
        if not a and not b:
            return None
        if not a:
            return b
        if not b:
            return a
        return a if a >= b else b

    def _min_nonzero(a, b):
        '''Min of a and b, treating 0/None as "no value given".'''
        if not a and not b:
            return None
        if not a:
            return b
        if not b:
            return a
        return a if a <= b else b

    left = _max_nonzero(data['clip_adapter_left'], data['clip_qual_left'])
    right = _min_nonzero(data['clip_adapter_right'], data['clip_qual_right'])
    #maybe both clips where zero
    if left is None:
        left = 1
    if right is None:
        right = data['number_of_bases']
    return left, right
---|
| 1187 | |
---|
def sequence_case(data):
    '''Given the data for one read it returns the seq with mixed case.

    The regions to be clipped will be lower case and the rest upper case.
    '''
    left, right = return_merged_clips(data)
    bases = data['bases']
    # lower-case both clipped ends, keep the good middle part unchanged;
    # the clip positions are 1-based and inclusive
    head = bases[:left - 1].lower()
    middle = bases[left - 1:right]
    tail = bases[right:].lower()
    return head + middle + tail
---|
| 1197 | |
---|
def clip_read(data):
    '''Given the data for one read it returns clipped seq and qual.

    data: read data dict with 'bases', 'quality_scores' and the clip values.

    The clip positions come from return_merged_clips() and are 1-based
    and inclusive; they are converted into a python slice here.
    '''
    # the redundant second fetch of data['quality_scores'] was removed
    left, right = return_merged_clips(data)
    seq = data['bases']
    qual = data['quality_scores']
    new_seq = seq[left-1:right]
    new_qual = qual[left-1:right]

    return new_seq, new_qual
---|
| 1209 | |
---|
| 1210 | |
---|
| 1211 | |
---|
def tests_for_ssaha(linker_fname):
    '''Tests whether SSAHA2 can be successfully called.

    linker_fname: unused here; kept for call-site compatibility.

    Raises RuntimeError when launching "ssaha2 -v" fails (typically an
    OSError because the binary is not installed or not in the PATH).
    '''

    try:
        print "Testing whether SSAHA2 is installed and can be launched ... ",
        sys.stdout.flush()
        # discard ssaha2's version output; we only care whether it starts.
        # NOTE(review): retcode is never checked -- only a failure to
        # launch (exception) is detected, not a non-zero exit status
        fh = open('/dev/null', 'w')
        retcode = subprocess.call(["ssaha2", "-v"], stdout = fh)
        fh.close()
        print "ok."
    except :
        print "nope? Uh oh ...\n\n"
        raise RuntimeError('Could not launch ssaha2. Have you installed it? Is it in your path?')
---|
| 1225 | |
---|
| 1226 | |
---|
def load_linker_sequences(linker_fname):
    '''Loads all linker sequences into memory, storing only the length
    of each linker.

    linker_fname: FASTA file with the paired-end linker sequences.

    Fills the global linkerlengths dict (linker name -> sequence length).
    Raises RuntimeError on an empty file, a file without sequences, or
    duplicated linker names.
    '''

    global linkerlengths

    if not os.path.getsize(linker_fname):
        raise RuntimeError("File empty? '" + linker_fname + "'")
    fh = open(linker_fname, 'r')
    try:
        linkerseqs = read_fasta(fh)
        if len(linkerseqs) == 0:
            raise RuntimeError(linker_fname + ": no sequence found?")
        for i in linkerseqs:
            # 'in' instead of the deprecated dict.has_key()
            if i.name in linkerlengths:
                raise RuntimeError(linker_fname + ": sequence '" + i.name + "' present multiple times. Aborting.")
            linkerlengths[i.name] = len(i.sequence)
    finally:
        # close the file even when one of the checks above raises;
        # previously the handle leaked on error
        fh.close()
---|
| 1244 | |
---|
| 1245 | |
---|
def launch_ssaha(linker_fname, query_fname, output_fh):
    '''Launches SSAHA2 on the linker and query file, string SSAHA2 output
    into the output filehandle.

    linker_fname: FASTA file with the linker sequences (SSAHA2 subject).
    query_fname: FASTA file with the read sequences to scan.
    output_fh: open file handle that receives SSAHA2's stdout.

    Raises RuntimeError when SSAHA2 cannot be launched or exits with a
    non-zero status.
    '''

    try:
        print "Searching linker sequences with SSAHA2 (this may take a while) ... ",
        sys.stdout.flush()
        # small kmer/skip so even short linkers are found reliably
        retcode = subprocess.call(["ssaha2", "-output", "ssaha2", "-solexa", "-kmer", "4", "-skip", "1", linker_fname, query_fname], stdout = output_fh)
        if retcode:
            # non-zero exit status: converted into the RuntimeError below
            raise RuntimeError('Ups.')
        else:
            print "ok."
    except:
        print "\n"
        raise RuntimeError('An error occured during the SSAHA2 execution, aborting.')
---|
| 1261 | |
---|
def read_ssaha_data(ssahadata_fh):
    '''Given file handle, reads file generated with SSAHA2 (with default
    output format) and stores all matches as list ssahapematches
    (ssaha paired-end matches) dictionary.

    ssahadata_fh: open file handle with SSAHA2 "ssaha2" format output.

    Only 'ALIGNMENT' lines are parsed; each one is expected to have 12
    whitespace-separated fields. Matches are grouped by read name
    (field 3) in the global ssahapematches dict.
    '''

    global ssahapematches

    print "Parsing SSAHA2 result file ... ",
    sys.stdout.flush()

    for line in ssahadata_fh:
        if line.startswith('ALIGNMENT'):
            ml = line.split()
            if len(ml) != 12 :
                print "\n", line,
                raise RuntimeError('Expected 12 elements in the SSAHA2 line with ALIGMENT keyword, but found ' + str(len(ml)))
            if not ssahapematches.has_key(ml[2]) :
                ssahapematches[ml[2]] = ([])
            if ml[8] == 'F':
                #print line,

                # store everything except the first element (output
                #  format name (ALIGNMENT)) and the last element
                #  (length)
                ssahapematches[ml[2]].append(ml[1:-1])
            else:
                #print ml
                # reverse hit: swap read start/stop so the stored
                # coordinates always run in forward direction
                ml[4],ml[5] = ml[5],ml[4]
                #print ml
                ssahapematches[ml[2]].append(ml[1:-1])

    print "done."
---|
| 1294 | |
---|
| 1295 | |
---|
| 1296 | ########################################################################## |
---|
| 1297 | # |
---|
| 1298 | # BaCh: This block was shamelessly copied from |
---|
| 1299 | # http://python.genedrift.org/2007/07/04/reading-fasta-files-conclusion/ |
---|
| 1300 | # and then subsequently modified to read fasta correctly |
---|
| 1301 | # It's still not fool proof, but should be good enough |
---|
| 1302 | # |
---|
| 1303 | ########################################################################## |
---|
| 1304 | |
---|
class Fasta:
    '''A minimal container for one FASTA entry: a name and its sequence.'''
    def __init__(self, name, sequence):
        # name: the sequence identifier (text after '>' up to whitespace)
        self.name = name
        # sequence: the concatenated sequence lines, no whitespace
        self.sequence = sequence

    def __repr__(self):
        # added for debuggability; instances used to print as bare objects
        return 'Fasta(%r, %r)' % (self.name, self.sequence)
---|
| 1309 | |
---|
def read_fasta(file):
    '''Parses FASTA entries from an open file handle.

    Returns a list of Fasta objects; raises RuntimeError on a header
    without a name or on sequence data appearing before any header.
    '''
    entries = []
    current = Fasta('', '')
    lineno = 0
    for line in file:
        lineno += 1
        if not line.startswith(">"):
            # sequence data is only legal after a header line
            if len(current.name) == 0:
                raise RuntimeError(file.name + ': no sequence header at line ' + str(lineno) + '?')
            current.sequence += line.strip()
            continue
        # header line: flush the previous entry if it collected sequence
        if len(current.sequence):
            entries.append(current)
            current = Fasta('', '')
        # name == all characters until the first whitespace
        #  (split()[0]) but without the starting ">" ([1:])
        current.name = line.split()[0][1:]
        current.sequence = ''
        if len(current.name) == 0:
            raise RuntimeError(file.name + ': no name in line ' + str(lineno) + '?')

    # keep the last entry if it is complete
    if len(current.name) and len(current.sequence):
        entries.append(current)

    return entries
---|
| 1336 | ########################################################################## |
---|
| 1337 | |
---|
def version_string():
    '''Return the program name and version as one string.'''
    return "sff_extract " + __version__
---|
| 1340 | |
---|
def read_config():
    '''It reads the configuration options from the command line arguments and
    it returns a dict with them.

    Returns (config, args): config is a dict with one entry per option
    (also stored in the module-level global 'config'), args is the list
    of remaining positional arguments (the SFF files).

    Default output file names are derived from the last command line
    argument when it ends in '.sff', otherwise from the basename 'reads'.
    '''
    from optparse import OptionParser, OptionGroup
    usage = "usage: %prog [options] sff1 sff2 ..."
    desc = "Extract sequences from 454 SFF files into FASTA, FASTA quality"\
        " and XML traceinfo format. When a paired-end linker sequence"\
        " is given (-l), use SSAHA2 to scan the sequences for the linker,"\
        " then split the sequences, removing the linker."
    parser = OptionParser(usage = usage, version = version_string(), description = desc)
    parser.add_option('-a', '--append', action="store_true", dest='append',
            help='append output to existing files', default=False)
    parser.add_option('-i', '--xml_info', dest='xml_info',
            help='extra info to write in the xml file')
    parser.add_option("-l", "--linker_file", dest="pelinker_fname",
            help="FASTA file with paired-end linker sequences", metavar="FILE")

    # NOTE(review): this first group holds output-content options but its
    # title says "File name options", same as the second group below --
    # looks like a copy/paste slip in the group title
    group = OptionGroup(parser, "File name options","")
    group.add_option('-c', '--clip', action="store_true", dest='clip',
            help='clip (completely remove) ends with low qual and/or adaptor sequence', default=False)
    group.add_option('-u', '--upper_case', action="store_false", dest='mix_case',
            help='all bases in upper case, including clipped ends', default=True)
    group.add_option('', '--min_left_clip', dest='min_leftclip',
            metavar="INTEGER", type = "int",
            help='if the left clip coming from the SFF is smaller than this value, override it', default=0)
    group.add_option('-Q', '--fastq', action="store_true", dest='want_fastq',
            help='store as FASTQ file instead of FASTA + FASTA quality file', default=False)
    parser.add_option_group(group)

    group = OptionGroup(parser, "File name options","")
    group.add_option("-o", "--out_basename", dest="basename",
            help="base name for all output files")
    group.add_option("-s", "--seq_file", dest="seq_fname",
            help="output sequence file name", metavar="FILE")
    group.add_option("-q", "--qual_file", dest="qual_fname",
            help="output quality file name", metavar="FILE")
    group.add_option("-x", "--xml_file", dest="xml_fname",
            help="output ancillary xml file name", metavar="FILE")
    parser.add_option_group(group)

    #default fnames
    #is there an sff file?
    basename = 'reads'
    if sys.argv[-1][-4:].lower() == '.sff':
        basename = sys.argv[-1][:-4]
    def_seq_fname = basename + '.fasta'
    def_qual_fname = basename + '.fasta.qual'
    def_xml_fname = basename + '.xml'
    def_pelinker_fname = ''
    parser.set_defaults(seq_fname = def_seq_fname)
    parser.set_defaults(qual_fname = def_qual_fname)
    parser.set_defaults(xml_fname = def_xml_fname)
    parser.set_defaults(pelinker_fname = def_pelinker_fname)

    #we parse the cmd line
    (options, args) = parser.parse_args()

    #we put the result in a dict
    global config
    config = {}
    # copy every option attribute, skipping optparse's own methods and
    # private names
    for property in dir(options):
        if property[0] == '_' or property in ('ensure_value', 'read_file',
                'read_module'):
            continue
        config[property] = getattr(options, property)

    if config['basename'] is None:
        config['basename']=basename

    #if we have not set a file name with -s, -q or -x we set the basename
    #based file name
    if config['want_fastq']:
        # FASTQ mode: qualities go into the sequence file itself
        config['qual_fname'] = ''
        if config['seq_fname'] == def_seq_fname:
            config['seq_fname'] = config['basename'] + '.fastq'
    else:
        if config['seq_fname'] == def_seq_fname:
            config['seq_fname'] = config['basename'] + '.fasta'
        if config['qual_fname'] == def_qual_fname:
            config['qual_fname'] = config['basename'] + '.fasta.qual'

    if config['xml_fname'] == def_xml_fname:
        config['xml_fname'] = config['basename'] + '.xml'

    #we parse the extra info for the xml file
    config['xml_info'] = parse_extra_info(config['xml_info'])
    return config, args
---|
| 1428 | |
---|
| 1429 | |
---|
| 1430 | |
---|
| 1431 | ########################################################################## |
---|
| 1432 | |
---|
| 1433 | |
---|
| 1434 | def testsome(): |
---|
| 1435 | sys.exit() |
---|
| 1436 | return |
---|
| 1437 | |
---|
| 1438 | |
---|
def debug():
    '''Ad-hoc developer debugging routine.

    Uses hardcoded file names and a hardcoded PID from an earlier run;
    always exits the process before reaching extract_reads_from_sff().
    Not called from main() (the call there is commented out).
    '''
    try:
        dummy = 1
        #debug()
        #testsome()

        config, args = read_config()
        load_linker_sequences(config['pelinker_fname'])

        #pid = os.getpid()
        # hardcoded PID of an earlier run whose temp files are reused
        pid = 15603

        #tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta'
        #tmpfasta_fh = open(tmpfasta_fname, 'w')
        tmpfasta_fname = 'FLVI58L05.fa'
        tmpfasta_fh = open(tmpfasta_fname, 'r')

        tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2'
        tmpssaha_fh = open(tmpssaha_fname, 'w')

        launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh)

        # re-open the SSAHA2 result read-only and parse it
        tmpssaha_fh = open("sffe.tmp.15603.ssaha2", 'r')
        read_ssaha_data(tmpssaha_fh)

        sys.exit()

        # unreachable: kept for quickly re-enabling a full run while debugging
        extract_reads_from_sff(config, args)

    except (OSError, IOError, RuntimeError), errval:
        print errval
        sys.exit()

    sys.exit()
---|
| 1473 | |
---|
| 1474 | |
---|
def main():
    '''Entry point: parse options, optionally load linkers, extract reads.

    Returns 0 on success, 1 on error or when a stern warning about
    dubious read start sequences was raised during extraction.
    '''

    argv = sys.argv
    if len(argv) == 1:
        # no arguments at all: show the help text and quit
        sys.argv.append('-h')
        read_config()
        sys.exit()
    try:
        #debug();

        config, args = read_config()

        if config['pelinker_fname']:
            #tests_for_ssaha(config['pelinker_fname'])
            load_linker_sequences(config['pelinker_fname'])
        if len(args) == 0:
            raise RuntimeError("No SFF file given?")
        extract_reads_from_sff(config, args)
    except (OSError, IOError, RuntimeError), errval:
        print errval
        return 1

    # set by check_for_dubious_startseq() when reads look suspicious
    if stern_warning:
        return 1

    return 0
---|
| 1501 | |
---|
| 1502 | |
---|
| 1503 | |
---|
# script entry point: the process exit status is main()'s return value
if __name__ == "__main__":
    sys.exit(main())
---|