1 | #!/usr/bin/python |
---|
2 | '''This software extracts the seq, qual and ancillary information from an sff |
---|
3 | file, like the ones used by the 454 sequencer. |
---|
4 | |
---|
5 | Optionally, it can also split paired-end reads if given the linker sequence. |
---|
6 | The splitting is done with maximum match, i.e., every occurence of the linker |
---|
7 | sequence will be removed, even if occuring multiple times.''' |
---|
8 | |
---|
9 | #copyright Jose Blanca and Bastien Chevreux |
---|
10 | #COMAV institute, Universidad Politecnica de Valencia (UPV) |
---|
11 | #Valencia, Spain |
---|
12 | |
---|
13 | # additions to handle paired end reads by Bastien Chevreux |
---|
14 | # bugfixes for linker specific lengths: Lionel Guy |
---|
15 | |
---|
16 | #This program is free software: you can redistribute it and/or modify |
---|
17 | #it under the terms of the GNU General Public License as published by |
---|
18 | #the Free Software Foundation, either version 3 of the License, or |
---|
19 | #(at your option) any later version. |
---|
20 | #This program is distributed in the hope that it will be useful, |
---|
21 | #but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
22 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
23 | #GNU General Public License for more details. |
---|
24 | #You should have received a copy of the GNU General Public License |
---|
25 | #along with this program. If not, see <http://www.gnu.org/licenses/>. |
---|
26 | |
---|
27 | __author__ = 'Jose Blanca and Bastien Chevreux' |
---|
28 | __copyright__ = 'Copyright 2008, Jose Blanca, COMAV, and Bastien Chevreux' |
---|
29 | __license__ = 'GPLv3 or later' |
---|
30 | __version__ = '0.2.8' |
---|
31 | __email__ = 'jblanca@btc.upv.es' |
---|
32 | __status__ = 'beta' |
---|
33 | |
---|
34 | import struct |
---|
35 | import sys |
---|
36 | import os |
---|
37 | import subprocess |
---|
38 | import tempfile |
---|
39 | |
---|
40 | |
---|
41 | fake_sff_name = 'fake_sff_name' |
---|
42 | |
---|
43 | |
---|
44 | # readname as key: lines with matches from SSAHA, one best match |
---|
45 | ssahapematches = {} |
---|
46 | # linker readname as key: length of linker sequence |
---|
47 | linkerlengths = {} |
---|
48 | |
---|
49 | # set to true if something really fishy is going on with the sequences |
---|
50 | stern_warning = True |
---|
51 | |
---|
52 | def read_bin_fragment(struct_def, fileh, offset=0, data=None, |
---|
53 | byte_padding=None): |
---|
54 | '''It reads a chunk of a binary file. |
---|
55 | |
---|
56 | You have to provide the struct, a file object, the offset (where to start |
---|
57 | reading). |
---|
58 | Also you can provide an optional dict that will be populated with the |
---|
59 | extracted data. |
---|
60 | If a byte_padding is given the number of bytes read will be a multiple of |
---|
61 | that number, adding the required pad at the end. |
---|
62 | It returns the number of bytes reads and the data dict. |
---|
63 | ''' |
---|
64 | if data is None: |
---|
65 | data = {} |
---|
66 | |
---|
67 | #we read each item |
---|
68 | bytes_read = 0 |
---|
69 | for item in struct_def: |
---|
70 | #we go to the place and read |
---|
71 | fileh.seek(offset + bytes_read) |
---|
72 | n_bytes = struct.calcsize(item[1]) |
---|
73 | buffer = fileh.read(n_bytes) |
---|
74 | read = struct.unpack('>' + item[1], buffer) |
---|
75 | if len(read) == 1: |
---|
76 | read = read[0] |
---|
77 | data[item[0]] = read |
---|
78 | bytes_read += n_bytes |
---|
79 | |
---|
80 | #if there is byte_padding the bytes_to_read should be a multiple of the |
---|
81 | #byte_padding |
---|
82 | if byte_padding is not None: |
---|
83 | pad = byte_padding |
---|
84 | bytes_read = ((bytes_read + pad - 1) // pad) * pad |
---|
85 | |
---|
86 | return (bytes_read, data) |
---|
87 | |
---|
88 | |
---|
89 | def check_magic(magic): |
---|
90 | '''It checks that the magic number of the file matches the sff magic.''' |
---|
91 | if magic != 779314790: |
---|
92 | raise RuntimeError('This file does not seems to be an sff file.') |
---|
93 | |
---|
94 | def check_version(version): |
---|
95 | '''It checks that the version is supported, otherwise it raises an error.''' |
---|
96 | supported = ('\x00', '\x00', '\x00', '\x01') |
---|
97 | i = 0 |
---|
98 | for item in version: |
---|
99 | if version[i] != supported[i]: |
---|
100 | raise RuntimeError('SFF version not supported. Please contact the author of the software.') |
---|
101 | i += 1 |
---|
102 | |
---|
103 | def read_header(fileh): |
---|
104 | '''It reads the header from the sff file and returns a dict with the |
---|
105 | information''' |
---|
106 | #first we read the first part of the header |
---|
107 | head_struct = [ |
---|
108 | ('magic_number', 'I'), |
---|
109 | ('version', 'cccc'), |
---|
110 | ('index_offset', 'Q'), |
---|
111 | ('index_length', 'I'), |
---|
112 | ('number_of_reads', 'I'), |
---|
113 | ('header_length', 'H'), |
---|
114 | ('key_length', 'H'), |
---|
115 | ('number_of_flows_per_read', 'H'), |
---|
116 | ('flowgram_format_code', 'B'), |
---|
117 | ] |
---|
118 | data = {} |
---|
119 | first_bytes, data = read_bin_fragment(struct_def=head_struct, fileh=fileh, |
---|
120 | offset=0, data=data) |
---|
121 | check_magic(data['magic_number']) |
---|
122 | check_version(data['version']) |
---|
123 | #now that we know the number_of_flows_per_read and the key_length |
---|
124 | #we can read the second part of the header |
---|
125 | struct2 = [ |
---|
126 | ('flow_chars', str(data['number_of_flows_per_read']) + 'c'), |
---|
127 | ('key_sequence', str(data['key_length']) + 'c') |
---|
128 | ] |
---|
129 | read_bin_fragment(struct_def=struct2, fileh=fileh, offset=first_bytes, |
---|
130 | data=data) |
---|
131 | return data |
---|
132 | |
---|
133 | |
---|
134 | def read_sequence(header, fileh, fposition): |
---|
135 | '''It reads one read from the sff file located at the fposition and |
---|
136 | returns a dict with the information.''' |
---|
137 | header_length = header['header_length'] |
---|
138 | index_offset = header['index_offset'] |
---|
139 | index_length = header['index_length'] |
---|
140 | |
---|
141 | #the sequence struct |
---|
142 | read_header_1 = [ |
---|
143 | ('read_header_length', 'H'), |
---|
144 | ('name_length', 'H'), |
---|
145 | ('number_of_bases', 'I'), |
---|
146 | ('clip_qual_left', 'H'), |
---|
147 | ('clip_qual_right', 'H'), |
---|
148 | ('clip_adapter_left', 'H'), |
---|
149 | ('clip_adapter_right', 'H'), |
---|
150 | ] |
---|
151 | def read_header_2(name_length): |
---|
152 | '''It returns the struct definition for the second part of the header''' |
---|
153 | return [('name', str(name_length) +'c')] |
---|
154 | def read_data(number_of_bases): |
---|
155 | '''It returns the struct definition for the read data section.''' |
---|
156 | #size = {'c': 1, 'B':1, 'H':2, 'I':4, 'Q':8} |
---|
157 | if header['flowgram_format_code'] == 1: |
---|
158 | flow_type = 'H' |
---|
159 | else: |
---|
160 | raise Error('file version not supported') |
---|
161 | number_of_bases = str(number_of_bases) |
---|
162 | return [ |
---|
163 | ('flowgram_values', str(header['number_of_flows_per_read']) + |
---|
164 | flow_type), |
---|
165 | ('flow_index_per_base', number_of_bases + 'B'), |
---|
166 | ('bases', number_of_bases + 'c'), |
---|
167 | ('quality_scores', number_of_bases + 'B'), |
---|
168 | ] |
---|
169 | |
---|
170 | data = {} |
---|
171 | #we read the first part of the header |
---|
172 | bytes_read, data = read_bin_fragment(struct_def=read_header_1, |
---|
173 | fileh=fileh, offset=fposition, data=data) |
---|
174 | |
---|
175 | read_bin_fragment(struct_def=read_header_2(data['name_length']), |
---|
176 | fileh=fileh, offset=fposition + bytes_read, data=data) |
---|
177 | #we join the letters of the name |
---|
178 | data['name'] = ''.join(data['name']) |
---|
179 | offset = data['read_header_length'] |
---|
180 | #we read the sequence and the quality |
---|
181 | read_data_st = read_data(data['number_of_bases']) |
---|
182 | bytes_read, data = read_bin_fragment(struct_def=read_data_st, |
---|
183 | fileh=fileh, offset=fposition + offset, |
---|
184 | data=data, byte_padding=8) |
---|
185 | #we join the bases |
---|
186 | data['bases'] = ''.join(data['bases']) |
---|
187 | |
---|
188 | #print data |
---|
189 | #print "pre cqr: ", data['clip_qual_right'] |
---|
190 | #print "pre car: ", data['clip_adapter_right'] |
---|
191 | #print "pre cql: ", data['clip_qual_left'] |
---|
192 | #print "pre cal: ", data['clip_adapter_left'] |
---|
193 | |
---|
194 | # correct for the case the right clip is <= than the left clip |
---|
195 | # in this case, left clip is 0 are set to 0 (right clip == 0 means |
---|
196 | # "whole sequence") |
---|
197 | if data['clip_qual_right'] <= data['clip_qual_left'] : |
---|
198 | data['clip_qual_right'] = 0 |
---|
199 | data['clip_qual_left'] = 0 |
---|
200 | if data['clip_adapter_right'] <= data['clip_adapter_left'] : |
---|
201 | data['clip_adapter_right'] = 0 |
---|
202 | data['clip_adapter_left'] = 0 |
---|
203 | |
---|
204 | #the clipping section follows the NCBI's guidelines Trace Archive RFC |
---|
205 | #http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=rfc&m=doc&s=rfc |
---|
206 | #if there's no adapter clip: qual -> vector |
---|
207 | #else: qual-> qual |
---|
208 | # adapter -> vector |
---|
209 | |
---|
210 | if not data['clip_adapter_left']: |
---|
211 | data['clip_adapter_left'], data['clip_qual_left'] = data['clip_qual_left'], data['clip_adapter_left'] |
---|
212 | if not data['clip_adapter_right']: |
---|
213 | data['clip_adapter_right'], data['clip_qual_right'] = data['clip_qual_right'], data['clip_adapter_right'] |
---|
214 | |
---|
215 | # see whether we have to override the minimum left clips |
---|
216 | if config['min_leftclip'] > 0: |
---|
217 | if data['clip_adapter_left'] >0 and data['clip_adapter_left'] < config['min_leftclip']: |
---|
218 | data['clip_adapter_left'] = config['min_leftclip'] |
---|
219 | if data['clip_qual_left'] >0 and data['clip_qual_left'] < config['min_leftclip']: |
---|
220 | data['clip_qual_left'] = config['min_leftclip'] |
---|
221 | |
---|
222 | |
---|
223 | #print "post cqr: ", data['clip_qual_right'] |
---|
224 | #print "post car: ", data['clip_adapter_right'] |
---|
225 | #print "post cql: ", data['clip_qual_left'] |
---|
226 | #print "post cal: ", data['clip_adapter_left'] |
---|
227 | |
---|
228 | |
---|
229 | # for handling the -c (clip) option gently, we already clip here |
---|
230 | # and set all clip points to the sequence end points |
---|
231 | if config['clip']: |
---|
232 | data['bases'], data['quality_scores'] = clip_read(data) |
---|
233 | |
---|
234 | data['number_of_bases']=len(data['bases']) |
---|
235 | data['clip_qual_right'] = data['number_of_bases'] |
---|
236 | data['clip_adapter_right'] = data['number_of_bases'] |
---|
237 | data['clip_qual_left'] = 0 |
---|
238 | data['clip_adapter_left'] = 0 |
---|
239 | |
---|
240 | return data['read_header_length'] + bytes_read, data |
---|
241 | |
---|
242 | |
---|
243 | def sequences(fileh, header): |
---|
244 | '''It returns a generator with the data for each read.''' |
---|
245 | #now we can read all the sequences |
---|
246 | fposition = header['header_length'] #position in the file |
---|
247 | reads_read = 0 |
---|
248 | while True: |
---|
249 | if fposition == header['index_offset']: |
---|
250 | #we have to skip the index section |
---|
251 | fposition += index_length |
---|
252 | continue |
---|
253 | else: |
---|
254 | bytes_read, seq_data = read_sequence(header=header, fileh=fileh, |
---|
255 | fposition=fposition) |
---|
256 | yield seq_data |
---|
257 | fposition += bytes_read |
---|
258 | reads_read += 1 |
---|
259 | if reads_read >= header['number_of_reads']: |
---|
260 | break |
---|
261 | |
---|
262 | |
---|
263 | def remove_last_xmltag_in_file(fname, tag=None): |
---|
264 | '''Given an xml file name and a tag, it removes the last tag of the |
---|
265 | file if it matches the given tag. Tag removal is performed via file |
---|
266 | truncation. |
---|
267 | |
---|
268 | It the given tag is not the last in the file, a RunTimeError will be |
---|
269 | raised. |
---|
270 | |
---|
271 | The resulting xml file will be not xml valid. This function is a hack |
---|
272 | that allows to append records to xml files in a quick and dirty way. |
---|
273 | ''' |
---|
274 | |
---|
275 | fh = open(fname, 'r+') |
---|
276 | #we have to read from the end to the start of the file and keep the |
---|
277 | #string enclosed by </ > |
---|
278 | i = -1 |
---|
279 | last_tag = [] #the chars that form the last tag |
---|
280 | start_offset = None #in which byte does the last tag starts? |
---|
281 | end_offset = None #in which byte does the last tag ends? |
---|
282 | while True: |
---|
283 | fh.seek(i, 2) |
---|
284 | char = fh.read(1) |
---|
285 | if not char.isspace(): |
---|
286 | last_tag.append(char) |
---|
287 | if char == '>': |
---|
288 | end_offset = i |
---|
289 | if char == '<': |
---|
290 | start_offset = i |
---|
291 | break |
---|
292 | i -= 1 |
---|
293 | |
---|
294 | #we have read the last tag backwards |
---|
295 | last_tag = ''.join(last_tag[::-1]) |
---|
296 | #we remove the </ and > |
---|
297 | last_tag = last_tag.rstrip('>').lstrip('</') |
---|
298 | |
---|
299 | #we check that we're removing the asked tag |
---|
300 | if tag is not None and tag != last_tag: |
---|
301 | raise RuntimeError("The given xml tag wasn't the last one in the file") |
---|
302 | |
---|
303 | # while we are at it: also remove all white spaces in that line :-) |
---|
304 | i -= 1 |
---|
305 | while True: |
---|
306 | fh.seek(i, 2) |
---|
307 | char = fh.read(1) |
---|
308 | if not char == ' ' and not char == '\t': |
---|
309 | break; |
---|
310 | if fh.tell() == 1: |
---|
311 | break; |
---|
312 | i -= 1 |
---|
313 | |
---|
314 | fh.truncate(); |
---|
315 | |
---|
316 | fh.close() |
---|
317 | return last_tag |
---|
318 | |
---|
319 | |
---|
320 | def create_basic_xml_info(readname, fname): |
---|
321 | '''Formats a number of read specific infos into XML format. |
---|
322 | Currently formated: name and the tags set from command line |
---|
323 | ''' |
---|
324 | to_print = [' <trace>\n'] |
---|
325 | to_print.append(' <trace_name>') |
---|
326 | to_print.append(readname) |
---|
327 | to_print.append('</trace_name>\n') |
---|
328 | |
---|
329 | #extra information |
---|
330 | #do we have extra info for this file? |
---|
331 | info = None |
---|
332 | if config['xml_info']: |
---|
333 | #with this name? |
---|
334 | if fname in config['xml_info']: |
---|
335 | info = config['xml_info'][fname] |
---|
336 | else: |
---|
337 | #with no name? |
---|
338 | try: |
---|
339 | info = config['xml_info'][fake_sff_name] |
---|
340 | except KeyError: |
---|
341 | pass |
---|
342 | #we print the info that we have |
---|
343 | if info: |
---|
344 | for key in info: |
---|
345 | to_print.append(' <' + key + '>' + info[key] + \ |
---|
346 | '</' + key +'>\n') |
---|
347 | |
---|
348 | return ''.join(to_print) |
---|
349 | |
---|
350 | |
---|
351 | def create_clip_xml_info(readlen, adapl, adapr, quall, qualr): |
---|
352 | '''Takes the clip values of the read and formats them into XML |
---|
353 | Corrects "wrong" values that might have resulted through |
---|
354 | simplified calculations earlier in the process of conversion |
---|
355 | (especially during splitting of paired-end reads) |
---|
356 | ''' |
---|
357 | |
---|
358 | to_print = [""] |
---|
359 | |
---|
360 | # if right borders are >= to read length, they don't need |
---|
361 | # to be printed |
---|
362 | if adapr >= readlen: |
---|
363 | adapr = 0 |
---|
364 | if qualr >= readlen: |
---|
365 | qualr = 0 |
---|
366 | |
---|
367 | # BaCh |
---|
368 | # when called via split_paired_end(), some values may be < 0 |
---|
369 | # (when clip values were 0 previously) |
---|
370 | # instead of putting tons of if clauses for different calculations there, |
---|
371 | # I centralise corrective measure here |
---|
372 | # set all values <0 to 0 |
---|
373 | |
---|
374 | if adapr < 0: |
---|
375 | adapr = 0 |
---|
376 | if qualr <0: |
---|
377 | qualr = 0 |
---|
378 | if adapl < 0: |
---|
379 | adapl = 0 |
---|
380 | if quall <0: |
---|
381 | quall = 0 |
---|
382 | |
---|
383 | if quall: |
---|
384 | to_print.append(' <clip_quality_left>') |
---|
385 | to_print.append(str(quall)) |
---|
386 | to_print.append('</clip_quality_left>\n') |
---|
387 | if qualr: |
---|
388 | to_print.append(' <clip_quality_right>') |
---|
389 | to_print.append(str(qualr)) |
---|
390 | to_print.append('</clip_quality_right>\n') |
---|
391 | if adapl: |
---|
392 | to_print.append(' <clip_vector_left>') |
---|
393 | to_print.append(str(adapl)) |
---|
394 | to_print.append('</clip_vector_left>\n') |
---|
395 | if adapr: |
---|
396 | to_print.append(' <clip_vector_right>') |
---|
397 | to_print.append(str(adapr)) |
---|
398 | to_print.append('</clip_vector_right>\n') |
---|
399 | return ''.join(to_print) |
---|
400 | |
---|
401 | |
---|
402 | def create_xml_for_unpaired_read(data, fname): |
---|
403 | '''Given the data for one read it returns an str with the xml ancillary |
---|
404 | data.''' |
---|
405 | to_print = [create_basic_xml_info(data['name'],fname)] |
---|
406 | #clippings in the XML only if we do not hard clip |
---|
407 | if not config['clip']: |
---|
408 | to_print.append(create_clip_xml_info(data['number_of_bases'],data['clip_adapter_left'], data['clip_adapter_right'], data['clip_qual_left'], data['clip_qual_right'])); |
---|
409 | to_print.append(' </trace>\n') |
---|
410 | return ''.join(to_print) |
---|
411 | |
---|
412 | |
---|
413 | def format_as_fasta(name,seq,qual): |
---|
414 | name_line = ''.join(('>', name,'\n')) |
---|
415 | seqstring = ''.join((name_line, seq, '\n')) |
---|
416 | qual_line = ' '.join([str(q) for q in qual]) |
---|
417 | qualstring = ''.join((name_line, qual_line, '\n')) |
---|
418 | return seqstring, qualstring |
---|
419 | |
---|
420 | def format_as_fastq(name,seq,qual): |
---|
421 | qual_line = ''.join([chr(q+33) for q in qual]) |
---|
422 | #seqstring = ''.join(('@', name,'\n', seq, '\n+', name,'\n', qual_line, '\n')) |
---|
423 | seqstring = ''.join(('@', name,'\n', seq, '\n+\n', qual_line, '\n')) |
---|
424 | return seqstring |
---|
425 | |
---|
426 | |
---|
427 | def get_read_data(data): |
---|
428 | '''Given the data for one read it returns 2 strs with the fasta seq |
---|
429 | and fasta qual.''' |
---|
430 | #seq and qual |
---|
431 | if config['mix_case']: |
---|
432 | seq = sequence_case(data) |
---|
433 | qual = data['quality_scores'] |
---|
434 | else : |
---|
435 | seq = data['bases'] |
---|
436 | qual = data['quality_scores'] |
---|
437 | |
---|
438 | return seq, qual |
---|
439 | |
---|
440 | def extract_read_info(data, fname): |
---|
441 | '''Given the data for one read it returns 3 strs with the fasta seq, fasta |
---|
442 | qual and xml ancillary data.''' |
---|
443 | |
---|
444 | seq,qual = get_read_data(data) |
---|
445 | seqstring, qualstring = format_as_fasta(data['name'],seq,qual) |
---|
446 | |
---|
447 | #name_line = ''.join(('>', data['name'],'\n')) |
---|
448 | #seq = ''.join((name_line, seq, '\n')) |
---|
449 | #qual_line = ' '.join([str(q) for q in qual]) |
---|
450 | #qual = ''.join((name_line, qual_line, '\n')) |
---|
451 | |
---|
452 | xmlstring = create_xml_for_unpaired_read(data, fname) |
---|
453 | |
---|
454 | return seqstring, qualstring, xmlstring |
---|
455 | |
---|
456 | def write_sequence(name,seq,qual,seq_fh,qual_fh): |
---|
457 | '''Write sequence and quality FASTA and FASTA qual filehandles |
---|
458 | (or into FASTQ and XML) |
---|
459 | if sequence length is 0, don't write''' |
---|
460 | |
---|
461 | if len(seq) == 0 : return |
---|
462 | |
---|
463 | if qual_fh is None: |
---|
464 | seq_fh.write(format_as_fastq(name,seq,qual)) |
---|
465 | else: |
---|
466 | seqstring, qualstring = format_as_fasta(name,seq,qual) |
---|
467 | seq_fh.write(seqstring) |
---|
468 | qual_fh.write(qualstring) |
---|
469 | return |
---|
470 | |
---|
471 | def write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh): |
---|
472 | '''Writes an unpaired read into FASTA, FASTA qual and XML filehandles |
---|
473 | (or into FASTQ and XML) |
---|
474 | if sequence length is 0, don't write''' |
---|
475 | |
---|
476 | seq,qual = get_read_data(data) |
---|
477 | if len(seq) == 0 : return |
---|
478 | |
---|
479 | write_sequence(data['name'],seq,qual,seq_fh,qual_fh) |
---|
480 | |
---|
481 | anci = create_xml_for_unpaired_read(data, sff_fh.name) |
---|
482 | if anci is not None: |
---|
483 | xml_fh.write(anci) |
---|
484 | return |
---|
485 | |
---|
486 | |
---|
487 | def reverse_complement(seq): |
---|
488 | '''Returns the reverse complement of a DNA sequence as string''' |
---|
489 | |
---|
490 | compdict = { |
---|
491 | 'a': 't', |
---|
492 | 'c': 'g', |
---|
493 | 'g': 'c', |
---|
494 | 't': 'a', |
---|
495 | 'u': 't', |
---|
496 | 'm': 'k', |
---|
497 | 'r': 'y', |
---|
498 | 'w': 'w', |
---|
499 | 's': 's', |
---|
500 | 'y': 'r', |
---|
501 | 'k': 'm', |
---|
502 | 'v': 'b', |
---|
503 | 'h': 'd', |
---|
504 | 'd': 'h', |
---|
505 | 'b': 'v', |
---|
506 | 'x': 'x', |
---|
507 | 'n': 'n', |
---|
508 | 'A': 'T', |
---|
509 | 'C': 'G', |
---|
510 | 'G': 'C', |
---|
511 | 'T': 'A', |
---|
512 | 'U': 'T', |
---|
513 | 'M': 'K', |
---|
514 | 'R': 'Y', |
---|
515 | 'W': 'W', |
---|
516 | 'S': 'S', |
---|
517 | 'Y': 'R', |
---|
518 | 'K': 'M', |
---|
519 | 'V': 'B', |
---|
520 | 'H': 'D', |
---|
521 | 'D': 'H', |
---|
522 | 'B': 'V', |
---|
523 | 'X': 'X', |
---|
524 | 'N': 'N', |
---|
525 | '*': '*' |
---|
526 | } |
---|
527 | |
---|
528 | complseq = ''.join([compdict[base] for base in seq]) |
---|
529 | # python hack to reverse a list/string/etc |
---|
530 | complseq = complseq[::-1] |
---|
531 | return complseq |
---|
532 | |
---|
533 | |
---|
534 | def mask_sequence(seq, maskchar, fpos, tpos): |
---|
535 | '''Given a sequence, mask it with maskchar starting at fpos (including) and |
---|
536 | ending at tpos (excluding) |
---|
537 | ''' |
---|
538 | |
---|
539 | if len(maskchar) > 1: |
---|
540 | raise RuntimeError("Internal error: more than one character given to mask_sequence") |
---|
541 | if fpos<0: |
---|
542 | fpos = 0 |
---|
543 | if tpos > len(seq): |
---|
544 | tpos = len(seq) |
---|
545 | |
---|
546 | newseq = ''.join((seq[:fpos],maskchar*(tpos-fpos), seq[tpos:])) |
---|
547 | |
---|
548 | return newseq |
---|
549 | |
---|
550 | |
---|
551 | def fragment_sequences(sequence, qualities, splitchar): |
---|
552 | '''Works like split() on strings, except it does this on a sequence |
---|
553 | and the corresponding list with quality values. |
---|
554 | Returns a tuple for each fragment, each sublist has the fragment |
---|
555 | sequence as first and the fragment qualities as second elemnt''' |
---|
556 | |
---|
557 | # this is slow (due to zip and list appends... use an iterator over |
---|
558 | # the sequence find find variations and splices on seq and qual |
---|
559 | |
---|
560 | if len(sequence) != len(qualities): |
---|
561 | print sequence, qualities |
---|
562 | raise RuntimeError("Internal error: length of sequence and qualities don't match???") |
---|
563 | |
---|
564 | retlist = ([]) |
---|
565 | if len(sequence) == 0: |
---|
566 | return retlist |
---|
567 | |
---|
568 | actseq = ([]) |
---|
569 | actqual = ([]) |
---|
570 | if sequence[0] != splitchar: |
---|
571 | inseq = True |
---|
572 | else: |
---|
573 | inseq = False |
---|
574 | for char,qual in zip(sequence,qualities): |
---|
575 | if inseq: |
---|
576 | if char != splitchar: |
---|
577 | actseq.append(char) |
---|
578 | actqual.append(qual) |
---|
579 | else: |
---|
580 | retlist.append((''.join(actseq), actqual)) |
---|
581 | actseq = ([]) |
---|
582 | actqual = ([]) |
---|
583 | inseq = False |
---|
584 | else: |
---|
585 | if char != splitchar: |
---|
586 | inseq = True |
---|
587 | actseq.append(char) |
---|
588 | actqual.append(qual) |
---|
589 | |
---|
590 | if inseq and len(actseq): |
---|
591 | retlist.append((''.join(actseq), actqual)) |
---|
592 | |
---|
593 | return retlist |
---|
594 | |
---|
595 | |
---|
596 | def calc_subseq_boundaries(maskedseq, maskchar): |
---|
597 | '''E.g.: |
---|
598 | ........xxxxxxxx..........xxxxxxxxxxxxxxxxxxxxx......... |
---|
599 | to |
---|
600 | (0,8),(8,16),(16,26),(26,47),(47,56) |
---|
601 | ''' |
---|
602 | |
---|
603 | blist = ([]) |
---|
604 | if len(maskedseq) == 0: |
---|
605 | return blist |
---|
606 | |
---|
607 | inmask = True |
---|
608 | if maskedseq[0] != maskchar: |
---|
609 | inmask = False |
---|
610 | |
---|
611 | start = 0 |
---|
612 | for spos in range(len(maskedseq)): |
---|
613 | if inmask and maskedseq[spos] != maskchar: |
---|
614 | blist.append(([start,spos])) |
---|
615 | start = spos |
---|
616 | inmask = False |
---|
617 | elif not inmask and maskedseq[spos] == maskchar: |
---|
618 | blist.append(([start,spos])) |
---|
619 | start = spos |
---|
620 | inmask = True |
---|
621 | |
---|
622 | blist.append(([start,spos+1])) |
---|
623 | |
---|
624 | return blist |
---|
625 | |
---|
626 | |
---|
627 | def correct_for_smallhits(maskedseq, maskchar, linkername): |
---|
628 | '''If partial hits were found, take preventive measure: grow |
---|
629 | the masked areas by 20 bases in each direction |
---|
630 | Returns either unchanged "maskedseq" or a new sequence |
---|
631 | with some more characters masked. |
---|
632 | ''' |
---|
633 | global linkerlengths |
---|
634 | |
---|
635 | CEBUG = 0 |
---|
636 | |
---|
637 | if CEBUG : print "correct_for_smallhits" |
---|
638 | if CEBUG : print "Masked seq\n", maskedseq |
---|
639 | if CEBUG : print "Linkername: ", linkername |
---|
640 | |
---|
641 | if len(maskedseq) == 0: |
---|
642 | return maskedseq |
---|
643 | |
---|
644 | growl=40 |
---|
645 | growl2=growl/2 |
---|
646 | |
---|
647 | boundaries = calc_subseq_boundaries(maskedseq,maskchar) |
---|
648 | if CEBUG : print "Boundaries: ", boundaries |
---|
649 | |
---|
650 | foundpartial = False |
---|
651 | for bounds in boundaries: |
---|
652 | if CEBUG : print "\tbounds: ", bounds |
---|
653 | left, right = bounds |
---|
654 | if left != 0 and right != len(maskedseq): |
---|
655 | if maskedseq[left] == maskchar: |
---|
656 | # allow 10% discrepancy |
---|
657 | # -linkerlengths[linkername]/10 |
---|
658 | # that's a kind of safety net if there are slight sequencing |
---|
659 | # errors in the linker itself |
---|
660 | if right-left < linkerlengths[linkername]-linkerlengths[linkername]/10: |
---|
661 | if CEBUG : print "\t\tPartial: found " + str(right-left) + " gaps, " + linkername + " is " + str(linkerlengths[linkername]) + " nt long." |
---|
662 | foundpartial = True |
---|
663 | |
---|
664 | if not foundpartial: |
---|
665 | return maskedseq |
---|
666 | |
---|
667 | # grow |
---|
668 | newseq = "" |
---|
669 | for bounds in boundaries: |
---|
670 | if CEBUG : print "Bounds: ", bounds |
---|
671 | left, right = bounds |
---|
672 | if maskedseq[left] == maskchar: |
---|
673 | newseq += maskedseq[left:right] |
---|
674 | else: |
---|
675 | clearstart = 0 |
---|
676 | if left > 0 : |
---|
677 | clearstart = left+growl2 |
---|
678 | clearstop = len(maskedseq) |
---|
679 | if right < len(maskedseq): |
---|
680 | clearstop = right-growl2 |
---|
681 | |
---|
682 | if CEBUG : print "clearstart, clearstop: ",clearstart, clearstop |
---|
683 | |
---|
684 | if clearstop <= clearstart: |
---|
685 | newseq += maskchar * (right-left) |
---|
686 | else: |
---|
687 | if clearstart != left: |
---|
688 | newseq += maskchar * growl2 |
---|
689 | newseq += maskedseq[clearstart:clearstop] |
---|
690 | if clearstop != right: |
---|
691 | newseq += maskchar * growl2 |
---|
692 | |
---|
693 | #print "newseq\n",newseq |
---|
694 | |
---|
695 | return newseq |
---|
696 | |
---|
697 | |
---|
698 | def split_paired_end(data, sff_fh, seq_fh, qual_fh, xml_fh): |
---|
699 | '''Splits a paired end read and writes sequences into FASTA, FASTA qual |
---|
700 | and XML traceinfo file. Returns the number of sequences created. |
---|
701 | |
---|
702 | As the linker sequence may be anywhere in the read, including the ends |
---|
703 | and overlapping with bad quality sequence, we need to perform some |
---|
704 | computing and eventually set new clip points. |
---|
705 | |
---|
706 | If the resulting split yields only one sequence (because linker |
---|
707 | was not present or overlapping with left or right clip), only one |
---|
708 | sequence will be written with ".fn" appended to the name. |
---|
709 | |
---|
710 | If the read can be split, two reads will be written. The side left of |
---|
711 | the linker will be named ".r" and will be written in reverse complement |
---|
712 | into the file to conform with what approximately all assemblers expect |
---|
713 | when reading paired-end data: reads in forward direction in file. The side |
---|
714 | right of the linker will be named ".f" |
---|
715 | |
---|
716 | If SSAHA found partial linker (linker sequences < length of linker), |
---|
717 | the sequences will get a "_pl" furthermore be cut back thoroughly. |
---|
718 | |
---|
719 | If SSAHA found multiple occurences of the linker, the names will get an |
---|
720 | additional "_mlc" within the name to show that there was "multiple |
---|
721 | linker contamination". |
---|
722 | |
---|
723 | For multiple or partial linker, the "good" parts of the reads are |
---|
724 | stored with a ".part<number>" name, additionally they will not get |
---|
725 | template information in the XML |
---|
726 | ''' |
---|
727 | |
---|
728 | global ssahapematches |
---|
729 | |
---|
730 | CEBUG = 0 |
---|
731 | |
---|
732 | maskchar = "#" |
---|
733 | |
---|
734 | if CEBUG : print "Need to split: " + data['name'] |
---|
735 | |
---|
736 | numseqs = 0; |
---|
737 | readname = data['name'] |
---|
738 | readlen = data['number_of_bases'] |
---|
739 | |
---|
740 | leftclip, rightclip = return_merged_clips(data) |
---|
741 | seq, qual = get_read_data(data) |
---|
742 | |
---|
743 | if CEBUG : print "Original read:\n",seq |
---|
744 | |
---|
745 | maskedseq = seq |
---|
746 | if leftclip > 0: |
---|
747 | maskedseq = mask_sequence(maskedseq, maskchar, 0, leftclip-1) |
---|
748 | if rightclip < len(maskedseq): |
---|
749 | maskedseq = mask_sequence(maskedseq, maskchar, rightclip, len(maskedseq)) |
---|
750 | |
---|
751 | leftclip, rightclip = return_merged_clips(data) |
---|
752 | readlen = data['number_of_bases'] |
---|
753 | |
---|
754 | if CEBUG : print "Readname:", readname |
---|
755 | if CEBUG : print "Readlen:", readlen |
---|
756 | if CEBUG : print "Num matches:", str(len(ssahapematches[data['name']])) |
---|
757 | if CEBUG : print "matches:", ssahapematches[data['name']] |
---|
758 | |
---|
759 | for match in ssahapematches[data['name']]: |
---|
760 | score = int(match[0]) |
---|
761 | linkername = match[2] |
---|
762 | leftreadhit = int(match[3]) |
---|
763 | rightreadhit = int(match[4]) |
---|
764 | #leftlinkerhit = int(match[5]) |
---|
765 | #rightlinkerhit = int(match[6]) |
---|
766 | #direction = match[7] |
---|
767 | #hitlen = int(match[8]) |
---|
768 | #hitidentity = float(match[9]) |
---|
769 | |
---|
770 | if CEBUG : print match |
---|
771 | if CEBUG : print "Match with score:", score |
---|
772 | if CEBUG : print "Read before:\n", maskedseq |
---|
773 | maskedseq = mask_sequence(maskedseq, maskchar, leftreadhit-1, rightreadhit) |
---|
774 | if CEBUG : print "Masked seq:\n", maskedseq |
---|
775 | |
---|
776 | correctedseq = correct_for_smallhits(maskedseq, maskchar, linkername) |
---|
777 | |
---|
778 | if len(maskedseq) != len(correctedseq): |
---|
779 | raise RuntimeError("Internal error: maskedseq != correctedseq") |
---|
780 | |
---|
781 | partialhits = False |
---|
782 | if correctedseq != maskedseq: |
---|
783 | if CEBUG : print "Partial hits in", readname |
---|
784 | if CEBUG : print "Original seq:\n", seq |
---|
785 | if CEBUG : print "Masked seq:\n", maskedseq |
---|
786 | if CEBUG : print "Corrected seq\n", correctedseq |
---|
787 | partialhits = True |
---|
788 | readname += "_pl" |
---|
789 | maskedseq = correctedseq |
---|
790 | |
---|
791 | fragments = fragment_sequences(maskedseq, qual, maskchar) |
---|
792 | |
---|
793 | if CEBUG : print "Fragments (", len(fragments), "): ", fragments |
---|
794 | |
---|
795 | mlcflag = False |
---|
796 | #if len(ssahapematches[data['name']]) > 1: |
---|
797 | # #print "Multi linker contamination" |
---|
798 | # mlcflag = True |
---|
799 | # readname += "_mlc" |
---|
800 | |
---|
801 | if len(fragments) > 2: |
---|
802 | if CEBUG : print "Multi linker contamination" |
---|
803 | mlcflag = True |
---|
804 | readname += "_mlc" |
---|
805 | |
---|
806 | |
---|
807 | #print fragments |
---|
808 | if mlcflag or partialhits: |
---|
809 | fragcounter = 1 |
---|
810 | readname += ".part" |
---|
811 | for frag in fragments: |
---|
812 | actseq = frag[0] |
---|
813 | if len(actseq) >= 20: |
---|
814 | actqual = frag[1] |
---|
815 | oname = readname + str(fragcounter) |
---|
816 | #seq_fh.write(">"+oname+"\n") |
---|
817 | #seq_fh.write(actseq+"\n") |
---|
818 | #qual_fh.write(">"+oname+"\n") |
---|
819 | #qual_fh.write(' '.join((str(q) for q in actqual))) |
---|
820 | #qual_fh.write("\n") |
---|
821 | write_sequence(oname,actseq,actqual,seq_fh,qual_fh) |
---|
822 | to_print = [create_basic_xml_info(oname,sff_fh.name)] |
---|
823 | # No clipping in XML ... the multiple and partial fragments |
---|
824 | # are clipped "hard" |
---|
825 | # No template ID and trace_end: we don't know the |
---|
826 | # orientation of the frahments. Even if it were |
---|
827 | # only two, the fact we had multiple linkers |
---|
828 | # says something went wrong, so simply do not |
---|
829 | # write any paired-end information for all these fragments |
---|
830 | to_print.append(' </trace>\n') |
---|
831 | xml_fh.write(''.join(to_print)) |
---|
832 | numseqs += 1 |
---|
833 | fragcounter += 1 |
---|
834 | else: |
---|
835 | if len(fragments) >2: |
---|
836 | raise RuntimeError("Unexpected: more than two fragments detected in " + readname + ". please contact the authors.") |
---|
837 | # nothing will happen for 0 fragments |
---|
838 | if len(fragments) == 1: |
---|
839 | #print "Tada1" |
---|
840 | boundaries = calc_subseq_boundaries(maskedseq,maskchar) |
---|
841 | if len(boundaries) < 1 or len(boundaries) >3: |
---|
842 | raise RuntimeError("Unexpected case: ", str(len(boundaries)), "boundaries for 1 fragment of ", readname) |
---|
843 | if len(boundaries) == 3: |
---|
844 | # case: mask char on both sides of sequence |
---|
845 | #print "bounds3" |
---|
846 | data['clip_adapter_left']=1+boundaries[0][1] |
---|
847 | data['clip_adapter_right']=boundaries[2][0] |
---|
848 | elif len(boundaries) == 2: |
---|
849 | # case: mask char left or right of sequence |
---|
850 | #print "bounds2", |
---|
851 | if maskedseq[0] == maskchar : |
---|
852 | # case: mask char left |
---|
853 | #print "left" |
---|
854 | data['clip_adapter_left']=1+boundaries[0][1] |
---|
855 | else: |
---|
856 | # case: mask char right |
---|
857 | #print "right" |
---|
858 | data['clip_adapter_right']=boundaries[1][0] |
---|
859 | data['name'] = data['name'] + ".fn" |
---|
860 | write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh) |
---|
861 | numseqs = 1 |
---|
862 | elif len(fragments) == 2: |
---|
863 | #print "Tada2" |
---|
864 | oname = readname + ".r" |
---|
865 | seq, qual = get_read_data(data) |
---|
866 | |
---|
867 | startsearch = False |
---|
868 | for spos in range(len(maskedseq)): |
---|
869 | if maskedseq[spos] != maskchar: |
---|
870 | startsearch = True; |
---|
871 | else: |
---|
872 | if startsearch: |
---|
873 | break |
---|
874 | |
---|
875 | #print "\nspos: ", spos |
---|
876 | lseq=seq[:spos] |
---|
877 | #print "lseq:", lseq |
---|
878 | actseq = reverse_complement(lseq) |
---|
879 | lreadlen = len(actseq) |
---|
880 | lqual = qual[:spos]; |
---|
881 | # python hack to reverse a list/string/etc |
---|
882 | lqual = lqual[::-1]; |
---|
883 | |
---|
884 | #seq_fh.write(">"+oname+"\n") |
---|
885 | #seq_fh.write(actseq+"\n") |
---|
886 | #qual_fh.write(">"+oname+"\n") |
---|
887 | #qual_fh.write(' '.join((str(q) for q in lqual))) |
---|
888 | #qual_fh.write("\n") |
---|
889 | |
---|
890 | write_sequence(oname,actseq,lqual,seq_fh,qual_fh) |
---|
891 | |
---|
892 | to_print = [create_basic_xml_info(oname,sff_fh.name)] |
---|
893 | to_print.append(create_clip_xml_info(lreadlen, 0, lreadlen+1-data['clip_adapter_left'], 0, lreadlen+1-data['clip_qual_left'])); |
---|
894 | to_print.append(' <template_id>') |
---|
895 | to_print.append(readname) |
---|
896 | to_print.append('</template_id>\n') |
---|
897 | to_print.append(' <trace_end>r</trace_end>\n') |
---|
898 | to_print.append(' </trace>\n') |
---|
899 | xml_fh.write(''.join(to_print)) |
---|
900 | |
---|
901 | oname = readname + ".f" |
---|
902 | startsearch = False |
---|
903 | for spos in range(len(maskedseq)-1,-1,-1): |
---|
904 | if maskedseq[spos] != maskchar: |
---|
905 | startsearch = True; |
---|
906 | else: |
---|
907 | if startsearch: |
---|
908 | break |
---|
909 | |
---|
910 | actseq = seq[spos+1:] |
---|
911 | actqual = qual[spos+1:]; |
---|
912 | |
---|
913 | #print "\nspos: ", spos |
---|
914 | #print "rseq:", actseq |
---|
915 | |
---|
916 | #seq_fh.write(">"+oname+"\n") |
---|
917 | #seq_fh.write(actseq+"\n") |
---|
918 | #qual_fh.write(">"+oname+"\n") |
---|
919 | #qual_fh.write(' '.join((str(q) for q in actqual))) |
---|
920 | #qual_fh.write("\n") |
---|
921 | write_sequence(oname,actseq,actqual,seq_fh,qual_fh) |
---|
922 | |
---|
923 | rreadlen = len(actseq) |
---|
924 | to_print = [create_basic_xml_info(oname,sff_fh.name)] |
---|
925 | to_print.append(create_clip_xml_info(rreadlen, 0, rreadlen-(readlen-data['clip_adapter_right']), 0, rreadlen-(readlen-data['clip_qual_right']))); |
---|
926 | to_print.append(' <template_id>') |
---|
927 | to_print.append(readname) |
---|
928 | to_print.append('</template_id>\n') |
---|
929 | to_print.append(' <trace_end>f</trace_end>\n') |
---|
930 | to_print.append(' </trace>\n') |
---|
931 | xml_fh.write(''.join(to_print)) |
---|
932 | numseqs = 2 |
---|
933 | |
---|
934 | return numseqs |
---|
935 | |
---|
936 | |
---|
937 | |
---|
938 | def extract_reads_from_sff(config, sff_files): |
---|
939 | '''Given the configuration and the list of sff_files it writes the seqs, |
---|
940 | qualities and ancillary data into the output file(s). |
---|
941 | |
---|
942 | If file for paired-end linker was given, first extracts all sequences |
---|
943 | of an SFF and searches these against the linker(s) with SSAHA2 to |
---|
944 | create needed information to split reads. |
---|
945 | ''' |
---|
946 | |
---|
947 | global ssahapematches |
---|
948 | |
---|
949 | |
---|
950 | if len(sff_files) == 0 : |
---|
951 | raise RuntimeError("No SFF file given?") |
---|
952 | |
---|
953 | #we go through all input files |
---|
954 | for sff_file in sff_files: |
---|
955 | if not os.path.getsize(sff_file): |
---|
956 | raise RuntimeError('Empty file? : ' + sff_file) |
---|
957 | fh = open(sff_file, 'r') |
---|
958 | fh.close() |
---|
959 | |
---|
960 | openmode = 'w' |
---|
961 | if config['append']: |
---|
962 | openmode = 'a' |
---|
963 | |
---|
964 | seq_fh = open(config['seq_fname'], openmode) |
---|
965 | xml_fh = open(config['xml_fname'], openmode) |
---|
966 | if config['want_fastq']: |
---|
967 | qual_fh = None |
---|
968 | try: |
---|
969 | os.remove(config['qual_fname']) |
---|
970 | except : |
---|
971 | python_formattingwithoutbracesisdumb_dummy = 1 |
---|
972 | else: |
---|
973 | qual_fh = open(config['qual_fname'], openmode) |
---|
974 | |
---|
975 | if not config['append']: |
---|
976 | xml_fh.write('<?xml version="1.0"?>\n<trace_volume>\n') |
---|
977 | else: |
---|
978 | remove_last_xmltag_in_file(config['xml_fname'], "trace_volume") |
---|
979 | |
---|
980 | #we go through all input files |
---|
981 | for sff_file in sff_files: |
---|
982 | #print "Working on '" + sff_file + "':" |
---|
983 | ssahapematches.clear() |
---|
984 | |
---|
985 | seqcheckstore = ([]) |
---|
986 | |
---|
987 | debug = 0 |
---|
988 | |
---|
989 | if not debug and config['pelinker_fname']: |
---|
990 | #print "Creating temporary sequences from reads in '" + sff_file + "' ... ", |
---|
991 | sys.stdout.flush() |
---|
992 | |
---|
993 | if 0 : |
---|
994 | # for debugging |
---|
995 | pid = os.getpid() |
---|
996 | tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta' |
---|
997 | tmpfasta_fh = open(tmpfasta_fname, 'w') |
---|
998 | else: |
---|
999 | tmpfasta_fh = tempfile.NamedTemporaryFile(prefix = 'sffeseqs_', |
---|
1000 | suffix = '.fasta') |
---|
1001 | |
---|
1002 | sff_fh = open(sff_file, 'rb') |
---|
1003 | header_data = read_header(fileh=sff_fh) |
---|
1004 | for seq_data in sequences(fileh=sff_fh, header=header_data): |
---|
1005 | seq,qual = get_read_data(seq_data) |
---|
1006 | seqstring, qualstring = format_as_fasta(seq_data['name'],seq,qual) |
---|
1007 | tmpfasta_fh.write(seqstring) |
---|
1008 | #seq, qual, anci = extract_read_info(seq_data, sff_fh.name) |
---|
1009 | #tmpfasta_fh.write(seq) |
---|
1010 | #print "done." |
---|
1011 | tmpfasta_fh.seek(0) |
---|
1012 | |
---|
1013 | if 0 : |
---|
1014 | # for debugging |
---|
1015 | tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2' |
---|
1016 | tmpssaha_fh = open(tmpssaha_fname, 'w+') |
---|
1017 | else: |
---|
1018 | tmpssaha_fh = tempfile.NamedTemporaryFile(prefix = 'sffealig_', |
---|
1019 | suffix = '.ssaha2') |
---|
1020 | |
---|
1021 | launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh) |
---|
1022 | tmpfasta_fh.close() |
---|
1023 | |
---|
1024 | tmpssaha_fh.seek(0) |
---|
1025 | read_ssaha_data(tmpssaha_fh) |
---|
1026 | tmpssaha_fh.close() |
---|
1027 | |
---|
1028 | if debug: |
---|
1029 | tmpssaha_fh = open("sffe.tmp.10634.ssaha2", 'r') |
---|
1030 | read_ssaha_data(tmpssaha_fh) |
---|
1031 | |
---|
1032 | #print "Converting '" + sff_file + "' ... ", |
---|
1033 | sys.stdout.flush() |
---|
1034 | sff_fh = open(sff_file, 'rb') |
---|
1035 | #read_header(infile) |
---|
1036 | header_data = read_header(fileh=sff_fh) |
---|
1037 | |
---|
1038 | #now convert all reads |
---|
1039 | nseqs_sff = 0 |
---|
1040 | nseqs_out = 0 |
---|
1041 | for seq_data in sequences(fileh=sff_fh, header=header_data): |
---|
1042 | nseqs_sff += 1 |
---|
1043 | |
---|
1044 | seq, qual = clip_read(seq_data) |
---|
1045 | seqcheckstore.append(seq[0:50]) |
---|
1046 | |
---|
1047 | #if nseqs_sff >1000: |
---|
1048 | # check_for_dubious_startseq(seqcheckstore,sff_file,seq_data) |
---|
1049 | # sys.exit() |
---|
1050 | |
---|
1051 | if ssahapematches.has_key(seq_data['name']): |
---|
1052 | #print "Paired end:",seq_data['name'] |
---|
1053 | nseqs_out += split_paired_end(seq_data, sff_fh, seq_fh, qual_fh, xml_fh) |
---|
1054 | else: |
---|
1055 | #print "Normal:",seq_data['name'] |
---|
1056 | if config['pelinker_fname']: |
---|
1057 | seq_data['name'] = seq_data['name'] + ".fn" |
---|
1058 | write_unpaired_read(seq_data, sff_fh, seq_fh, qual_fh, xml_fh) |
---|
1059 | nseqs_out += 1 |
---|
1060 | #print "done." |
---|
1061 | #print 'Converted', str(nseqs_sff), 'reads into', str(nseqs_out), 'sequences.' |
---|
1062 | sff_fh.close() |
---|
1063 | |
---|
1064 | check_for_dubious_startseq(seqcheckstore,sff_file,seq_data) |
---|
1065 | seqcheckstore = ([]) |
---|
1066 | |
---|
1067 | xml_fh.write('</trace_volume>\n') |
---|
1068 | |
---|
1069 | xml_fh.close() |
---|
1070 | seq_fh.close() |
---|
1071 | if qual_fh is not None: |
---|
1072 | qual_fh.close() |
---|
1073 | |
---|
1074 | return |
---|
1075 | |
---|
1076 | def check_for_dubious_startseq(seqcheckstore, sffname,seqdata): |
---|
1077 | |
---|
1078 | global stern_warning |
---|
1079 | |
---|
1080 | foundproblem = "" |
---|
1081 | for checklen in range(1,len(seqcheckstore[0])): |
---|
1082 | foundinloop = False |
---|
1083 | seqdict = {} |
---|
1084 | for seq in seqcheckstore: |
---|
1085 | shortseq = seq[0:checklen] |
---|
1086 | if shortseq in seqdict: |
---|
1087 | seqdict[shortseq] += 1 |
---|
1088 | else: |
---|
1089 | seqdict[shortseq] = 1 |
---|
1090 | |
---|
1091 | for shortseq, count in seqdict.items(): |
---|
1092 | if float(count)/len(seqcheckstore) >= 0.5: |
---|
1093 | foundinloop = True |
---|
1094 | stern_warning |
---|
1095 | foundproblem = "\n"+"*" * 80 |
---|
1096 | foundproblem += "\nWARNING: " |
---|
1097 | foundproblem += "weird sequences in file " + sffname + "\n\n" |
---|
1098 | foundproblem += "After applying left clips, " + str(count) + " sequences (=" |
---|
1099 | foundproblem += '%.0f'%(100.0*float(count)/len(seqcheckstore)) |
---|
1100 | foundproblem += "%) start with these bases:\n" + shortseq |
---|
1101 | foundproblem += "\n\nThis does not look sane.\n\n" |
---|
1102 | foundproblem += "Countermeasures you *probably* must take:\n" |
---|
1103 | foundproblem += "1) Make your sequence provider aware of that problem and ask whether this can be\n corrected in the SFF.\n" |
---|
1104 | foundproblem += "2) If you decide that this is not normal and your sequence provider does not\n react, use the --min_left_clip of sff_extract.\n" |
---|
1105 | left,right = return_merged_clips(seqdata) |
---|
1106 | foundproblem += " (Probably '--min_left_clip="+ str(left+len(shortseq))+"' but you should cross-check that)\n" |
---|
1107 | foundproblem += "*" * 80 + "\n" |
---|
1108 | if not foundinloop : |
---|
1109 | break |
---|
1110 | if len(foundproblem): |
---|
1111 | print foundproblem |
---|
1112 | |
---|
1113 | |
---|
1114 | def parse_extra_info(info): |
---|
1115 | '''It parses the information that will go in the xml file. |
---|
1116 | |
---|
1117 | There are two formats accepted for the extra information: |
---|
1118 | key1:value1, key2:value2 |
---|
1119 | or: |
---|
1120 | file1.sff{key1:value1, key2:value2};file2.sff{key3:value3} |
---|
1121 | ''' |
---|
1122 | if not info: |
---|
1123 | return info |
---|
1124 | finfos = info.split(';') #information for each file |
---|
1125 | data_for_files = {} |
---|
1126 | for finfo in finfos: |
---|
1127 | #we split the file name from the rest |
---|
1128 | items = finfo.split('{') |
---|
1129 | if len(items) == 1: |
---|
1130 | fname = fake_sff_name |
---|
1131 | info = items[0] |
---|
1132 | else: |
---|
1133 | fname = items[0] |
---|
1134 | info = items[1] |
---|
1135 | #now we get each key,value pair in the info |
---|
1136 | info = info.replace('}', '') |
---|
1137 | data = {} |
---|
1138 | for item in info.split(','): |
---|
1139 | key, value = item.strip().split(':') |
---|
1140 | key = key.strip() |
---|
1141 | value = value.strip() |
---|
1142 | data[key] = value |
---|
1143 | data_for_files[fname] = data |
---|
1144 | return data_for_files |
---|
1145 | |
---|
1146 | |
---|
1147 | def return_merged_clips(data): |
---|
1148 | '''It returns the left and right positions to clip.''' |
---|
1149 | def max(a, b): |
---|
1150 | '''It returns the max of the two given numbers. |
---|
1151 | |
---|
1152 | It won't take into account the zero values. |
---|
1153 | ''' |
---|
1154 | if not a and not b: |
---|
1155 | return None |
---|
1156 | if not a: |
---|
1157 | return b |
---|
1158 | if not b: |
---|
1159 | return a |
---|
1160 | if a >= b: |
---|
1161 | return a |
---|
1162 | else: |
---|
1163 | return b |
---|
1164 | def min(a, b): |
---|
1165 | '''It returns the min of the two given numbers. |
---|
1166 | |
---|
1167 | It won't take into account the zero values. |
---|
1168 | ''' |
---|
1169 | if not a and not b: |
---|
1170 | return None |
---|
1171 | if not a: |
---|
1172 | return b |
---|
1173 | if not b: |
---|
1174 | return a |
---|
1175 | if a <= b: |
---|
1176 | return a |
---|
1177 | else: |
---|
1178 | return b |
---|
1179 | left = max(data['clip_adapter_left'], data['clip_qual_left']) |
---|
1180 | right = min(data['clip_adapter_right'], data['clip_qual_right']) |
---|
1181 | #maybe both clips where zero |
---|
1182 | if left is None: |
---|
1183 | left = 1 |
---|
1184 | if right is None: |
---|
1185 | right = data['number_of_bases'] |
---|
1186 | return left, right |
---|
1187 | |
---|
1188 | def sequence_case(data): |
---|
1189 | '''Given the data for one read it returns the seq with mixed case. |
---|
1190 | |
---|
1191 | The regions to be clipped will be lower case and the rest upper case. |
---|
1192 | ''' |
---|
1193 | left, right = return_merged_clips(data) |
---|
1194 | seq = data['bases'] |
---|
1195 | new_seq = ''.join((seq[:left-1].lower(), seq[left-1:right], seq[right:].lower())) |
---|
1196 | return new_seq |
---|
1197 | |
---|
1198 | def clip_read(data): |
---|
1199 | '''Given the data for one read it returns clipped seq and qual.''' |
---|
1200 | |
---|
1201 | qual = data['quality_scores'] |
---|
1202 | left, right = return_merged_clips(data) |
---|
1203 | seq = data['bases'] |
---|
1204 | qual = data['quality_scores'] |
---|
1205 | new_seq = seq[left-1:right] |
---|
1206 | new_qual = qual[left-1:right] |
---|
1207 | |
---|
1208 | return new_seq, new_qual |
---|
1209 | |
---|
1210 | |
---|
1211 | |
---|
1212 | def tests_for_ssaha(linker_fname): |
---|
1213 | '''Tests whether SSAHA2 can be successfully called.''' |
---|
1214 | |
---|
1215 | try: |
---|
1216 | print "Testing whether SSAHA2 is installed and can be launched ... ", |
---|
1217 | sys.stdout.flush() |
---|
1218 | fh = open('/dev/null', 'w') |
---|
1219 | retcode = subprocess.call(["ssaha2", "-v"], stdout = fh) |
---|
1220 | fh.close() |
---|
1221 | print "ok." |
---|
1222 | except : |
---|
1223 | print "nope? Uh oh ...\n\n" |
---|
1224 | raise RuntimeError('Could not launch ssaha2. Have you installed it? Is it in your path?') |
---|
1225 | |
---|
1226 | |
---|
1227 | def load_linker_sequences(linker_fname): |
---|
1228 | '''Loads all linker sequences into memory, storing only the length |
---|
1229 | of each linker.''' |
---|
1230 | |
---|
1231 | global linkerlengths |
---|
1232 | |
---|
1233 | if not os.path.getsize(linker_fname): |
---|
1234 | raise RuntimeError("File empty? '" + linker_fname + "'") |
---|
1235 | fh = open(linker_fname, 'r') |
---|
1236 | linkerseqs = read_fasta(fh) |
---|
1237 | if len(linkerseqs) == 0: |
---|
1238 | raise RuntimeError(linker_fname + ": no sequence found?") |
---|
1239 | for i in linkerseqs: |
---|
1240 | if linkerlengths.has_key(i.name): |
---|
1241 | raise RuntimeError(linker_fname + ": sequence '" + i.name + "' present multiple times. Aborting.") |
---|
1242 | linkerlengths[i.name] = len(i.sequence) |
---|
1243 | fh.close() |
---|
1244 | |
---|
1245 | |
---|
1246 | def launch_ssaha(linker_fname, query_fname, output_fh): |
---|
1247 | '''Launches SSAHA2 on the linker and query file, string SSAHA2 output |
---|
1248 | into the output filehandle''' |
---|
1249 | |
---|
1250 | try: |
---|
1251 | print "Searching linker sequences with SSAHA2 (this may take a while) ... ", |
---|
1252 | sys.stdout.flush() |
---|
1253 | retcode = subprocess.call(["ssaha2", "-output", "ssaha2", "-solexa", "-kmer", "4", "-skip", "1", linker_fname, query_fname], stdout = output_fh) |
---|
1254 | if retcode: |
---|
1255 | raise RuntimeError('Ups.') |
---|
1256 | else: |
---|
1257 | print "ok." |
---|
1258 | except: |
---|
1259 | print "\n" |
---|
1260 | raise RuntimeError('An error occured during the SSAHA2 execution, aborting.') |
---|
1261 | |
---|
1262 | def read_ssaha_data(ssahadata_fh): |
---|
1263 | '''Given file handle, reads file generated with SSAHA2 (with default |
---|
1264 | output format) and stores all matches as list ssahapematches |
---|
1265 | (ssaha paired-end matches) dictionary''' |
---|
1266 | |
---|
1267 | global ssahapematches |
---|
1268 | |
---|
1269 | print "Parsing SSAHA2 result file ... ", |
---|
1270 | sys.stdout.flush() |
---|
1271 | |
---|
1272 | for line in ssahadata_fh: |
---|
1273 | if line.startswith('ALIGNMENT'): |
---|
1274 | ml = line.split() |
---|
1275 | if len(ml) != 12 : |
---|
1276 | print "\n", line, |
---|
1277 | raise RuntimeError('Expected 12 elements in the SSAHA2 line with ALIGMENT keyword, but found ' + str(len(ml))) |
---|
1278 | if not ssahapematches.has_key(ml[2]) : |
---|
1279 | ssahapematches[ml[2]] = ([]) |
---|
1280 | if ml[8] == 'F': |
---|
1281 | #print line, |
---|
1282 | |
---|
1283 | # store everything except the first element (output |
---|
1284 | # format name (ALIGNMENT)) and the last element |
---|
1285 | # (length) |
---|
1286 | ssahapematches[ml[2]].append(ml[1:-1]) |
---|
1287 | else: |
---|
1288 | #print ml |
---|
1289 | ml[4],ml[5] = ml[5],ml[4] |
---|
1290 | #print ml |
---|
1291 | ssahapematches[ml[2]].append(ml[1:-1]) |
---|
1292 | |
---|
1293 | print "done." |
---|
1294 | |
---|
1295 | |
---|
1296 | ########################################################################## |
---|
1297 | # |
---|
1298 | # BaCh: This block was shamelessly copied from |
---|
1299 | # http://python.genedrift.org/2007/07/04/reading-fasta-files-conclusion/ |
---|
1300 | # and then subsequently modified to read fasta correctly |
---|
1301 | # It's still not fool proof, but should be good enough |
---|
1302 | # |
---|
1303 | ########################################################################## |
---|
1304 | |
---|
1305 | class Fasta: |
---|
1306 | def __init__(self, name, sequence): |
---|
1307 | self.name = name |
---|
1308 | self.sequence = sequence |
---|
1309 | |
---|
1310 | def read_fasta(file): |
---|
1311 | items = [] |
---|
1312 | aninstance = Fasta('', '') |
---|
1313 | linenum = 0 |
---|
1314 | for line in file: |
---|
1315 | linenum += 1 |
---|
1316 | if line.startswith(">"): |
---|
1317 | if len(aninstance.sequence): |
---|
1318 | items.append(aninstance) |
---|
1319 | aninstance = Fasta('', '') |
---|
1320 | # name == all characters until the first whitespace |
---|
1321 | # (split()[0]) but without the starting ">" ([1:]) |
---|
1322 | aninstance.name = line.split()[0][1:] |
---|
1323 | aninstance.sequence = '' |
---|
1324 | if len(aninstance.name) == 0: |
---|
1325 | raise RuntimeError(file.name + ': no name in line ' + str(linenum) + '?') |
---|
1326 | |
---|
1327 | else: |
---|
1328 | if len(aninstance.name) == 0: |
---|
1329 | raise RuntimeError(file.name + ': no sequence header at line ' + str(linenum) + '?') |
---|
1330 | aninstance.sequence += line.strip() |
---|
1331 | |
---|
1332 | if len(aninstance.name) and len(aninstance.sequence): |
---|
1333 | items.append(aninstance) |
---|
1334 | |
---|
1335 | return items |
---|
1336 | ########################################################################## |
---|
1337 | |
---|
1338 | def version_string (): |
---|
1339 | return "sff_extract " + __version__ |
---|
1340 | |
---|
1341 | def read_config(): |
---|
1342 | '''It reads the configuration options from the command line arguments and |
---|
1343 | it returns a dict with them.''' |
---|
1344 | from optparse import OptionParser, OptionGroup |
---|
1345 | usage = "usage: %prog [options] sff1 sff2 ..." |
---|
1346 | desc = "Extract sequences from 454 SFF files into FASTA, FASTA quality"\ |
---|
1347 | " and XML traceinfo format. When a paired-end linker sequence"\ |
---|
1348 | " is given (-l), use SSAHA2 to scan the sequences for the linker,"\ |
---|
1349 | " then split the sequences, removing the linker." |
---|
1350 | parser = OptionParser(usage = usage, version = version_string(), description = desc) |
---|
1351 | parser.add_option('-a', '--append', action="store_true", dest='append', |
---|
1352 | help='append output to existing files', default=False) |
---|
1353 | parser.add_option('-i', '--xml_info', dest='xml_info', |
---|
1354 | help='extra info to write in the xml file') |
---|
1355 | parser.add_option("-l", "--linker_file", dest="pelinker_fname", |
---|
1356 | help="FASTA file with paired-end linker sequences", metavar="FILE") |
---|
1357 | |
---|
1358 | group = OptionGroup(parser, "File name options","") |
---|
1359 | group.add_option('-c', '--clip', action="store_true", dest='clip', |
---|
1360 | help='clip (completely remove) ends with low qual and/or adaptor sequence', default=False) |
---|
1361 | group.add_option('-u', '--upper_case', action="store_false", dest='mix_case', |
---|
1362 | help='all bases in upper case, including clipped ends', default=True) |
---|
1363 | group.add_option('', '--min_left_clip', dest='min_leftclip', |
---|
1364 | metavar="INTEGER", type = "int", |
---|
1365 | help='if the left clip coming from the SFF is smaller than this value, override it', default=0) |
---|
1366 | group.add_option('-Q', '--fastq', action="store_true", dest='want_fastq', |
---|
1367 | help='store as FASTQ file instead of FASTA + FASTA quality file', default=False) |
---|
1368 | parser.add_option_group(group) |
---|
1369 | |
---|
1370 | group = OptionGroup(parser, "File name options","") |
---|
1371 | group.add_option("-o", "--out_basename", dest="basename", |
---|
1372 | help="base name for all output files") |
---|
1373 | group.add_option("-s", "--seq_file", dest="seq_fname", |
---|
1374 | help="output sequence file name", metavar="FILE") |
---|
1375 | group.add_option("-q", "--qual_file", dest="qual_fname", |
---|
1376 | help="output quality file name", metavar="FILE") |
---|
1377 | group.add_option("-x", "--xml_file", dest="xml_fname", |
---|
1378 | help="output ancillary xml file name", metavar="FILE") |
---|
1379 | parser.add_option_group(group) |
---|
1380 | |
---|
1381 | #default fnames |
---|
1382 | #is there an sff file? |
---|
1383 | basename = 'reads' |
---|
1384 | if sys.argv[-1][-4:].lower() == '.sff': |
---|
1385 | basename = sys.argv[-1][:-4] |
---|
1386 | def_seq_fname = basename + '.fasta' |
---|
1387 | def_qual_fname = basename + '.fasta.qual' |
---|
1388 | def_xml_fname = basename + '.xml' |
---|
1389 | def_pelinker_fname = '' |
---|
1390 | parser.set_defaults(seq_fname = def_seq_fname) |
---|
1391 | parser.set_defaults(qual_fname = def_qual_fname) |
---|
1392 | parser.set_defaults(xml_fname = def_xml_fname) |
---|
1393 | parser.set_defaults(pelinker_fname = def_pelinker_fname) |
---|
1394 | |
---|
1395 | #we parse the cmd line |
---|
1396 | (options, args) = parser.parse_args() |
---|
1397 | |
---|
1398 | #we put the result in a dict |
---|
1399 | global config |
---|
1400 | config = {} |
---|
1401 | for property in dir(options): |
---|
1402 | if property[0] == '_' or property in ('ensure_value', 'read_file', |
---|
1403 | 'read_module'): |
---|
1404 | continue |
---|
1405 | config[property] = getattr(options, property) |
---|
1406 | |
---|
1407 | if config['basename'] is None: |
---|
1408 | config['basename']=basename |
---|
1409 | |
---|
1410 | #if we have not set a file name with -s, -q or -x we set the basename |
---|
1411 | #based file name |
---|
1412 | if config['want_fastq']: |
---|
1413 | config['qual_fname'] = '' |
---|
1414 | if config['seq_fname'] == def_seq_fname: |
---|
1415 | config['seq_fname'] = config['basename'] + '.fastq' |
---|
1416 | else: |
---|
1417 | if config['seq_fname'] == def_seq_fname: |
---|
1418 | config['seq_fname'] = config['basename'] + '.fasta' |
---|
1419 | if config['qual_fname'] == def_qual_fname: |
---|
1420 | config['qual_fname'] = config['basename'] + '.fasta.qual' |
---|
1421 | |
---|
1422 | if config['xml_fname'] == def_xml_fname: |
---|
1423 | config['xml_fname'] = config['basename'] + '.xml' |
---|
1424 | |
---|
1425 | #we parse the extra info for the xml file |
---|
1426 | config['xml_info'] = parse_extra_info(config['xml_info']) |
---|
1427 | return config, args |
---|
1428 | |
---|
1429 | |
---|
1430 | |
---|
1431 | ########################################################################## |
---|
1432 | |
---|
1433 | |
---|
1434 | def testsome(): |
---|
1435 | sys.exit() |
---|
1436 | return |
---|
1437 | |
---|
1438 | |
---|
1439 | def debug(): |
---|
1440 | try: |
---|
1441 | dummy = 1 |
---|
1442 | #debug() |
---|
1443 | #testsome() |
---|
1444 | |
---|
1445 | config, args = read_config() |
---|
1446 | load_linker_sequences(config['pelinker_fname']) |
---|
1447 | |
---|
1448 | #pid = os.getpid() |
---|
1449 | pid = 15603 |
---|
1450 | |
---|
1451 | #tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta' |
---|
1452 | #tmpfasta_fh = open(tmpfasta_fname, 'w') |
---|
1453 | tmpfasta_fname = 'FLVI58L05.fa' |
---|
1454 | tmpfasta_fh = open(tmpfasta_fname, 'r') |
---|
1455 | |
---|
1456 | tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2' |
---|
1457 | tmpssaha_fh = open(tmpssaha_fname, 'w') |
---|
1458 | |
---|
1459 | launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh) |
---|
1460 | |
---|
1461 | tmpssaha_fh = open("sffe.tmp.15603.ssaha2", 'r') |
---|
1462 | read_ssaha_data(tmpssaha_fh) |
---|
1463 | |
---|
1464 | sys.exit() |
---|
1465 | |
---|
1466 | extract_reads_from_sff(config, args) |
---|
1467 | |
---|
1468 | except (OSError, IOError, RuntimeError), errval: |
---|
1469 | print errval |
---|
1470 | sys.exit() |
---|
1471 | |
---|
1472 | sys.exit() |
---|
1473 | |
---|
1474 | |
---|
1475 | def main(): |
---|
1476 | |
---|
1477 | argv = sys.argv |
---|
1478 | if len(argv) == 1: |
---|
1479 | sys.argv.append('-h') |
---|
1480 | read_config() |
---|
1481 | sys.exit() |
---|
1482 | try: |
---|
1483 | #debug(); |
---|
1484 | |
---|
1485 | config, args = read_config() |
---|
1486 | |
---|
1487 | if config['pelinker_fname']: |
---|
1488 | #tests_for_ssaha(config['pelinker_fname']) |
---|
1489 | load_linker_sequences(config['pelinker_fname']) |
---|
1490 | if len(args) == 0: |
---|
1491 | raise RuntimeError("No SFF file given?") |
---|
1492 | extract_reads_from_sff(config, args) |
---|
1493 | except (OSError, IOError, RuntimeError), errval: |
---|
1494 | print errval |
---|
1495 | return 1 |
---|
1496 | |
---|
1497 | if stern_warning: |
---|
1498 | return 1 |
---|
1499 | |
---|
1500 | return 0 |
---|
1501 | |
---|
1502 | |
---|
1503 | |
---|
1504 | if __name__ == "__main__": |
---|
1505 | sys.exit(main()) |
---|