root/galaxy-central/tools/filters/sff_extract.py

リビジョン 2, 51.1 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/python
2'''This software extracts the seq, qual and ancillary information from an sff
3file, like the ones used by the 454 sequencer.
4
5Optionally, it can also split paired-end reads if given the linker sequence.
6The splitting is done with maximum match, i.e., every occurence of the linker
7sequence will be removed, even if occuring multiple times.'''
8
9#copyright Jose Blanca and Bastien Chevreux
10#COMAV institute, Universidad Politecnica de Valencia (UPV)
11#Valencia, Spain
12
13# additions to handle paired end reads by Bastien Chevreux
14# bugfixes for linker specific lengths: Lionel Guy
15
16#This program is free software: you can redistribute it and/or modify
17#it under the terms of the GNU General Public License as published by
18#the Free Software Foundation, either version 3 of the License, or
19#(at your option) any later version.
20#This program is distributed in the hope that it will be useful,
21#but WITHOUT ANY WARRANTY; without even the implied warranty of
22#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23#GNU General Public License for more details.
24#You should have received a copy of the GNU General Public License
25#along with this program.  If not, see <http://www.gnu.org/licenses/>.
26
27__author__ = 'Jose Blanca and Bastien Chevreux'
28__copyright__ = 'Copyright 2008, Jose Blanca, COMAV, and Bastien Chevreux'
29__license__ = 'GPLv3 or later'
30__version__ = '0.2.8'
31__email__ = 'jblanca@btc.upv.es'
32__status__ = 'beta'
33
34import struct
35import sys
36import os
37import subprocess
38import tempfile
39
40
41fake_sff_name = 'fake_sff_name'
42
43
44# readname as key: lines with matches from SSAHA, one best match
45ssahapematches = {}
46# linker readname as key: length of linker sequence
47linkerlengths = {}
48
49# set to true if something really fishy is going on with the sequences
50stern_warning = True
51
52def read_bin_fragment(struct_def, fileh, offset=0, data=None,
53                                                             byte_padding=None):
54    '''It reads a chunk of a binary file.
55
56    You have to provide the struct, a file object, the offset (where to start
57    reading).
58    Also you can provide an optional dict that will be populated with the
59    extracted data.
60    If a byte_padding is given the number of bytes read will be a multiple of
61    that number, adding the required pad at the end.
62    It returns the number of bytes reads and the data dict.
63    '''
64    if data is None:
65        data = {}
66
67    #we read each item
68    bytes_read = 0
69    for item in struct_def:
70        #we go to the place and read
71        fileh.seek(offset + bytes_read)
72        n_bytes = struct.calcsize(item[1])
73        buffer = fileh.read(n_bytes)
74        read = struct.unpack('>' + item[1], buffer)
75        if len(read) == 1:
76            read = read[0]
77        data[item[0]] = read
78        bytes_read += n_bytes
79
80    #if there is byte_padding the bytes_to_read should be a multiple of the
81    #byte_padding
82    if byte_padding is not None:
83        pad = byte_padding
84        bytes_read = ((bytes_read + pad - 1) // pad) * pad
85
86    return (bytes_read, data)
87
88
89def check_magic(magic):
90    '''It checks that the magic number of the file matches the sff magic.'''
91    if magic != 779314790:
92        raise RuntimeError('This file does not seems to be an sff file.')
93
94def check_version(version):
95    '''It checks that the version is supported, otherwise it raises an error.'''
96    supported = ('\x00', '\x00', '\x00', '\x01')
97    i = 0
98    for item in version:
99        if version[i] != supported[i]:
100            raise RuntimeError('SFF version not supported. Please contact the author of the software.')
101        i += 1
102
103def read_header(fileh):
104    '''It reads the header from the sff file and returns a dict with the
105    information'''
106    #first we read the first part of the header
107    head_struct = [
108        ('magic_number', 'I'),
109        ('version', 'cccc'),
110        ('index_offset', 'Q'),
111        ('index_length', 'I'),
112        ('number_of_reads', 'I'),
113        ('header_length', 'H'),
114        ('key_length', 'H'),
115        ('number_of_flows_per_read', 'H'),
116        ('flowgram_format_code', 'B'),
117    ]
118    data = {}
119    first_bytes, data = read_bin_fragment(struct_def=head_struct, fileh=fileh,
120                                                            offset=0, data=data)
121    check_magic(data['magic_number'])
122    check_version(data['version'])
123    #now that we know the number_of_flows_per_read and the key_length
124    #we can read the second part of the header
125    struct2 = [
126        ('flow_chars', str(data['number_of_flows_per_read']) + 'c'),
127        ('key_sequence', str(data['key_length']) + 'c')
128    ]
129    read_bin_fragment(struct_def=struct2, fileh=fileh, offset=first_bytes,
130                                                                      data=data)
131    return data
132
133
134def read_sequence(header, fileh, fposition):
135    '''It reads one read from the sff file located at the fposition and
136    returns a dict with the information.'''
137    header_length = header['header_length']
138    index_offset = header['index_offset']
139    index_length = header['index_length']
140
141    #the sequence struct
142    read_header_1 = [
143        ('read_header_length', 'H'),
144        ('name_length', 'H'),
145        ('number_of_bases', 'I'),
146        ('clip_qual_left', 'H'),
147        ('clip_qual_right', 'H'),
148        ('clip_adapter_left', 'H'),
149        ('clip_adapter_right', 'H'),
150    ]
151    def read_header_2(name_length):
152        '''It returns the struct definition for the second part of the header'''
153        return [('name', str(name_length) +'c')]
154    def read_data(number_of_bases):
155        '''It returns the struct definition for the read data section.'''
156        #size = {'c': 1, 'B':1, 'H':2, 'I':4, 'Q':8}
157        if header['flowgram_format_code'] == 1:
158            flow_type = 'H'
159        else:
160            raise Error('file version not supported')
161        number_of_bases = str(number_of_bases)
162        return [
163            ('flowgram_values', str(header['number_of_flows_per_read']) +
164                                                                     flow_type),
165            ('flow_index_per_base', number_of_bases + 'B'),
166            ('bases', number_of_bases + 'c'),
167            ('quality_scores', number_of_bases + 'B'),
168        ]
169
170    data = {}
171    #we read the first part of the header
172    bytes_read, data = read_bin_fragment(struct_def=read_header_1,
173                                    fileh=fileh, offset=fposition, data=data)
174
175    read_bin_fragment(struct_def=read_header_2(data['name_length']),
176                          fileh=fileh, offset=fposition + bytes_read, data=data)
177    #we join the letters of the name
178    data['name'] = ''.join(data['name'])
179    offset = data['read_header_length']
180    #we read the sequence and the quality
181    read_data_st = read_data(data['number_of_bases'])
182    bytes_read, data = read_bin_fragment(struct_def=read_data_st,
183                                    fileh=fileh, offset=fposition + offset,
184                                    data=data, byte_padding=8)
185    #we join the bases
186    data['bases'] = ''.join(data['bases'])
187
188    #print data
189    #print "pre cqr: ", data['clip_qual_right']
190    #print "pre car: ", data['clip_adapter_right']
191    #print "pre cql: ", data['clip_qual_left']
192    #print "pre cal: ", data['clip_adapter_left']
193
194    # correct for the case the right clip is <= than the left clip
195    # in this case, left clip is 0 are set to 0 (right clip == 0 means
196    #  "whole sequence")
197    if data['clip_qual_right'] <= data['clip_qual_left'] :
198        data['clip_qual_right'] = 0
199        data['clip_qual_left'] = 0
200    if data['clip_adapter_right'] <= data['clip_adapter_left'] :
201        data['clip_adapter_right'] = 0
202        data['clip_adapter_left'] = 0
203
204    #the clipping section follows the NCBI's guidelines Trace Archive RFC
205    #http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=rfc&m=doc&s=rfc
206    #if there's no adapter clip: qual -> vector
207    #else:  qual-> qual
208    #       adapter -> vector
209
210    if not data['clip_adapter_left']:
211        data['clip_adapter_left'], data['clip_qual_left'] = data['clip_qual_left'], data['clip_adapter_left']
212    if not data['clip_adapter_right']:
213        data['clip_adapter_right'], data['clip_qual_right'] = data['clip_qual_right'], data['clip_adapter_right']
214
215    # see whether we have to override the minimum left clips
216    if config['min_leftclip'] > 0:
217        if data['clip_adapter_left'] >0 and data['clip_adapter_left'] < config['min_leftclip']:
218            data['clip_adapter_left'] = config['min_leftclip']
219        if data['clip_qual_left'] >0 and data['clip_qual_left'] < config['min_leftclip']:
220            data['clip_qual_left'] = config['min_leftclip']
221
222       
223    #print "post cqr: ", data['clip_qual_right']
224    #print "post car: ", data['clip_adapter_right']
225    #print "post cql: ", data['clip_qual_left']
226    #print "post cal: ", data['clip_adapter_left']
227
228
229    # for handling the -c (clip) option gently, we already clip here
230    #  and set all clip points to the sequence end points
231    if config['clip']:
232        data['bases'], data['quality_scores'] = clip_read(data)
233
234        data['number_of_bases']=len(data['bases'])
235        data['clip_qual_right'] = data['number_of_bases']
236        data['clip_adapter_right'] = data['number_of_bases']
237        data['clip_qual_left'] = 0
238        data['clip_adapter_left'] = 0
239       
240    return data['read_header_length'] + bytes_read, data
241
242
243def sequences(fileh, header):
244    '''It returns a generator with the data for each read.'''
245    #now we can read all the sequences
246    fposition = header['header_length']    #position in the file
247    reads_read = 0
248    while True:
249        if fposition == header['index_offset']:
250            #we have to skip the index section
251            fposition += index_length
252            continue
253        else:
254            bytes_read, seq_data = read_sequence(header=header, fileh=fileh,
255                                                            fposition=fposition)
256            yield seq_data
257            fposition += bytes_read
258            reads_read += 1
259            if reads_read >= header['number_of_reads']:
260                break
261
262
263def remove_last_xmltag_in_file(fname, tag=None):
264    '''Given an xml file name and a tag, it removes the last tag of the
265    file if it matches the given tag. Tag removal is performed via file
266    truncation.
267   
268    It the given tag is not the last in the file, a RunTimeError will be
269    raised.
270
271    The resulting xml file will be not xml valid. This function is a hack
272    that allows to append records to xml files in a quick and dirty way.
273    '''
274
275    fh = open(fname, 'r+')
276    #we have to read from the end to the start of the file and keep the
277    #string enclosed by </ >
278    i = -1
279    last_tag = []   #the chars that form the last tag
280    start_offset = None     #in which byte does the last tag starts?
281    end_offset = None     #in which byte does the last tag ends?
282    while True:
283        fh.seek(i, 2)
284        char = fh.read(1)
285        if not char.isspace():
286            last_tag.append(char)
287        if char == '>':
288            end_offset = i
289        if char == '<':
290            start_offset = i
291            break
292        i -= 1
293
294    #we have read the last tag backwards
295    last_tag = ''.join(last_tag[::-1])
296    #we remove the </ and >
297    last_tag = last_tag.rstrip('>').lstrip('</')
298   
299    #we check that we're removing the asked tag
300    if tag is not None and tag != last_tag:
301        raise RuntimeError("The given xml tag wasn't the last one in the file")
302
303    # while we are at it: also remove all white spaces in that line :-)
304    i -= 1
305    while True:
306        fh.seek(i, 2)
307        char = fh.read(1)
308        if not char == ' ' and not char == '\t':
309            break;
310        if fh.tell() == 1:
311            break;
312        i -= 1
313
314    fh.truncate();
315
316    fh.close()
317    return last_tag
318
319
320def create_basic_xml_info(readname, fname):
321    '''Formats a number of read specific infos into XML format.
322    Currently formated: name and the tags set from command line
323    '''
324    to_print = ['    <trace>\n']
325    to_print.append('        <trace_name>')
326    to_print.append(readname)
327    to_print.append('</trace_name>\n')
328
329    #extra information
330    #do we have extra info for this file?
331    info = None
332    if config['xml_info']:
333        #with this name?
334        if fname in config['xml_info']:
335            info = config['xml_info'][fname]
336        else:
337        #with no name?
338            try:
339                info = config['xml_info'][fake_sff_name]
340            except KeyError:
341                pass
342    #we print the info that we have
343    if info:
344        for key in info:
345            to_print.append('        <' + key + '>' + info[key] + \
346                            '</' + key +'>\n')
347
348    return ''.join(to_print)
349
350
351def create_clip_xml_info(readlen, adapl, adapr, quall, qualr):
352    '''Takes the clip values of the read and formats them into XML
353    Corrects "wrong" values that might have resulted through
354    simplified calculations earlier in the process of conversion
355    (especially during splitting of paired-end reads)
356    '''
357
358    to_print = [""]
359
360    # if right borders are >= to read length, they don't need
361    #  to be printed
362    if adapr >= readlen:
363        adapr = 0
364    if qualr >= readlen:
365        qualr = 0
366
367    # BaCh
368    # when called via split_paired_end(), some values may be < 0
369    #  (when clip values were 0 previously)
370    # instead of putting tons of if clauses for different calculations there,
371    #  I centralise corrective measure here
372    # set all values <0 to 0
373
374    if adapr < 0:
375        adapr = 0
376    if qualr <0:
377        qualr = 0
378    if adapl < 0:
379        adapl = 0
380    if quall <0:
381        quall = 0
382
383    if quall:
384        to_print.append('        <clip_quality_left>')
385        to_print.append(str(quall))
386        to_print.append('</clip_quality_left>\n')
387    if qualr:
388        to_print.append('        <clip_quality_right>')
389        to_print.append(str(qualr))
390        to_print.append('</clip_quality_right>\n')
391    if adapl:
392        to_print.append('        <clip_vector_left>')
393        to_print.append(str(adapl))
394        to_print.append('</clip_vector_left>\n')
395    if adapr:
396        to_print.append('        <clip_vector_right>')
397        to_print.append(str(adapr))
398        to_print.append('</clip_vector_right>\n')
399    return ''.join(to_print)
400
401
402def create_xml_for_unpaired_read(data, fname):
403    '''Given the data for one read it returns an str with the xml ancillary
404    data.'''
405    to_print = [create_basic_xml_info(data['name'],fname)]
406    #clippings in the XML only if we do not hard clip
407    if not config['clip']:
408        to_print.append(create_clip_xml_info(data['number_of_bases'],data['clip_adapter_left'], data['clip_adapter_right'], data['clip_qual_left'], data['clip_qual_right']));
409    to_print.append('    </trace>\n')
410    return ''.join(to_print)
411
412
413def format_as_fasta(name,seq,qual):
414    name_line = ''.join(('>', name,'\n'))
415    seqstring = ''.join((name_line, seq, '\n'))
416    qual_line = ' '.join([str(q) for q in qual])
417    qualstring = ''.join((name_line, qual_line, '\n'))
418    return seqstring, qualstring
419
420def format_as_fastq(name,seq,qual):
421    qual_line = ''.join([chr(q+33) for q in qual])
422    #seqstring = ''.join(('@', name,'\n', seq, '\n+', name,'\n', qual_line, '\n'))
423    seqstring = ''.join(('@', name,'\n', seq, '\n+\n', qual_line, '\n'))
424    return seqstring
425
426
427def get_read_data(data):
428    '''Given the data for one read it returns 2 strs with the fasta seq
429    and fasta qual.'''
430    #seq and qual
431    if config['mix_case']:
432        seq = sequence_case(data)
433        qual = data['quality_scores']
434    else :
435        seq = data['bases']
436        qual = data['quality_scores']
437
438    return seq, qual
439
440def extract_read_info(data, fname):
441    '''Given the data for one read it returns 3 strs with the fasta seq, fasta
442    qual and xml ancillary data.'''
443
444    seq,qual = get_read_data(data)
445    seqstring, qualstring = format_as_fasta(data['name'],seq,qual)
446
447    #name_line = ''.join(('>', data['name'],'\n'))
448    #seq = ''.join((name_line, seq, '\n'))
449    #qual_line = ' '.join([str(q) for q in qual])
450    #qual = ''.join((name_line, qual_line, '\n'))
451
452    xmlstring = create_xml_for_unpaired_read(data, fname)
453
454    return seqstring, qualstring, xmlstring
455
456def write_sequence(name,seq,qual,seq_fh,qual_fh):
457    '''Write sequence and quality FASTA and FASTA qual filehandles
458    (or into FASTQ and XML)
459    if sequence length is 0, don't write'''
460
461    if len(seq) == 0 : return
462
463    if qual_fh is None:
464        seq_fh.write(format_as_fastq(name,seq,qual))
465    else:
466        seqstring, qualstring = format_as_fasta(name,seq,qual)
467        seq_fh.write(seqstring)
468        qual_fh.write(qualstring)
469    return
470
471def write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh):
472    '''Writes an unpaired read into FASTA, FASTA qual and XML filehandles
473    (or into FASTQ and XML)
474    if sequence length is 0, don't write'''
475
476    seq,qual = get_read_data(data)
477    if len(seq) == 0 : return
478
479    write_sequence(data['name'],seq,qual,seq_fh,qual_fh)
480
481    anci = create_xml_for_unpaired_read(data, sff_fh.name)
482    if anci is not None:
483        xml_fh.write(anci)
484    return
485
486
487def reverse_complement(seq):
488    '''Returns the reverse complement of a DNA sequence as string'''
489
490    compdict = {
491        'a': 't',
492        'c': 'g',
493        'g': 'c',
494        't': 'a',
495        'u': 't',
496        'm': 'k',
497        'r': 'y',
498        'w': 'w',
499        's': 's',
500        'y': 'r',
501        'k': 'm',
502        'v': 'b',
503        'h': 'd',
504        'd': 'h',
505        'b': 'v',
506        'x': 'x',
507        'n': 'n',
508        'A': 'T',
509        'C': 'G',
510        'G': 'C',
511        'T': 'A',
512        'U': 'T',
513        'M': 'K',
514        'R': 'Y',
515        'W': 'W',
516        'S': 'S',
517        'Y': 'R',
518        'K': 'M',
519        'V': 'B',
520        'H': 'D',
521        'D': 'H',
522        'B': 'V',
523        'X': 'X',
524        'N': 'N',
525        '*': '*'
526        }
527
528    complseq = ''.join([compdict[base] for base in seq])
529    # python hack to reverse a list/string/etc
530    complseq = complseq[::-1]
531    return complseq
532
533
534def mask_sequence(seq, maskchar, fpos, tpos):
535    '''Given a sequence, mask it with maskchar starting at fpos (including) and
536    ending at tpos (excluding)
537    '''
538
539    if len(maskchar) > 1:
540        raise RuntimeError("Internal error: more than one character given to mask_sequence")
541    if fpos<0:
542        fpos = 0
543    if tpos > len(seq):
544        tpos = len(seq)
545
546    newseq = ''.join((seq[:fpos],maskchar*(tpos-fpos), seq[tpos:]))
547
548    return newseq
549
550
551def fragment_sequences(sequence, qualities, splitchar):
552    '''Works like split() on strings, except it does this on a sequence
553    and the corresponding list with quality values.
554    Returns a tuple for each fragment, each sublist has the fragment
555    sequence as first and the fragment qualities as second elemnt'''
556
557    # this is slow (due to zip and list appends... use an iterator over
558    #  the sequence find find variations and splices on seq and qual
559
560    if len(sequence) != len(qualities):
561        print sequence, qualities
562        raise RuntimeError("Internal error: length of sequence and qualities don't match???")
563
564    retlist = ([])
565    if len(sequence) == 0:
566        return retlist
567
568    actseq = ([])
569    actqual = ([])
570    if sequence[0] != splitchar:
571        inseq = True
572    else:
573        inseq = False
574    for char,qual in zip(sequence,qualities):
575        if inseq:
576            if char != splitchar:
577                actseq.append(char)
578                actqual.append(qual)
579            else:
580                retlist.append((''.join(actseq), actqual))
581                actseq = ([])
582                actqual = ([])
583                inseq = False
584        else:
585            if char != splitchar:
586                inseq = True
587                actseq.append(char)
588                actqual.append(qual)
589
590    if inseq and len(actseq):
591        retlist.append((''.join(actseq), actqual))
592       
593    return retlist
594
595
596def calc_subseq_boundaries(maskedseq, maskchar):
597    '''E.g.:
598       ........xxxxxxxx..........xxxxxxxxxxxxxxxxxxxxx.........
599       to
600         (0,8),(8,16),(16,26),(26,47),(47,56)
601    '''
602
603    blist = ([])
604    if len(maskedseq) == 0:
605        return blist
606
607    inmask = True
608    if maskedseq[0] != maskchar:
609        inmask = False
610
611    start = 0
612    for spos in range(len(maskedseq)):
613        if inmask and maskedseq[spos] != maskchar:
614            blist.append(([start,spos]))
615            start = spos
616            inmask = False
617        elif not inmask and maskedseq[spos] == maskchar:
618            blist.append(([start,spos]))
619            start = spos
620            inmask = True
621
622    blist.append(([start,spos+1]))
623
624    return blist
625
626
627def correct_for_smallhits(maskedseq, maskchar, linkername):
628    '''If partial hits were found, take preventive measure: grow
629        the masked areas by 20 bases in each direction
630       Returns either unchanged "maskedseq" or a new sequence
631        with some more characters masked.
632    '''
633    global linkerlengths
634
635    CEBUG = 0
636
637    if CEBUG : print "correct_for_smallhits"
638    if CEBUG : print "Masked seq\n", maskedseq
639    if CEBUG : print "Linkername: ", linkername
640   
641    if len(maskedseq) == 0:
642        return maskedseq
643
644    growl=40
645    growl2=growl/2
646
647    boundaries = calc_subseq_boundaries(maskedseq,maskchar)
648    if CEBUG : print "Boundaries: ", boundaries
649
650    foundpartial = False
651    for bounds in boundaries:
652        if CEBUG : print "\tbounds: ", bounds
653        left, right = bounds
654        if left != 0 and right != len(maskedseq):
655            if maskedseq[left] == maskchar:
656                # allow 10% discrepancy
657                #    -linkerlengths[linkername]/10
658                # that's a kind of safety net if there are slight sequencing
659                #  errors in the linker itself
660                if right-left < linkerlengths[linkername]-linkerlengths[linkername]/10:
661                    if CEBUG : print "\t\tPartial: found " + str(right-left) + " gaps, " + linkername + " is " + str(linkerlengths[linkername]) + " nt long."
662                    foundpartial = True
663
664    if not foundpartial:
665        return maskedseq
666
667    # grow
668    newseq = ""
669    for bounds in boundaries:
670        if CEBUG : print "Bounds: ", bounds
671        left, right = bounds
672        if maskedseq[left] == maskchar:
673            newseq += maskedseq[left:right]
674        else:
675            clearstart = 0
676            if left > 0 :
677                clearstart = left+growl2
678            clearstop = len(maskedseq)
679            if right < len(maskedseq):
680                clearstop = right-growl2
681
682            if CEBUG : print "clearstart, clearstop: ",clearstart, clearstop
683
684            if clearstop <= clearstart:
685                newseq += maskchar * (right-left)
686            else:
687                if clearstart != left:
688                    newseq += maskchar * growl2
689                newseq += maskedseq[clearstart:clearstop]
690                if clearstop != right:
691                    newseq += maskchar * growl2
692           
693        #print "newseq\n",newseq
694
695    return newseq
696
697
698def split_paired_end(data, sff_fh, seq_fh, qual_fh, xml_fh):
699    '''Splits a paired end read and writes sequences into FASTA, FASTA qual
700    and XML traceinfo file. Returns the number of sequences created.
701
702    As the linker sequence may be anywhere in the read, including the ends
703    and overlapping with bad quality sequence, we need to perform some
704    computing and eventually set new clip points.
705
706    If the resulting split yields only one sequence (because linker
707    was not present or overlapping with left or right clip), only one
708    sequence will be written with ".fn" appended to the name.
709
710    If the read can be split, two reads will be written. The side left of
711    the linker will be named ".r" and will be written in reverse complement
712    into the file to conform with what approximately all assemblers expect
713    when reading paired-end data: reads in forward direction in file. The side
714    right of the linker will be named ".f"
715
716    If SSAHA found partial linker (linker sequences < length of linker),
717    the sequences will get a "_pl" furthermore be cut back thoroughly.
718
719    If SSAHA found multiple occurences of the linker, the names will get an
720    additional "_mlc" within the name to show that there was "multiple
721    linker contamination".
722
723    For multiple or partial linker, the "good" parts of the reads are
724    stored with a ".part<number>" name, additionally they will not get
725    template information in the XML
726    '''
727
728    global ssahapematches
729
730    CEBUG = 0
731
732    maskchar = "#"
733
734    if CEBUG : print "Need to split: " + data['name']
735
736    numseqs = 0;
737    readname = data['name']
738    readlen = data['number_of_bases']
739
740    leftclip, rightclip = return_merged_clips(data)
741    seq, qual = get_read_data(data)
742
743    if CEBUG : print "Original read:\n",seq
744   
745    maskedseq = seq
746    if leftclip > 0:
747        maskedseq = mask_sequence(maskedseq, maskchar, 0, leftclip-1)
748    if rightclip < len(maskedseq):
749        maskedseq = mask_sequence(maskedseq, maskchar, rightclip, len(maskedseq))
750   
751    leftclip, rightclip = return_merged_clips(data)
752    readlen = data['number_of_bases']
753   
754    if CEBUG : print "Readname:", readname
755    if CEBUG : print "Readlen:", readlen
756    if CEBUG : print "Num matches:", str(len(ssahapematches[data['name']]))
757    if CEBUG : print "matches:", ssahapematches[data['name']]
758
759    for match in ssahapematches[data['name']]:
760        score = int(match[0])
761        linkername = match[2]
762        leftreadhit = int(match[3])
763        rightreadhit = int(match[4])
764        #leftlinkerhit = int(match[5])
765        #rightlinkerhit = int(match[6])
766        #direction = match[7]
767        #hitlen = int(match[8])
768        #hitidentity = float(match[9])
769
770        if CEBUG : print match
771        if CEBUG : print "Match with score:", score
772        if CEBUG : print "Read before:\n", maskedseq
773        maskedseq = mask_sequence(maskedseq, maskchar, leftreadhit-1, rightreadhit)
774        if CEBUG : print "Masked seq:\n", maskedseq
775
776    correctedseq = correct_for_smallhits(maskedseq, maskchar, linkername)
777
778    if len(maskedseq) != len(correctedseq):
779        raise RuntimeError("Internal error: maskedseq != correctedseq")
780
781    partialhits = False
782    if correctedseq != maskedseq:
783        if CEBUG : print "Partial hits in", readname
784        if CEBUG : print "Original seq:\n", seq
785        if CEBUG : print "Masked seq:\n", maskedseq
786        if CEBUG : print "Corrected seq\n", correctedseq
787        partialhits = True
788        readname += "_pl"
789        maskedseq = correctedseq
790
791    fragments = fragment_sequences(maskedseq, qual, maskchar)
792
793    if CEBUG : print "Fragments (", len(fragments), "): ", fragments
794
795    mlcflag = False
796    #if len(ssahapematches[data['name']]) > 1:
797    #    #print "Multi linker contamination"
798    #    mlcflag = True
799    #    readname += "_mlc"
800
801    if len(fragments) > 2:
802        if CEBUG : print "Multi linker contamination"
803        mlcflag = True
804        readname += "_mlc"
805
806
807    #print fragments
808    if mlcflag or partialhits:
809        fragcounter = 1
810        readname += ".part"
811        for frag in fragments:
812            actseq = frag[0]
813            if len(actseq) >= 20:
814                actqual = frag[1]
815                oname = readname + str(fragcounter)
816                #seq_fh.write(">"+oname+"\n")
817                #seq_fh.write(actseq+"\n")
818                #qual_fh.write(">"+oname+"\n")
819                #qual_fh.write(' '.join((str(q) for q in actqual)))
820                #qual_fh.write("\n")
821                write_sequence(oname,actseq,actqual,seq_fh,qual_fh)
822                to_print = [create_basic_xml_info(oname,sff_fh.name)]
823                # No clipping in XML ... the multiple and partial fragments
824                #  are clipped "hard"
825                # No template ID and trace_end: we don't know the
826                #  orientation of the frahments. Even if it were
827                #  only two, the fact we had multiple linkers
828                #  says something went wrong, so simply do not
829                #  write any paired-end information for all these fragments
830                to_print.append('    </trace>\n')
831                xml_fh.write(''.join(to_print))
832                numseqs += 1
833                fragcounter += 1
834    else:
835        if len(fragments) >2:
836            raise RuntimeError("Unexpected: more than two fragments detected in " + readname + ". please contact the authors.")
837        # nothing will happen for 0 fragments
838        if len(fragments) == 1:
839            #print "Tada1"
840            boundaries = calc_subseq_boundaries(maskedseq,maskchar)
841            if len(boundaries) < 1 or len(boundaries) >3:
842                raise RuntimeError("Unexpected case: ", str(len(boundaries)), "boundaries for 1 fragment of ", readname)
843            if len(boundaries) == 3:
844                # case: mask char on both sides of sequence
845                #print "bounds3"
846                data['clip_adapter_left']=1+boundaries[0][1]
847                data['clip_adapter_right']=boundaries[2][0]
848            elif len(boundaries) == 2:
849                # case: mask char left or right of sequence
850                #print "bounds2",
851                if maskedseq[0] == maskchar :
852                    # case: mask char left
853                    #print "left"
854                    data['clip_adapter_left']=1+boundaries[0][1]
855                else:
856                    # case: mask char right
857                    #print "right"
858                    data['clip_adapter_right']=boundaries[1][0]
859            data['name'] = data['name'] + ".fn"
860            write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh)
861            numseqs = 1
862        elif len(fragments) == 2:
863            #print "Tada2"
864            oname = readname + ".r"
865            seq, qual = get_read_data(data)
866
867            startsearch = False
868            for spos in range(len(maskedseq)):
869                if maskedseq[spos] != maskchar:
870                    startsearch = True;
871                else:
872                    if startsearch:
873                        break
874
875            #print "\nspos: ", spos
876            lseq=seq[:spos]
877            #print "lseq:", lseq
878            actseq = reverse_complement(lseq)
879            lreadlen = len(actseq)
880            lqual = qual[:spos];
881            # python hack to reverse a list/string/etc
882            lqual = lqual[::-1];
883
884            #seq_fh.write(">"+oname+"\n")
885            #seq_fh.write(actseq+"\n")
886            #qual_fh.write(">"+oname+"\n")
887            #qual_fh.write(' '.join((str(q) for q in lqual)))
888            #qual_fh.write("\n")
889
890            write_sequence(oname,actseq,lqual,seq_fh,qual_fh)
891
892            to_print = [create_basic_xml_info(oname,sff_fh.name)]
893            to_print.append(create_clip_xml_info(lreadlen, 0, lreadlen+1-data['clip_adapter_left'], 0, lreadlen+1-data['clip_qual_left']));
894            to_print.append('        <template_id>')
895            to_print.append(readname)
896            to_print.append('</template_id>\n')
897            to_print.append('        <trace_end>r</trace_end>\n')
898            to_print.append('    </trace>\n')
899            xml_fh.write(''.join(to_print))
900
901            oname = readname + ".f"
902            startsearch = False
903            for spos in range(len(maskedseq)-1,-1,-1):
904                if maskedseq[spos] != maskchar:
905                    startsearch = True;
906                else:
907                    if startsearch:
908                        break
909
910            actseq = seq[spos+1:]
911            actqual = qual[spos+1:];
912       
913            #print "\nspos: ", spos
914            #print "rseq:", actseq
915
916            #seq_fh.write(">"+oname+"\n")
917            #seq_fh.write(actseq+"\n")
918            #qual_fh.write(">"+oname+"\n")
919            #qual_fh.write(' '.join((str(q) for q in actqual)))
920            #qual_fh.write("\n")
921            write_sequence(oname,actseq,actqual,seq_fh,qual_fh)
922           
923            rreadlen = len(actseq)
924            to_print = [create_basic_xml_info(oname,sff_fh.name)]
925            to_print.append(create_clip_xml_info(rreadlen, 0, rreadlen-(readlen-data['clip_adapter_right']), 0, rreadlen-(readlen-data['clip_qual_right'])));
926            to_print.append('        <template_id>')
927            to_print.append(readname)
928            to_print.append('</template_id>\n')
929            to_print.append('        <trace_end>f</trace_end>\n')
930            to_print.append('    </trace>\n')
931            xml_fh.write(''.join(to_print))
932            numseqs = 2
933
934    return numseqs
935
936
937
938def extract_reads_from_sff(config, sff_files):
939    '''Given the configuration and the list of sff_files it writes the seqs,
940    qualities and ancillary data into the output file(s).
941
942    If file for paired-end linker was given, first extracts all sequences
943    of an SFF and searches these against the linker(s) with SSAHA2 to
944    create needed information to split reads.
945    '''
946
947    global ssahapematches
948
949   
950    if len(sff_files) == 0 :
951        raise RuntimeError("No SFF file given?")
952
953    #we go through all input files
954    for sff_file in sff_files:
955        if not os.path.getsize(sff_file):
956            raise RuntimeError('Empty file? : ' + sff_file)
957        fh = open(sff_file, 'r')
958        fh.close()
959
960    openmode = 'w'
961    if config['append']:
962        openmode = 'a'
963
964    seq_fh = open(config['seq_fname'], openmode)
965    xml_fh = open(config['xml_fname'], openmode)
966    if config['want_fastq']:
967        qual_fh = None
968        try:
969            os.remove(config['qual_fname'])
970        except :
971            python_formattingwithoutbracesisdumb_dummy = 1
972    else:
973        qual_fh = open(config['qual_fname'], openmode)
974
975    if not config['append']:
976        xml_fh.write('<?xml version="1.0"?>\n<trace_volume>\n')
977    else:
978        remove_last_xmltag_in_file(config['xml_fname'], "trace_volume")
979
980    #we go through all input files
981    for sff_file in sff_files:
982        #print "Working on '" + sff_file + "':"
983        ssahapematches.clear()
984
985        seqcheckstore = ([])
986
987        debug = 0
988
989        if not debug and config['pelinker_fname']:
990            #print "Creating temporary sequences from reads in '" + sff_file + "' ... ",
991            sys.stdout.flush()
992
993            if 0 :
994                # for debugging
995                pid = os.getpid()
996                tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta'
997                tmpfasta_fh = open(tmpfasta_fname, 'w')
998            else:
999                tmpfasta_fh = tempfile.NamedTemporaryFile(prefix = 'sffeseqs_',
1000                                                          suffix = '.fasta')
1001
1002            sff_fh = open(sff_file, 'rb')
1003            header_data = read_header(fileh=sff_fh)
1004            for seq_data in sequences(fileh=sff_fh, header=header_data):
1005                seq,qual = get_read_data(seq_data)
1006                seqstring, qualstring = format_as_fasta(seq_data['name'],seq,qual)
1007                tmpfasta_fh.write(seqstring)
1008                #seq, qual, anci = extract_read_info(seq_data, sff_fh.name)
1009                #tmpfasta_fh.write(seq)
1010            #print "done."
1011            tmpfasta_fh.seek(0)
1012
1013            if 0 :
1014                # for debugging
1015                tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2'
1016                tmpssaha_fh = open(tmpssaha_fname, 'w+')
1017            else:
1018                tmpssaha_fh = tempfile.NamedTemporaryFile(prefix = 'sffealig_',
1019                                                          suffix = '.ssaha2')
1020
1021            launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh)
1022            tmpfasta_fh.close()
1023
1024            tmpssaha_fh.seek(0)
1025            read_ssaha_data(tmpssaha_fh)
1026            tmpssaha_fh.close()
1027
1028        if debug:
1029            tmpssaha_fh = open("sffe.tmp.10634.ssaha2", 'r')
1030            read_ssaha_data(tmpssaha_fh)
1031
1032        #print "Converting '" + sff_file + "' ... ",
1033        sys.stdout.flush()
1034        sff_fh = open(sff_file, 'rb')
1035        #read_header(infile)
1036        header_data = read_header(fileh=sff_fh)
1037
1038        #now convert all reads
1039        nseqs_sff = 0
1040        nseqs_out = 0
1041        for seq_data in sequences(fileh=sff_fh, header=header_data):
1042            nseqs_sff += 1
1043
1044            seq, qual = clip_read(seq_data)
1045            seqcheckstore.append(seq[0:50])
1046
1047            #if nseqs_sff >1000:
1048            #    check_for_dubious_startseq(seqcheckstore,sff_file,seq_data)
1049            #    sys.exit()
1050
1051            if ssahapematches.has_key(seq_data['name']):
1052                #print "Paired end:",seq_data['name']
1053                nseqs_out += split_paired_end(seq_data, sff_fh, seq_fh, qual_fh, xml_fh)
1054            else:
1055                #print "Normal:",seq_data['name']
1056                if config['pelinker_fname']:
1057                    seq_data['name'] = seq_data['name'] + ".fn"
1058                write_unpaired_read(seq_data, sff_fh, seq_fh, qual_fh, xml_fh)
1059                nseqs_out += 1
1060        #print "done."
1061        #print 'Converted', str(nseqs_sff), 'reads into', str(nseqs_out), 'sequences.'
1062        sff_fh.close()
1063
1064        check_for_dubious_startseq(seqcheckstore,sff_file,seq_data)
1065        seqcheckstore = ([])
1066
1067    xml_fh.write('</trace_volume>\n')
1068
1069    xml_fh.close()
1070    seq_fh.close()
1071    if qual_fh is not None:
1072        qual_fh.close()
1073
1074    return
1075
1076def check_for_dubious_startseq(seqcheckstore, sffname,seqdata):
1077
1078    global stern_warning
1079
1080    foundproblem = ""
1081    for checklen in range(1,len(seqcheckstore[0])):
1082        foundinloop = False
1083        seqdict = {}
1084        for seq in seqcheckstore:
1085            shortseq = seq[0:checklen]
1086            if shortseq in seqdict:
1087                seqdict[shortseq] += 1
1088            else:
1089                seqdict[shortseq] = 1
1090
1091        for shortseq, count in seqdict.items():
1092            if float(count)/len(seqcheckstore) >= 0.5:
1093                foundinloop = True
1094                stern_warning
1095                foundproblem = "\n"+"*" * 80
1096                foundproblem += "\nWARNING: "
1097                foundproblem += "weird sequences in file " + sffname + "\n\n"
1098                foundproblem += "After applying left clips, " + str(count) + " sequences (="
1099                foundproblem += '%.0f'%(100.0*float(count)/len(seqcheckstore))
1100                foundproblem += "%) start with these bases:\n" + shortseq
1101                foundproblem += "\n\nThis does not look sane.\n\n"
1102                foundproblem += "Countermeasures you *probably* must take:\n"
1103                foundproblem += "1) Make your sequence provider aware of that problem and ask whether this can be\n    corrected in the SFF.\n"
1104                foundproblem += "2) If you decide that this is not normal and your sequence provider does not\n    react, use the --min_left_clip of sff_extract.\n"
1105                left,right = return_merged_clips(seqdata)
1106                foundproblem += "    (Probably '--min_left_clip="+ str(left+len(shortseq))+"' but you should cross-check that)\n"
1107                foundproblem += "*" * 80 + "\n"
1108        if not foundinloop :
1109            break
1110    if len(foundproblem):
1111        print foundproblem
1112           
1113
1114def parse_extra_info(info):
1115    '''It parses the information that will go in the xml file.
1116
1117    There are two formats accepted for the extra information:
1118    key1:value1, key2:value2
1119    or:
1120    file1.sff{key1:value1, key2:value2};file2.sff{key3:value3}
1121    '''
1122    if not info:
1123        return info
1124    finfos = info.split(';')    #information for each file
1125    data_for_files = {}
1126    for finfo in finfos:
1127        #we split the file name from the rest
1128        items = finfo.split('{')
1129        if len(items) == 1:
1130            fname = fake_sff_name
1131            info = items[0]
1132        else:
1133            fname = items[0]
1134            info = items[1]
1135        #now we get each key,value pair in the info
1136        info = info.replace('}', '')
1137        data = {}
1138        for item in info.split(','):
1139            key, value = item.strip().split(':')
1140            key = key.strip()
1141            value = value.strip()
1142            data[key] = value
1143        data_for_files[fname] = data
1144    return data_for_files
1145
1146
1147def return_merged_clips(data):
1148    '''It returns the left and right positions to clip.'''
1149    def max(a, b):
1150        '''It returns the max of the two given numbers.
1151
1152        It won't take into account the zero values.
1153        '''
1154        if not a and not b:
1155            return None
1156        if not a:
1157            return b
1158        if not b:
1159            return a
1160        if a >= b:
1161            return a
1162        else:
1163            return b
1164    def min(a, b):
1165        '''It returns the min of the two given numbers.
1166
1167        It won't take into account the zero values.
1168        '''
1169        if not a and not b:
1170            return None
1171        if not a:
1172            return b
1173        if not b:
1174            return a
1175        if a <= b:
1176            return a
1177        else:
1178            return b
1179    left = max(data['clip_adapter_left'], data['clip_qual_left'])
1180    right = min(data['clip_adapter_right'], data['clip_qual_right'])
1181    #maybe both clips where zero
1182    if left is None:
1183        left = 1
1184    if right is None:
1185        right = data['number_of_bases']
1186    return left, right
1187
1188def sequence_case(data):
1189    '''Given the data for one read it returns the seq with mixed case.
1190
1191    The regions to be clipped will be lower case and the rest upper case.
1192    '''
1193    left, right = return_merged_clips(data)
1194    seq = data['bases']
1195    new_seq = ''.join((seq[:left-1].lower(), seq[left-1:right], seq[right:].lower()))
1196    return new_seq
1197
1198def clip_read(data):
1199    '''Given the data for one read it returns clipped seq and qual.'''
1200
1201    qual = data['quality_scores']
1202    left, right = return_merged_clips(data)
1203    seq = data['bases']
1204    qual = data['quality_scores']
1205    new_seq = seq[left-1:right]
1206    new_qual = qual[left-1:right]
1207
1208    return new_seq, new_qual
1209
1210
1211
1212def tests_for_ssaha(linker_fname):
1213    '''Tests whether SSAHA2 can be successfully called.'''
1214   
1215    try:
1216        print "Testing whether SSAHA2 is installed and can be launched ... ",
1217        sys.stdout.flush()
1218        fh = open('/dev/null', 'w')
1219        retcode = subprocess.call(["ssaha2", "-v"], stdout = fh)
1220        fh.close()
1221        print "ok."
1222    except :
1223        print "nope? Uh oh ...\n\n"
1224        raise RuntimeError('Could not launch ssaha2. Have you installed it? Is it in your path?')
1225
1226
1227def load_linker_sequences(linker_fname):
1228    '''Loads all linker sequences into memory, storing only the length
1229    of each linker.'''
1230   
1231    global linkerlengths
1232
1233    if not os.path.getsize(linker_fname):
1234        raise RuntimeError("File empty? '" + linker_fname + "'")
1235    fh = open(linker_fname, 'r')
1236    linkerseqs = read_fasta(fh)
1237    if len(linkerseqs) == 0:
1238        raise RuntimeError(linker_fname + ": no sequence found?")
1239    for i in linkerseqs:
1240        if linkerlengths.has_key(i.name):
1241            raise RuntimeError(linker_fname + ": sequence '" + i.name + "' present multiple times. Aborting.")
1242        linkerlengths[i.name] = len(i.sequence)
1243    fh.close()
1244
1245
1246def launch_ssaha(linker_fname, query_fname, output_fh):
1247    '''Launches SSAHA2 on the linker and query file, string SSAHA2 output
1248    into the output filehandle'''
1249
1250    try:
1251        print "Searching linker sequences with SSAHA2 (this may take a while) ... ",
1252        sys.stdout.flush()
1253        retcode = subprocess.call(["ssaha2", "-output", "ssaha2", "-solexa", "-kmer", "4", "-skip", "1", linker_fname, query_fname], stdout = output_fh)
1254        if retcode:
1255            raise RuntimeError('Ups.')
1256        else:
1257            print "ok."
1258    except:
1259        print "\n"
1260        raise RuntimeError('An error occured during the SSAHA2 execution, aborting.')
1261
1262def read_ssaha_data(ssahadata_fh):
1263    '''Given file handle, reads file generated with SSAHA2 (with default
1264    output format) and stores all matches as list ssahapematches
1265    (ssaha paired-end matches) dictionary'''
1266
1267    global ssahapematches
1268
1269    print "Parsing SSAHA2 result file ... ",
1270    sys.stdout.flush()
1271
1272    for line in ssahadata_fh:
1273        if line.startswith('ALIGNMENT'):
1274            ml = line.split()
1275            if len(ml) != 12 :
1276                print "\n", line,
1277                raise RuntimeError('Expected 12 elements in the SSAHA2 line with ALIGMENT keyword, but found ' + str(len(ml)))
1278            if not ssahapematches.has_key(ml[2]) :
1279                ssahapematches[ml[2]] = ([])
1280            if ml[8] == 'F':
1281                #print line,
1282
1283                # store everything except the first element (output
1284                #  format name (ALIGNMENT)) and the last element
1285                #  (length)
1286                ssahapematches[ml[2]].append(ml[1:-1])
1287            else:
1288                #print ml
1289                ml[4],ml[5] = ml[5],ml[4]
1290                #print ml
1291                ssahapematches[ml[2]].append(ml[1:-1])
1292
1293    print "done."
1294
1295
1296##########################################################################
1297#
1298# BaCh: This block was shamelessly copied from
1299#  http://python.genedrift.org/2007/07/04/reading-fasta-files-conclusion/
1300# and then subsequently modified to read fasta correctly
1301# It's still not fool proof, but should be good enough
1302#
1303##########################################################################
1304
1305class Fasta:
1306    def __init__(self, name, sequence):
1307        self.name = name
1308        self.sequence = sequence
1309
1310def read_fasta(file):
1311    items = []
1312    aninstance = Fasta('', '')
1313    linenum = 0
1314    for line in file:
1315        linenum += 1
1316        if line.startswith(">"):
1317            if len(aninstance.sequence):
1318                items.append(aninstance)
1319                aninstance = Fasta('', '')
1320            # name == all characters until the first whitespace
1321            #  (split()[0]) but without the starting ">" ([1:])
1322            aninstance.name = line.split()[0][1:]
1323            aninstance.sequence = ''
1324            if len(aninstance.name) == 0:
1325                raise RuntimeError(file.name + ': no name in line ' + str(linenum) + '?')
1326               
1327        else:
1328            if len(aninstance.name) == 0:
1329                raise RuntimeError(file.name + ': no sequence header at line ' + str(linenum) + '?')
1330            aninstance.sequence += line.strip()
1331
1332    if len(aninstance.name) and len(aninstance.sequence):
1333        items.append(aninstance)
1334
1335    return items
1336##########################################################################
1337
1338def version_string ():
1339    return "sff_extract " + __version__
1340
1341def read_config():
1342    '''It reads the configuration options from the command line arguments and
1343    it returns a dict with them.'''
1344    from optparse import OptionParser, OptionGroup
1345    usage = "usage: %prog [options] sff1 sff2 ..."
1346    desc = "Extract sequences from 454 SFF files into FASTA, FASTA quality"\
1347           " and XML traceinfo format. When a paired-end linker sequence"\
1348           " is given (-l), use SSAHA2 to scan the sequences for the linker,"\
1349           " then split the sequences, removing the linker."
1350    parser = OptionParser(usage = usage, version = version_string(), description = desc)
1351    parser.add_option('-a', '--append', action="store_true", dest='append',
1352            help='append output to existing files', default=False)
1353    parser.add_option('-i', '--xml_info', dest='xml_info',
1354            help='extra info to write in the xml file')
1355    parser.add_option("-l", "--linker_file", dest="pelinker_fname",
1356            help="FASTA file with paired-end linker sequences", metavar="FILE")
1357
1358    group = OptionGroup(parser, "File name options","")
1359    group.add_option('-c', '--clip', action="store_true", dest='clip',
1360                     help='clip (completely remove) ends with low qual and/or adaptor sequence', default=False)
1361    group.add_option('-u', '--upper_case', action="store_false", dest='mix_case',
1362                      help='all bases in upper case, including clipped ends', default=True)
1363    group.add_option('', '--min_left_clip', dest='min_leftclip',
1364                     metavar="INTEGER", type = "int",
1365                     help='if the left clip coming from the SFF is smaller than this value, override it', default=0)
1366    group.add_option('-Q', '--fastq', action="store_true", dest='want_fastq',
1367                      help='store as FASTQ file instead of FASTA + FASTA quality file', default=False)
1368    parser.add_option_group(group)
1369
1370    group = OptionGroup(parser, "File name options","")
1371    group.add_option("-o", "--out_basename", dest="basename",
1372            help="base name for all output files")
1373    group.add_option("-s", "--seq_file", dest="seq_fname",
1374            help="output sequence file name", metavar="FILE")
1375    group.add_option("-q", "--qual_file", dest="qual_fname",
1376            help="output quality file name", metavar="FILE")
1377    group.add_option("-x", "--xml_file", dest="xml_fname",
1378            help="output ancillary xml file name", metavar="FILE")
1379    parser.add_option_group(group)
1380
1381    #default fnames
1382    #is there an sff file?
1383    basename = 'reads'
1384    if sys.argv[-1][-4:].lower() == '.sff':
1385        basename = sys.argv[-1][:-4]
1386    def_seq_fname = basename + '.fasta'
1387    def_qual_fname = basename + '.fasta.qual'
1388    def_xml_fname = basename + '.xml'
1389    def_pelinker_fname = ''
1390    parser.set_defaults(seq_fname = def_seq_fname)
1391    parser.set_defaults(qual_fname = def_qual_fname)
1392    parser.set_defaults(xml_fname = def_xml_fname)
1393    parser.set_defaults(pelinker_fname = def_pelinker_fname)
1394
1395    #we parse the cmd line
1396    (options, args) = parser.parse_args()
1397
1398    #we put the result in a dict
1399    global config
1400    config = {}
1401    for property in dir(options):
1402        if property[0] == '_' or property in ('ensure_value', 'read_file',
1403                                                                'read_module'):
1404            continue
1405        config[property] = getattr(options, property)
1406
1407    if config['basename'] is None:
1408        config['basename']=basename
1409
1410    #if we have not set a file name with -s, -q or -x we set the basename
1411    #based file name
1412    if config['want_fastq']:
1413        config['qual_fname'] = ''
1414        if config['seq_fname'] == def_seq_fname:
1415            config['seq_fname'] = config['basename'] + '.fastq'
1416    else:
1417        if config['seq_fname'] == def_seq_fname:
1418            config['seq_fname'] = config['basename'] + '.fasta'
1419        if config['qual_fname'] == def_qual_fname:
1420            config['qual_fname'] = config['basename'] + '.fasta.qual'
1421
1422    if config['xml_fname'] == def_xml_fname:
1423        config['xml_fname'] = config['basename'] + '.xml'
1424
1425    #we parse the extra info for the xml file
1426    config['xml_info'] = parse_extra_info(config['xml_info'])
1427    return config, args
1428
1429
1430
1431##########################################################################
1432
1433
1434def testsome():
1435    sys.exit()
1436    return
1437
1438
1439def debug():
1440    try:
1441        dummy = 1
1442        #debug()
1443        #testsome()
1444
1445        config, args = read_config()
1446        load_linker_sequences(config['pelinker_fname'])
1447
1448        #pid = os.getpid()
1449        pid = 15603
1450
1451        #tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta'
1452        #tmpfasta_fh = open(tmpfasta_fname, 'w')
1453        tmpfasta_fname = 'FLVI58L05.fa'
1454        tmpfasta_fh = open(tmpfasta_fname, 'r')
1455
1456        tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2'
1457        tmpssaha_fh = open(tmpssaha_fname, 'w')
1458
1459        launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh)
1460
1461        tmpssaha_fh = open("sffe.tmp.15603.ssaha2", 'r')   
1462        read_ssaha_data(tmpssaha_fh)
1463
1464        sys.exit()
1465
1466        extract_reads_from_sff(config, args)
1467
1468    except (OSError, IOError, RuntimeError), errval:
1469        print errval
1470        sys.exit()
1471
1472    sys.exit()
1473
1474
1475def main():
1476
1477    argv = sys.argv
1478    if len(argv) == 1:
1479        sys.argv.append('-h')
1480        read_config()
1481        sys.exit()
1482    try:
1483        #debug();
1484
1485        config, args = read_config()
1486
1487        if config['pelinker_fname']:
1488            #tests_for_ssaha(config['pelinker_fname'])
1489            load_linker_sequences(config['pelinker_fname'])
1490        if len(args) == 0:
1491            raise RuntimeError("No SFF file given?")
1492        extract_reads_from_sff(config, args)
1493    except (OSError, IOError, RuntimeError), errval:
1494        print errval
1495        return 1
1496
1497    if stern_warning:
1498        return 1
1499
1500    return 0
1501
1502
1503
1504if __name__ == "__main__":
1505        sys.exit(main())
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。