root/galaxy-central/tools/sr_mapping/lastz_paired_reads_wrapper.py

Revision 2, 42.8 KB (committer: hatakeyama, 14 years ago)

import galaxy-central

Line numbers
1#!/usr/bin/env python
2
3"""
4Runs Lastz paired read alignment process
5Written for Lastz v. 1.02.00.
6
7# Author(s): based on various scripts written by Bob Harris (rsharris@bx.psu.edu),
8# then tweaked to this form by Greg Von Kuster (greg@bx.psu.edu)
9
10This tool takes the following input:
11a. A collection of 454 paired end reads ( a fasta file )
12b. A linker sequence ( a very small fasta file )
13c. A reference genome ( nib, 2bit or fasta )
14
15and uses the following process:
161. Split reads into mates:  the input to this step is the read file XXX.fasta, and the output is three
17   files; XXX.short.fasta, XXX.long.fasta and XXX.mapping.  The mapping file records the information necessary
18   to convert mate coordinates back into the original read, which is needed later in the process.
19
202. Align short mates to the reference: this runs lastz against every chromosome.  The input is XXX.short.fasta
21   and the reference genome, and the output is a SAM file, XXX.short.sam.
22
233. Align long mates to the reference: this runs lastz against every chromosome.  The input is XXX.long.fasta
24   and the reference genome, and the output is a SAM file, XXX.long.sam.
25
264. Combine, and convert mate coordinates back to read coordinates.  The input is XXX.mapping, XXX.short.sam and
27   XXX.long.sam, and the output is XXX.sam.
28
29usage: lastz_paired_reads_wrapper.py [options]
30    --ref_name: The reference name to change all output matches to
31    --ref_source: The reference is cached or from the history
32    --source_select: Use pre-set or cached reference file
33    --input1: The name of the reference file if using history or reference base name if using cached
34    --input2: The reads file to align
35    --input3: The sequencing linker file
36    --input4: The base quality score 454 file
37    --ref_sequences: The number of sequences in the reference file if using one from history
38    --output: The name of the output file
39    --lastz_seqs_file_dir: Directory of local lastz_seqs.loc file
40"""
41import optparse, os, subprocess, shutil, sys, tempfile, time
42from string import maketrans
43
44from galaxy import eggs
45import pkg_resources
46pkg_resources.require( 'bx-python' )
47from bx.seq.twobit import *
48from bx.seq.fasta import FastaReader
49from galaxy.util.bunch import Bunch
50from galaxy.util import string_as_bool
51
# Column indexes for SAM required fields (0-based positions of the 11
# mandatory columns in a SAM alignment line)
SAM_QNAME_COLUMN = 0
SAM_FLAG_COLUMN  = 1
SAM_RNAME_COLUMN = 2
SAM_POS_COLUMN   = 3
SAM_MAPQ_COLUMN  = 4
SAM_CIGAR_COLUMN = 5
SAM_MRNM_COLUMN  = 6
SAM_MPOS_COLUMN  = 7
SAM_ISIZE_COLUMN = 8
SAM_SEQ_COLUMN   = 9
SAM_QUAL_COLUMN  = 10
SAM_MIN_COLUMNS  = 11
# SAM bit-encoded flags
BAM_FPAIRED      =    1    # the read is paired in sequencing, no matter whether it is mapped in a pair
BAM_FPROPER_PAIR =    2    # the read is mapped in a proper pair
BAM_FUNMAP       =    4    # the read itself is unmapped; conflictive with BAM_FPROPER_PAIR
BAM_FMUNMAP      =    8    # the mate is unmapped
BAM_FREVERSE     =   16    # the read is mapped to the reverse strand
BAM_FMREVERSE    =   32    # the mate is mapped to the reverse strand
BAM_FREAD1       =   64    # this is read1
BAM_FREAD2       =  128    # this is read2
BAM_FSECONDARY   =  256    # not primary alignment
BAM_FQCFAIL      =  512    # QC failure
BAM_FDUP         = 1024    # optical or PCR duplicate

# Keep track of all created temporary files so they can be deleted
# NOTE(review): 'global' statements at module level are no-ops; they only
# have an effect inside a function body.
global tmp_file_names
tmp_file_names = []
# The values in the skipped_lines dict are tuples consisting of:
# - the number of skipped lines for that error
# If not a sequence error:
# - the 1st line number on which the error was found
# - the text of the 1st line on which the error was found
# If a sequence error:
# - The number of the sequence in the file
# - the sequence name on which the error occurred
# We may need to improve dealing with file position and text as
# much of it comes from temporary files that are created from the
# inputs, and not the inputs themselves, so this could be confusing
# to the user.
# NOTE(review): the values are tuples, which are immutable - any update
# must replace the whole ( count, position, text ) entry rather than
# assign to its elements.
global skipped_lines
skipped_lines = dict( bad_interval=( 0, 0, '' ),
                      inconsistent_read_lengths=( 0, 0, '' ),
                      inconsistent_reads=( 0, 0, '' ),
                      inconsistent_sizes=( 0, 0, '' ),
                      missing_mate=( 0, 0, '' ),
                      missing_quals=( 0, 0, '' ),
                      missing_seq=( 0, 0, '' ),
                      multiple_seqs=( 0, 0, '' ),
                      no_header=( 0, 0, '' ),
                      num_fields=( 0, 0, '' ),
                      reads_paired=( 0, 0, '' ),
                      sam_flag=( 0, 0, '' ),
                      sam_headers=( 0, 0, '' ),
                      sam_min_columns=( 0, 0, '' ),
                      two_mate_names=( 0, 0, '' ),
                      wrong_seq_len=( 0, 0, '' ) )
# Running total across all error categories in skipped_lines.
global total_skipped_lines
total_skipped_lines = 0
112
def stop_err( msg ):
    """Report a fatal error message on standard error and terminate."""
    sys.stderr.write( "%s" % msg )
    sys.exit()
116
def skip_line( error_key, position, text ):
    """
    Record one skipped input line under *error_key* in the module-level
    skipped_lines dict and bump the global total_skipped_lines counter.

    Only the first offending position/text per error category is kept;
    later occurrences just increment that category's count.
    """
    # The stored values are tuples, so build a replacement entry instead
    # of assigning to elements (the original in-place mutation raised a
    # TypeError), and declare the counter global (the original bare
    # 'total_skipped_lines += 1' raised UnboundLocalError).
    global total_skipped_lines
    count, first_position, first_text = skipped_lines[ error_key ]
    if not first_text:
        # Remember where this error category was first seen.
        first_position, first_text = position, text
    skipped_lines[ error_key ] = ( count + 1, first_position, first_text )
    total_skipped_lines += 1
123
def get_tmp_file_name( dir=None, suffix=None ):
    """
    Return a unique temporary file name that can be managed.  The
    file must be manually removed after use.

    The name is appended to the module-level tmp_file_names list so all
    temporary files can be cleaned up later.
    """
    # mkstemp accepts dir=None directly; suffix must be a string, so map
    # None to ''.  Collapsing the original if/elif chain also fixes the
    # case where neither argument was supplied, which previously left
    # tmp_fd/tmp_name unbound and raised UnboundLocalError.
    tmp_fd, tmp_name = tempfile.mkstemp( dir=dir, suffix=suffix or '' )
    os.close( tmp_fd )
    tmp_file_names.append( tmp_name )
    return tmp_name
138
def run_command( command ):
    """
    Run *command* through the shell; if it wrote anything to standard
    error, abort the script (via stop_err) with that text as the message.
    """
    proc = subprocess.Popen( args=command, shell=True, stderr=subprocess.PIPE )
    # communicate() drains the stderr pipe while waiting for the child to
    # exit.  The original wait()-then-read() sequence could deadlock when
    # a child filled the OS pipe buffer before exiting (and also called
    # wait() twice, redundantly).
    stderr = proc.communicate()[1]
    if stderr:
        stop_err( stderr )
146
def split_paired_reads( input2, combined_linker_file_name ):
    """
    Given a fasta file of allegedly paired end reads ( input2 ), and a list of intervals
    showing where the linker is on each read ( combined_linker_file_name ), split the reads
    into left and right halves.

    The input intervals look like this.  Note that they may include multiple intervals for
    the same read ( which should overlap ), and we use the union of them as the linker
    interval.  Non-overlaps are reported to the user, and those reads are not processed.
    Starts are origin zero.

        #name     strand start len size
        FG3OYDA05FTEES +   219  42 283
        FG3OYDA05FVOLL +   263  41 416

    The output gives each half-sequence on a separate line, like this.  This allows easy
    sorting of the sequences by length, after the fact.

        219 FG3OYDA05FTEES_L TTTAGTTACACTTAACTCACTTCCATCCTCTAAATACGTGATTACCTTTC...
        22  FG3OYDA05FTEES_R CCTTCCTTAAGTCCTAAAACTG

    Returns a 4-tuple of temporary file names:
    ( mates mapping, all mates, short mates fasta, long mates fasta ).
    """
    # Bob says these should be hard-coded.
    seq_len_lower_threshold = 17
    short_mate_cutoff = 50
    # We need to pass the name of this file back to the caller.
    tmp_mates_file_name = get_tmp_file_name( suffix='mates.txt' )
    mates_file = open( tmp_mates_file_name, 'w+b' )
    # Read the linker intervals
    combined_linker_file = open( combined_linker_file_name, 'rb' )
    read_to_linker_dict = {}
    i = 0
    for i, line in enumerate( combined_linker_file ):
        line = line.strip()
        if line.startswith( "#" ):
            continue
        if line.find( '#' ) >= 0:
            # Strip a trailing comment.
            line = line.split( "#", 1 )[0].rstrip()
        fields = line.split()
        if len( fields ) != 4:
            skip_line( 'num_fields', i+1, line )
            continue
        name, start, length, size = fields
        start = int( start )
        length = int( length )
        size = int( size )
        end = start + length
        if end > size:
            # Linker interval extends past the end of the read.  The
            # original code subscripted the skip_line function here
            # ( skip_line[ 'bad_interval' ] += 1 ), which raised a
            # TypeError; record the skip properly instead.
            skip_line( 'bad_interval', i+1, line )
            continue
        if name not in read_to_linker_dict:
            read_to_linker_dict[ name ] = ( start, end, size )
            continue
        if read_to_linker_dict[ name ] is None:
            # Read previously marked as non-overlapping intervals, so skip this sequence - see below
            continue
        ( s, e, sz ) = read_to_linker_dict[ name ]
        if sz != size:
            skip_line( 'inconsistent_sizes', i+1, name )
            continue
        if s > end or e < start:
            # Non-overlapping intervals, so skip this sequence
            read_to_linker_dict[ name ] = None
            continue
        # Multiple overlapping intervals: use their union as the linker.
        read_to_linker_dict[ name ] = ( min( s, start ), max( e, end ), size )
    combined_linker_file.close()
    # We need to pass the name of this file back to the caller.
    tmp_mates_mapping_file_name = get_tmp_file_name( suffix='mates.mapping' )
    mates_mapping_file = open( tmp_mates_mapping_file_name, 'w+b' )
    # Process the sequences
    seqs = 0
    fasta_reader = FastaReader( open( input2, 'rb' ) )
    while True:
        seq = fasta_reader.next()
        if not seq:
            break
        seqs += 1
        if seq.name not in read_to_linker_dict:
            # No linker interval: keep the whole read ( if long enough ) as an unpaired mate.
            if seq.length > seq_len_lower_threshold:
                mates_file.write( "%-3d %s   %s\n" % ( seq.length, seq.name, seq.text ) )
            read_to_linker_dict[ seq.name ] = ""
            continue
        if read_to_linker_dict[ seq.name ] == "":
            skip_line( 'multiple_seqs', seqs, seq.name )
            continue
        if read_to_linker_dict[ seq.name ] is None:
            # Read previously marked as non-overlapping intervals, so skip this sequence - see above
            continue
        ( start, end, size ) = read_to_linker_dict[ seq.name ]
        if seq.length != size:
            skip_line( 'wrong_seq_len', seqs, seq.name )
            continue
        left = seq.text[ :start ]
        right = seq.text[ end: ]
        left_is_small = len( left ) <= seq_len_lower_threshold
        right_is_small = len( right ) <= seq_len_lower_threshold
        if left_is_small and right_is_small:
            continue
        if not left_is_small:
            mates_file.write( "%-3d %s %s\n" % ( len( left ), seq.name + "_L", left ) )
            mates_mapping_file.write( "%s %s %s %s\n" % ( seq.name + "_L", seq.name, 0, size - start ) )
        if not right_is_small:
            mates_file.write( "%-3d %s %s\n" % ( len( right ), seq.name + "_R", right ) )
            mates_mapping_file.write( "%s %s %s %s\n" % ( seq.name + "_R", seq.name, end, 0 ) )
        # Mark the read as handled so duplicate sequences are detected.
        read_to_linker_dict[ seq.name ] = ""
    # NOTE: the original closed combined_linker_file a second time here; that
    # redundant close has been dropped.
    mates_file.close()
    mates_mapping_file.close()
    # Partition the mates into short and long sets, written as fasta.
    tmp_mates_short_file_name = get_tmp_file_name( suffix='mates.short' )
    tmp_mates_long_file_name = get_tmp_file_name( suffix='mates.long' )
    tmp_mates_short = open( tmp_mates_short_file_name, 'w+b' )
    tmp_mates_long = open( tmp_mates_long_file_name, 'w+b' )
    for line in open( tmp_mates_file_name, 'rb' ):
        fields = line.split()
        seq_len = int( fields[0] )
        seq_name = fields[1]
        seq_text = fields[2]
        if seq_len <= short_mate_cutoff:
            tmp_mates_short.write( ">%s\n%s\n" % ( seq_name, seq_text ) )
        else:
            tmp_mates_long.write( ">%s\n%s\n" % ( seq_name, seq_text ) )
    tmp_mates_short.close()
    tmp_mates_long.close()
    return tmp_mates_mapping_file_name, tmp_mates_file_name, tmp_mates_short_file_name, tmp_mates_long_file_name
277
def align_mates( input1, ref_source, ref_name, ref_sequences, tmp_mates_short_file_name, tmp_mates_long_file_name ):
    """
    Run lastz to align both the short-mate and long-mate fasta files
    against the reference, and return the list of temporary output file
    names produced ( softsam-format alignments, one pair of files per
    history reference, or one pair per chromosome for a cached 2bit ).

    input1: reference file - a fasta dataset from the history when
        ref_source is 'history', otherwise a locally cached 2bit file.
    ref_name: text interpolated into the lastz target expression right
        after [unmask].  NOTE(review): presumably a lastz action string
        ( e.g. a nickname ) or empty - confirm against the caller that
        builds these arguments.
    ref_sequences: accepted but not used by this function.
    """
    tmp_align_file_names = []
    if ref_source == 'history':
        # Reference is a fasta dataset from the history
        # Create temporary files to contain the output from lastz executions
        tmp_short_file_name = get_tmp_file_name( suffix='short_out' )
        tmp_align_file_names.append( tmp_short_file_name )
        tmp_long_file_name = get_tmp_file_name( suffix='long_out' )
        tmp_align_file_names.append( tmp_long_file_name )
        seqs = 0
        fasta_reader = FastaReader( open( input1 ) )
        while True:
            # Read the next sequence from the reference dataset.  Note that if the reference contains
            # a small number of chromosomes this loop is ok, but in many cases the genome has a bunch
            # of small straggler scaffolds and contigs and it is a computational waste to do each one
            # of these in its own run.  There is an I/O down side to running by subsets (even if they are
            # one sequence per subset), compared to splitting the reference into sizes of 250 mb.  With
            # the subset action, lastz still has to read and parse the entire file for every run (this
            # is true for fasta, but for .2bit files it can access each sequence directly within the file,
            # so the overhead is minimal).
            """
            :> output_file  (this creates the output file, empty)
            while there are more sequences to align
                find the next sequences that add up to 250M, put their names in farf.names
                lastz ${refFile}[subset=farf.names][multi][unmask] ${matesPath}/${matesFile} ...
                  >> output_file
            """
            seq = fasta_reader.next()
            if not seq:
                break
            seqs += 1
            # Create a temporary file to contain the current sequence as input to lastz.
            # We're doing this a bit differently here since we could be generating a huge
            # number of temporary files.
            tmp_in_fd, tmp_in_file_name = tempfile.mkstemp( suffix='seq_%d_in' % seqs )
            tmp_in_file = os.fdopen( tmp_in_fd, 'w+b' )
            tmp_in_file.write( '>%s\n%s\n' % ( seq.name, seq.text ) )
            tmp_in_file.close()
            # Align short mates
            command = 'lastz %s[unmask]%s %s ' % ( tmp_in_file_name, ref_name, tmp_mates_short_file_name )
            command += 'Z=1 --seed=1111111011111 --notrans --maxwordcount=90% --match=1,3 O=1 E=3 X=15 K=10 Y=12 L=18 --ambiguousn --noytrim --identity=95 --coverage=80 --continuity=95 --format=softsam- '
            # Append ( >> ) because every reference sequence writes into the same output file.
            command += '>> %s' % tmp_short_file_name
            run_command( command )
            # Align long mates
            command = 'lastz %s[unmask]%s %s ' % ( tmp_in_file_name, ref_name, tmp_mates_long_file_name )
            command += 'Z=15 W=13 --notrans --exact=18 --maxwordcount=90% --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --noytrim --identity=95 --coverage=90 --continuity=95 --format=softsam- '
            command += '>> %s' % tmp_long_file_name
            run_command( command )
            # Remove the temporary file that contains the current sequence
            os.remove( tmp_in_file_name )
    else:
        # Reference is a locally cached 2bit file, split lastz calls across number of chroms in 2bit file
        tbf = TwoBitFile( open( input1, 'rb' ) )
        for chrom in tbf.keys():
            # Align short mates
            tmp_short_file_name = get_tmp_file_name( suffix='short_vs_%s' % chrom )
            tmp_align_file_names.append( tmp_short_file_name )
            # lastz can address a single sequence inside a 2bit file as file/chrom.
            command = 'lastz %s/%s[unmask]%s %s ' % ( input1, chrom, ref_name, tmp_mates_short_file_name )
            command += 'Z=1 --seed=1111111011111 --notrans --maxwordcount=90% --match=1,3 O=1 E=3 X=15 K=10 Y=12 L=18 --ambiguousn --noytrim --identity=95 --coverage=80 --continuity=95 --format=softsam- '
            command += '> %s' % tmp_short_file_name
            run_command( command )
            # Align long mates
            tmp_long_file_name = get_tmp_file_name( suffix='long_vs_%s' % chrom )
            tmp_align_file_names.append( tmp_long_file_name )
            command = 'lastz %s/%s[unmask]%s %s ' % ( input1, chrom, ref_name, tmp_mates_long_file_name )
            command += 'Z=15 W=13 --notrans --exact=18 --maxwordcount=90% --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --noytrim --identity=95 --coverage=90 --continuity=95 --format=softsam- '
            command += '> %s' % tmp_long_file_name
            run_command( command )
    return tmp_align_file_names
347
348def paired_mate_unmapper( input2, input4, tmp_mates_mapping_file_name, tmp_align_file_name_list, output ):
349    """
350    Given a SAM file corresponding to alignments of *subsegments* of paired 'reads' to a reference sequence,
351    convert the positions on the subsegments to positions on the reads.  Also (optionally) add quality values.
352   
353    The input file is in SAM format, as shown below.  Each line represents the alignment of a part of a read
354    to a reference sequence.  Read pairs are indicated by suffixes in their names.  Normally, the suffixes _L
355    and _R indicate the left and right mates of reads (this can be overridden with the --left and --right
356    options).  Reads that were not mates have no suffix.
357   
358        (SAM header lines omitted)
359        F2YP0BU02G7LK5_R 16 chr21 15557360 255 40M          * 0 0 ATTTTATTCTCTTTGAAGCAATTGTGAATGGGAGTTTACT           *
360        F2YP0BU02HXV58_L 16 chr21 15952091 255 40M6S        * 0 0 GCAAATTGTGCTGCTTTAAACATGCGTGTGCAAGTATCTTtttcat     *
361        F2YP0BU02HREML_R 0  chr21 16386077 255 33M5S        * 0 0 CCAAAGTTCTGGGATTACAGGCGTGAGCCATCGcgccc             *
362        F2YP0BU02IOF1F_L 0  chr21 17567321 255 7S28M        * 0 0 taaagagAAGAATTCTCAACCCAGAATTTCATATC                *
363        F2YP0BU02IKX84_R 16 chr21 18491628 255 22M1D18M9S   * 0 0 GTCTCTACCAAAAAATACAAAAATTAGCCGGGCGTGGTGGcatgtctgt  *
364        F2YP0BU02GW5VA_L 16 chr21 20255344 255 6S32M        * 0 0 caagaaCAAACACATTCAAAAGCTAGTAGAAGGCAAGA             *
365        F2YP0BU02JIMJ4_R 0  chr21 22383051 255 19M          * 0 0 CCCTTTATCATTTTTTATT                                *
366        F2YP0BU02IXZGF_L 16 chr21 23094798 255 13M1I18M     * 0 0 GCAAGCTCCACTTCCCGGGTTCACGCCATTCT                   *
367        F2YP0BU02IODR5_L 0  chr21 30935325 255 37M          * 0 0 GAAATAAAGGGTATTCAATTAGGAAAAGAGGAAGTCA              *
368        F2YP0BU02IMZBL_L 16 chr21 31603486 255 28M1D1M      * 0 0 ATACAAAAATTAGCCGGGCACAGTGGCAG                      *
369        F2YP0BU02JA9PR_L 16 chr21 31677159 255 23M          * 0 0 CACACCTGTAACCCCAGCACTTT                            *
370        F2YP0BU02HKC61_R 0  chr21 31678718 255 40M          * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT           *
371        F2YP0BU02HKC61_R 0  chr21 31678718 255 40M          * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT           *
372        F2YP0BU02HVA88   16 chr21 31703558 255 1M1D35M8S    * 0 0 TGGGATTACAGGCGTGAGCTACCACACCCAGCCAGAgttcaaat       *
373        F2YP0BU02JDCF1_L 0  chr21 31816600 255 38M          * 0 0 AGGAGAATCGCTTGAACCCAGGAGGCAGAGGTTGCGGT             *
374        F2YP0BU02GZ1GO_R 0  chr21 33360122 255 6S38M        * 0 0 cctagaCTTCACACACACACACACACACACACACACACACACAC       *
375        F2YP0BU02FX387_L 16 chr22 14786201 255 26M          * 0 0 TGGATGAAGCTGGAAACCATCATTCT                         *
376        F2YP0BU02IF2NE_R 0  chr22 16960842 255 40M10S       * 0 0 TGGCATGCACCTGTAGTCTCAGCTACTTGGGAGGCTGAGGtgggaggatc *
377        F2YP0BU02F4TVA   0  chr22 19200522 255 49M          * 0 0 CCTGGGAGGCGGAGGTTGCAGTGAGCCGAGATCACGCCATTGCACTCCA  *
378        F2YP0BU02HKC61_R 16 chr22 29516998 255 8S32M        * 0 0 agacagagTCTTGCTTTGTCACCCAGGCTGGAGTGCAGTG           *
379        F2YP0BU02FS4EM_R 0  chr22 30159364 255 29M          * 0 0 CTCCTGCCTCAGCCTCCCGAGTAGTTGGG                      *
380        F2YP0BU02G197P_L 0  chr22 32044496 255 40M10S       * 0 0 TTGTTGGACATTTGGGTTGGTTCCAAGTCTTTGCTATTGTgaataatgcc *
381        F2YP0BU02FIING   16 chr22 45959944 255 3M1I11M1I26M * 0 0 AGCTATGGTACTGGCTATGAAAGCAGACACATAGACCAATGG         *
382        F2YP0BU02GUB9L_L 16 chr22 49198404 255 16M1I20M     * 0 0 CACCACGCTCGGCTAATTTTTGTATTTTTAGTAGAGA              *
383   
384    The user must provide a mapping file (which might better be called an unmapping file).  This file is usually
385    created by split_paired_reads, and tells us how to map the subsegments back to original coordinates in a single
386    read (this means the left and right mates were part of a single read).  The mapping file contains four columns.
387    The first two give the mates's name (including the suffix) and the read name.  The last two columns describe how
388    much of the full original sequence is missing from the mate.  For example, in the read below, the left mate is
389    missing 63 on the right (42 for the linker and 21 for the right half).  The right mate is missing 339 on the left.
390   
391        left half:  TTTCAACATATGCAAATCAATAAATGTAATCCAGCATATAAACAGAACCA
392                    AAGACAAAAACCACATGATTATCTCAATAGATGCAGAAAAGGCCTTCGGC
393                    AAAATTCAACAAAACTCCATGCTAAAACTCTCAATAAGGTATTGATGGGA
394                    CATGCCGCATAATAATAAGACATATCTATGACAAACCCACAGCCAATATC
395                    ATGCTGAATGCACAAAAATTGGAAGCATTCCCTTTGAAAACTGGCACAAG
396                    ACTGGGATGCCCTCTCTCACAACTCCTATTCAACATAGTGTTGGAAG
397        linker:     CGTAATAACTTCGTATAGCATACATTATACGAAGTCATACGA
398        right half: CTCCTGCCTCAGCCTCCCGAG
399   
400        mate_name        read_name      offset_to_start offset_from_end
401        F2YP0BU02FS4EM_L F2YP0BU02FS4EM         0              71
402        F2YP0BU02FS4EM_R F2YP0BU02FS4EM       339               0
403   
404    The user can also specify a quality scores file, which should look something like this.  Quality values are presumed
405    to be PHRED scores, written in space-delimited decimal.
406   
407        >F2YP0BU02FS4EM
408        38 38 38 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 38 21 21 21 40
409        40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 33
410        32 32 40 40 40 21 21 18 18 21 34 34 31 40 40 40 40 40 40 40 40 40 40 40 40
411        40 40 40 40 40 40 40 40 40 40 40 32 32 32 32 40 40 40 40 40 40 40 34 34 35
412        31 31 28 28 33 33 33 36 36 36 17 17 17 19 26 36 36 36 40 40 40 40 40 33 34
413        34 34 39 39 39 40 40 40 40 40 33 33 34 34 40 40 40 40 40 40 40 39 39 39 40
414        40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
415        40 40 40 40 40 40 40 39 39 39 39 39 39 40 40 40 39 39 39 40 40 40 40 40 40
416        40 40 40 40 40 40 40 40 40 40 40 40 40 26 26 26 26 26 40 40 38 38 37 35 33
417        36 40 19 17 17 17 17 19 19 23 30 20 20 20 23 35 40 36 36 36 36 36 36 36 36
418        39 40 34 20 27 27 35 39 40 37 40 40 40 40 40 40 40 40 40 40 34 34 35 39 40
419        40 40 40 40 40 40 39 39 39 40 40 40 40 36 36 32 32 28 28 29 30 36 40 30 26
420        26 26 34 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39
421        40 39 35 34 34 40 40 40 40 30 30 30 35 40 40 40 40 40 39 39 36 40 40 40 40
422        39 39 39 39 30 30 28 35 35 39 40 40 40 40 40 35 35 35
423        >F2YP0BU02G197P
424        40 40 40 40 40 40 40 40 40 40 39 39 39 39 39 39 40 40 40 40 40 40 40 40 40
425        40 40 40 40 26 26 26 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
426        40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
427        40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
428        40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 34 34 34 40 40
429        40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40
430        40 40 40 40 40 40 40 34 34 34 34 40 40 40 40 34 34 34 34 40 40 40 40 40 40
431        40 40 40 40 40 39 39 39 34 34 34 34 40 40 40 40 39 39 25 25 26 39 40 40 40
432        40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
433        33 33 33 33 40 35 21 21 21 30 38 40 40 40 40 40 40 40 40 35 35 30 30 30 40
434        40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
435        40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40
436        40 40 40 39 39 39 40 40
437        >F2YP0BU02FIING
438        32 32 32 25 25 25 25 24 25 30 31 30 27 27 27 28 28 21 19 19 13 13 13 14 19
439        19 17 19 16 16 25 28 22 21 17 17 18 25 24 25 25 25
440   
441    The output file is also SAM:
442   
443        (SAM header lines omitted)
444        F2YP0BU02G7LK5 81  chr21 15557360 255 40M303H        * 0 0 ATTTTATTCTCTTTGAAGCAATTGTGAATGGGAGTTTACT           D>>>>IIIIIIHHG???IIIIIIIIIHHHFFEIH999HII
445        F2YP0BU02HXV58 145 chr21 15952091 255 226H40M6S      * 0 0 GCAAATTGTGCTGCTTTAAACATGCGTGTGCAAGTATCTTtttcat     AA===DDDDAAAAD???:::ABBBBBAAA:888ECF;F>>>?8??@
446        F2YP0BU02HREML 65  chr21 16386077 255 320H33M5S      * 0 0 CCAAAGTTCTGGGATTACAGGCGTGAGCCATCGcgccc             HH???HHIIIHFHIIIIIIICDDHHIIIIIIHHHHHHH
447        F2YP0BU02IOF1F 129 chr21 17567321 255 7S28M409H      * 0 0 taaagagAAGAATTCTCAACCCAGAATTTCATATC                4100<<A>4113:<EFGGGFFFHHHHHHDFFFFED
448        F2YP0BU02IKX84 81  chr21 18491628 255 22M1D18M9S341H * 0 0 GTCTCTACCAAAAAATACAAAAATTAGCCGGGCGTGGTGGcatgtctgt  ;;;=7@.55------?2?11112GGB=CCCCDIIIIIIIIIHHHHHHII
449        F2YP0BU02GW5VA 145 chr21 20255344 255 286H6S32M      * 0 0 caagaaCAAACACATTCAAAAGCTAGTAGAAGGCAAGA             IIIIIIIHHHIIIIIIICCCCIIIIIIIIIIIIIIIII
450        F2YP0BU02JIMJ4 65  chr21 22383051 255 208H19M        * 0 0 CCCTTTATCATTTTTTATT                                555544E?GE113344I22
451        F2YP0BU02IXZGF 145 chr21 23094798 255 291H13M1I18M   * 0 0 GCAAGCTCCACTTCCCGGGTTCACGCCATTCT                   IIIIIIIIIIIGG;;;GGHIIIIIGGGIIIII
452        F2YP0BU02IODR5 129 chr21 30935325 255 37M154H        * 0 0 GAAATAAAGGGTATTCAATTAGGAAAAGAGGAAGTCA              6...7/--..,30;9<<>@BFFFAAAAHIIIIIH@@@
453        F2YP0BU02IMZBL 145 chr21 31603486 255 342H28M1D1M    * 0 0 ATACAAAAATTAGCCGGGCACAGTGGCAG                      BB1552222<<>9==8;;?AA=??A???A
454        F2YP0BU02JA9PR 145 chr21 31677159 255 229H23M        * 0 0 CACACCTGTAACCCCAGCACTTT                            IIIIIIIIIIICCCCIIIIIHHH
455        F2YP0BU02HKC61 65  chr21 31678718 255 300H40M        * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT           AA@BD:::==AAA@A?8888:<90004<>>?><<<<4442
456        F2YP0BU02HKC61 65  chr21 31678718 255 300H40M        * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT           AA@BD:::==AAA@A?8888:<90004<>>?><<<<4442
457        F2YP0BU02HVA88 16  chr21 31703558 255 1M1D35M8S      * 0 0 TGGGATTACAGGCGTGAGCTACCACACCCAGCCAGAgttcaaat       >8888DFFHHGFHHHH@@?@?DDC96666HIIIFFFFFFFFFFF
458        F2YP0BU02JDCF1 129 chr21 31816600 255 38M103H        * 0 0 AGGAGAATCGCTTGAACCCAGGAGGCAGAGGTTGCGGT             IIIIIIIIIIIHHHIIHHHIIIIIIIIIIIIIIIIIII
459        F2YP0BU02GZ1GO 65  chr21 33360122 255 76H6S38M       * 0 0 cctagaCTTCACACACACACACACACACACACACACACACACAC       BBBBD?:688CFFFFFFFFFFFFFFFFFFFFFFFFFFDDBBB51
460        F2YP0BU02FX387 145 chr22 14786201 255 201H26M        * 0 0 TGGATGAAGCTGGAAACCATCATTCT                         IIHHHHHHHHHHHHHFFFFFFFFFFF
461        F2YP0BU02IF2NE 65  chr22 16960842 255 209H40M10S     * 0 0 TGGCATGCACCTGTAGTCTCAGCTACTTGGGAGGCTGAGGtgggaggatc BAAADDDDFDDDDDDBBA889<A?4444000@<>AA?9444;;8>77<7-
462        F2YP0BU02F4TVA 0   chr22 19200522 255 49M            * 0 0 CCTGGGAGGCGGAGGTTGCAGTGAGCCGAGATCACGCCATTGCACTCCA  FFF???FFFFFIIIIIIIIIIIIIIIIIIIIIIIHHIIFHFFFGDDB=5
463        F2YP0BU02HKC61 81  chr22 29516998 255 8S32M300H      * 0 0 agacagagTCTTGCTTTGTCACCCAGGCTGGAGTGCAGTG           2444<<<<>?>><40009<:8888?A@AAA==:::DB@AA
464        F2YP0BU02FS4EM 65  chr22 30159364 255 339H29M        * 0 0 CTCCTGCCTCAGCCTCCCGAGTAGTTGGG                      IIIIHHEIIIIHHHH??=DDHIIIIIDDD
465        F2YP0BU02G197P 129 chr22 32044496 255 40M10S258H     * 0 0 TTGTTGGACATTTGGGTTGGTTCCAAGTCTTTGCTATTGTgaataatgcc IIIIIIIIIIHHHHHHIIIIIIIIIIIII;;;IIIIIIIIIIIIIIIIII
466        F2YP0BU02FIING 16  chr22 45959944 255 3M1I11M1I26M   * 0 0 AGCTATGGTACTGGCTATGAAAGCAGACACATAGACCAATGG         :::9:32267=:114244/...446==<<<?@?:9::::AAA
467        F2YP0BU02GUB9L 145 chr22 49198404 255 176H16M1I20M   * 0 0 CACCACGCTCGGCTAATTTTTGTATTTTTAGTAGAGA              IIIIIIIIIHAAC;<</////@4F5778;IIIIIIII
468   
469    """
470    left_suffix       = "_L"
471    right_suffix      = "_R"
472    # Read the mapping
473    mate_to_read_dict = {}
474    i = 0
475    for i, line in enumerate( file( tmp_mates_mapping_file_name, 'rb' ) ):
476        line = line.strip()
477        if not line.startswith( "#" ):
478            fields = line.split()
479            if len( fields ) != 4:
480                skip_line( "num_fields", i+1, line )
481                continue
482            mate_name, read_name, s_offset, e_offset = fields
483            if mate_name in mate_to_read_dict:
484                skip_line( 'two_mate_names', i+1, mate_name )
485                continue
486            mate_to_read_dict[ mate_name ] = ( read_name, int( s_offset ), int( e_offset ) )
487    # Read sequence data
488    read_to_nucs_dict = {}
489    seqs = 0
490    fasta_reader = FastaReader( file( input2, 'rb' ) )
491    while True:
492        seq = fasta_reader.next()
493        if not seq:
494            break
495        seqs += 1
496        seq_text_upper = seq.text.upper()
497        if seq.name in read_to_nucs_dict:
498            if seq_text_upper != read_to_nucs_dict[ seq.name ]:
499                skip_line( 'inconsistent_reads', seqs, seq.name )
500                continue
501        read_to_nucs_dict[ seq.name ] = seq_text_upper
502    # Read quality data
503    def quality_sequences( f ):
504        seq_name  = None
505        seq_quals = None
506        line_number = 0
507        for line in f:
508            line_number += 1
509            line = line.strip()
510            if line.startswith( ">" ):
511                if seq_name != None:
512                    yield ( seq_name, seq_quals, seq_line )
513                seq_name  = sequence_name( line )
514                seq_line  = line_number
515                seq_quals = []
516            elif seq_name is None:
517                skip_line( 'no_header', line_number, line )
518                continue
519            else:
520                seq_quals += [ int( q ) for q in line.split() ]
521        if seq_name is not None:
522            yield ( seq_name, seq_quals, seq_line )
523    def sequence_name( s ):
524        s = s[ 1: ].strip()
525        if not s:
526            return ""
527        else:
528            return s.split()[ 0 ]
529    read_to_quals_dict = {}
530    # TODO: should we use Dan's fastaNamedReader here?
531    for seq_name, quals, line_number in quality_sequences( file( input4 ) ):
532        quals = samify_phred_scores( quals )
533        if seq_name in read_to_quals_dict:
534            if quals != read_to_quals_dict[ seq_name ]:
535                skip_line( 'inconsistent_reads', line_number, seq_name )
536            continue
537        if len( quals ) != len( read_to_nucs_dict[ seq_name ] ):
538            skip_line( 'inconsistent_read_lengths', line_number, seq_name )
539            continue
540        read_to_quals_dict[ seq_name ] = quals
541    # process the SAM file
542    tmp_align_file_names = ' '.join( tmp_align_file_name_list )
543    combined_chrom_file_name = get_tmp_file_name( suffix='combined_chrom' )
544    command = 'cat %s | grep -v "^@" | sort -k 1 > %s' % ( tmp_align_file_names, combined_chrom_file_name )
545    run_command( command )
546    fout = file( output, 'w+b' )
547    has_non_header = False
548    i = 0
549    for i, line in enumerate( file( combined_chrom_file_name, 'rb' ) ):
550        line = line.strip()
551        if line.startswith( "@" ):
552            if has_non_header:
553                skip_line( 'sam_headers', i+1, line )
554                continue
555            fout.write( "%s\n" % line )
556            continue
557        has_non_header = True
558        fields = line.split()
559        num_fields = len( fields )
560        if num_fields < SAM_MIN_COLUMNS:
561            skip_line( 'sam_min_columns', i+1, line )
562            continue
563        # Set flags for mates
564        try:
565            flag = int( fields[ SAM_FLAG_COLUMN ] )
566        except ValueError:
567            skip_line( 'sam_flag', i+1, line )
568            continue
569        if not( flag & ( BAM_FPAIRED + BAM_FREAD1 + BAM_FREAD2 ) == 0 ):
570            skip_line( 'reads_paired', i+1, line )
571            continue
572        mate_name = fields[ SAM_QNAME_COLUMN ]
573        unmap_it = False
574        half = None
575        if mate_name.endswith( left_suffix ):
576            flag += BAM_FPAIRED + BAM_FREAD2
577            fields[ SAM_FLAG_COLUMN ] = "%d" % flag
578            unmap_it = True
579            half = "L"
580        elif mate_name.endswith( right_suffix ):
581            flag += BAM_FPAIRED + BAM_FREAD1
582            fields[ SAM_FLAG_COLUMN ] = "%d" % flag
583            unmap_it = True
584            half = "R"
585        on_plus_strand = ( flag & BAM_FREVERSE == 0 )
586        # Convert position from mate to read by adding clipping to cigar
587        if not unmap_it:
588            read_name = mate_name
589        else:
590            try:
591                read_name, s_offset, e_offset = mate_to_read_dict[ mate_name ]
592            except KeyError:
593                skip_line( 'missing_mate', i+1, mate_name )
594                continue
595            cigar = fields[ SAM_CIGAR_COLUMN ]
596            cigar_prefix = None
597            cigar_suffix = None
598            if half == "L":
599                if on_plus_strand:
600                    if s_offset > 0:
601                        cigar_prefix = ( s_offset, "S" )
602                    if e_offset > 0:
603                        cigar_suffix = ( e_offset, "H" )
604                else:
605                    if e_offset > 0:
606                        cigar_prefix = ( e_offset, "H" )
607                    if s_offset > 0:
608                        cigar_suffix = ( s_offset, "S" )
609            elif half == "R":
610                if on_plus_strand:
611                    if s_offset > 0:
612                        cigar_prefix = ( s_offset, "H" )
613                    if e_offset > 0:
614                        cigar_suffix = ( e_offset, "S" )
615                else:
616                    if e_offset > 0:
617                        cigar_prefix = ( e_offset, "S" )
618                    if s_offset > 0:
619                        cigar_suffix = ( s_offset, "H" )
620            else:               
621                if on_plus_strand:
622                    if s_offset > 0:
623                        cigar_prefix = ( s_offset, "S" )
624                    if e_offset > 0:
625                        cigar_suffix = ( e_offset, "S" )
626                else:
627                    if e_offset > 0:
628                        cigar_prefix = ( e_offset, "S" )
629                    if s_offset > 0:
630                        cigar_suffix = ( s_offset, "S" )
631            if cigar_prefix != None:
632                count, op = cigar_prefix
633                cigar = prefix_cigar( "%d%s" % ( count, op ), cigar )
634                if op == "S":
635                    refPos = int( fields[ SAM_POS_COLUMN ] ) - count
636                    fields[ SAM_POS_COLUMN ] = "%d" % refPos
637            if cigar_suffix != None:
638                count, op = cigar_suffix
639                cigar = suffix_cigar( cigar,"%d%s" % ( count, op) )
640            fields[ SAM_QNAME_COLUMN ] = read_name
641            fields[ SAM_CIGAR_COLUMN ] = cigar
642        # Fetch sequence and quality values, and flip/clip them
643        if read_name not in read_to_nucs_dict:
644            skip_line( 'missing_seq', i+1, read_name )
645            continue
646        nucs = read_to_nucs_dict[ read_name ]
647        if not on_plus_strand:
648            nucs = reverse_complement( nucs )
649        quals = None
650        if read_to_quals_dict != None:
651            if read_name not in read_to_quals_dict:
652                skip_line( 'missing_quals', i+1, read_name )
653                continue
654            quals = read_to_quals_dict[ read_name ]
655            if not on_plus_strand:
656                quals = reverse_string( quals )
657        cigar = split_cigar( fields[ SAM_CIGAR_COLUMN ] )
658        nucs, quals = clip_for_cigar( cigar, nucs, quals )
659        fields[ SAM_SEQ_COLUMN ] = nucs
660        if quals != None:
661            fields[ SAM_QUAL_COLUMN ] = quals
662        # Output the line
663        fout.write( "%s\n" % "\t".join( fields ) )
664    fout.close()
665
def prefix_cigar( prefix, cigar ):
    """
    Prepend a single CIGAR element (e.g. "5S") to a CIGAR string.  When the
    string already begins with the same operation, the two counts are merged
    into one element instead of producing a repeated operation.
    """
    # Locate the end of the leading run-length digits in cigar.
    pos = 0
    while cigar[ pos ].isdigit():
        pos += 1
    op = prefix[ -1 ]
    if cigar[ pos ] != op:
        # Different leading operation: plain concatenation.
        return prefix + cigar
    # Same operation: fold the two counts together.
    merged = int( prefix[ :-1 ] ) + int( cigar[ :pos ] )
    return "%d%s%s" % ( merged, op, cigar[ pos+1: ] )
674
def suffix_cigar( cigar, suffix ):
    """
    Append a single CIGAR element (e.g. "3S") to a CIGAR string, merging the
    counts when the string already ends with the same operation.
    """
    op = suffix[ -1 ]
    if cigar[ -1 ] != op:
        # Different trailing operation: plain concatenation.
        return cigar + suffix
    # Scan backwards over the digits of the final element.
    pos = len( cigar ) - 2
    while cigar[ pos ].isdigit():
        pos -= 1
    pos += 1
    total = int( cigar[ pos:-1 ] ) + int( suffix[ :-1 ] )
    return "%s%d%s" % ( cigar[ :pos ], total, op )
684
def split_cigar( text ):
    """
    Parse a CIGAR string into a list of ( count, op ) tuples, e.g.
    "5S10M" -> [ (5, "S"), (10, "M") ].  Raises ValueError for a malformed
    string: an operation with no preceding count, or trailing characters
    with no operation.
    """
    elements = []
    digits = ""
    for ch in text:
        if ch in "MIDHS":
            if not digits:
                # An operation must be preceded by its run length.
                raise ValueError
            elements.append( ( int( digits ), ch ) )
            digits = ""
        else:
            # Accumulate count characters; int() below rejects non-digits.
            digits += ch
    if digits:
        # Leftover count with no closing operation.
        raise ValueError
    return elements
699
def clip_for_cigar( cigar, nucs, quals ):
    """
    Apply the clipping operations at either end of a parsed CIGAR (a list of
    ( count, op ) tuples from split_cigar) to a nucleotide string and an
    optional quality string.  Hard clips ("H") remove the bases (and their
    qualities); soft clips ("S") keep the bases but mark them lower case.
    Returns the ( nucs, quals ) pair.
    """
    size, code = cigar[ 0 ]
    # Hard-clipped prefix: drop the clipped bases entirely, then examine the
    # next element in case a soft clip follows the hard clip.
    if code == "H":
        nucs = nucs[ size: ]
        if quals is not None:
            quals = quals[ size: ]
        size, code = cigar[ 1 ]
    # Soft-clipped prefix: bases remain, lower-cased to mark the clip.
    if code == "S":
        nucs = nucs[ :size ].lower() + nucs[ size: ]
    size, code = cigar[ -1 ]
    # Hard-clipped suffix, possibly followed (inward) by a soft clip.
    if code == "H":
        nucs = nucs[ :-size ]
        if quals is not None:
            quals = quals[ :-size ]
        size, code = cigar[ -2 ]
    # Soft-clipped suffix.
    if code == "S":
        nucs = nucs[ :-size ] + nucs[ -size: ].lower()
    return nucs, quals
722
def samify_phred_scores( quals ):
    """
    Convert a list of decimal phred base-quality scores to a SAM quality string.
    SAM represents each score as a single character, chr(33) through chr(126);
    any score outside that dynamic range is clipped to the nearest
    representable value.  An empty score list yields an empty string (the
    previous implementation raised ValueError from min()/max() on empty input).
    """
    if not quals:
        # Guard: min()/max() below raise ValueError on an empty sequence.
        return ""
    if min( quals ) >= 0 and max( quals ) <= 126-33:
        # Fast path: every score is already representable as-is.
        return "".join( [ chr( 33 + q ) for q in quals ] )
    else:
        # Clip out-of-range scores into SAM's printable range.
        return "".join( [ chr( max( 33, min( 126, 33+q ) ) ) for q in quals ] )
734
def reverse_complement( nucs ):
    """
    Return the reverse complement of a nucleotide string, preserving the
    case of each base (A<->T and C<->G, in both upper and lower case).
    """
    # Per-character translation commutes with reversal, so translate first.
    return nucs.translate( maketrans( "ACGTacgt", "TGCAtgca" ) )[ ::-1 ]
738
def reverse_string( s ):
    """Return s with its characters in reverse order."""
    return "".join( reversed( s ) )
741
742def __main__():
743    # Parse command line
744    # input1: a reference genome ( 2bit or fasta )
745    # input2: a collection of 454 paired end reads ( a fasta file )
746    # input3: a linker sequence ( a very small fasta file )
747    # input4: a base quality score 454 file ( qual454 )
748    parser = optparse.OptionParser()
749    parser.add_option( '', '--ref_name', dest='ref_name', help='The reference name to change all output matches to' )
750    parser.add_option( '', '--ref_source', dest='ref_source', help='The reference is cached or from the history' )
751    parser.add_option( '', '--ref_sequences', dest='ref_sequences', help='Number of sequences in the reference dataset' )
752    parser.add_option( '', '--source_select', dest='source_select', help='Use pre-set or cached reference file' )
753    parser.add_option( '', '--input1', dest='input1', help='The name of the reference file if using history or reference base name if using cached' )
754    parser.add_option( '', '--input2', dest='input2', help='The 454 reads file to align' )
755    parser.add_option( '', '--input3', dest='input3', help='The sequencing linker file' )
756    parser.add_option( '', '--input4', dest='input4', help='The base quality score 454 file' )
757    parser.add_option( '', '--output', dest='output', help='The output file' )
758    parser.add_option( '', '--lastz_seqs_file_dir', dest='lastz_seqs_file_dir', help='Directory of local lastz_seqs.loc file' )
759
760    ( options, args ) = parser.parse_args()
761    if options.ref_name != 'None':
762        ref_name = '[nickname=%s]' % options.ref_name
763    else:
764        ref_name = ''
765    if options.ref_source == 'history':
766        # Reference is a fasta dataset from the history
767        try:
768            # Ensure there is at least 1 sequence in the dataset ( this may not be necessary ).
769            error_msg = "The reference dataset is missing metadata, click the pencil icon in the history item and 'auto-detect' the metadata attributes."
770            ref_sequences = int( options.ref_sequences )
771            if ref_sequences < 1:
772                stop_err( error_msg )
773        except:
774            stop_err( error_msg )
775    else:
776        ref_sequences = 0
777    tmp_w12_name = get_tmp_file_name( suffix='vs_linker.W12' )
778    tmp_T1_name = get_tmp_file_name( suffix='vs_linker.T1' )
779    # Run lastz twice ( with different options ) on the linker sequence and paired end reads,
780    # looking for the linker ( each run finds some the other doesn't )
781    command = 'lastz %s %s W=12 --notrans --exact=18 --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --coverage=85 --format=general-:name2,zstart2+,length2,size2 > %s' % \
782        ( options.input3, options.input2, tmp_w12_name )
783    run_command( command )
784    command = 'lastz %s %s T=1 --match=1,2 O=1 E=2 X=15 K=10 Y=15 L=18 --ambiguousn --coverage=85 --format=general-:name2,zstart2+,length2,size2 > %s' % \
785        ( options.input3, options.input2, tmp_T1_name )
786    run_command( command )
787    # Combine the alignment output from the two lastz runs
788    tmp_combined_linker_file_name = get_tmp_file_name( suffix='vs_linker' )
789    command = 'cat %s %s | sort -u > %s' % ( tmp_w12_name, tmp_T1_name, tmp_combined_linker_file_name )
790    run_command( command )
791    # Use the alignment info to split reads into left and right mates
792    tmp_mates_mapping_file_name, tmp_mates_file_name, tmp_mates_short_file_name, tmp_mates_long_file_name = split_paired_reads( options.input2, tmp_combined_linker_file_name )
793    # Align mates to the reference - tmp_align_file_names is a list of file names created by align_mates()
794    tmp_align_file_name_list = align_mates( options.input1, options.ref_source, ref_name, ref_sequences, tmp_mates_short_file_name, tmp_mates_long_file_name )
795    # Combine and convert mate coordinates back to read coordinates
796    paired_mate_unmapper( options.input2, options.input4, tmp_mates_mapping_file_name, tmp_align_file_name_list, options.output )
797    # Delete all temporary files
798    for file_name in tmp_file_names:
799        os.remove( file_name )
800    # Handle any invalid lines in the input data
801    if total_skipped_lines:
802        msgs = dict( bad_interval="Bad interval in line",
803                     inconsistent_read_lengths="Inconsistent read/quality lengths for seq #",
804                     inconsistent_reads="Inconsistent reads for seq #",
805                     inconsistent_sizes="Inconsistent sizes for seq #",
806                     missing_mate="Mapping file does not include mate on line",
807                     missing_quals="Missing quality values for name on line",
808                     missing_seq="Missing sequence for name on line",
809                     multiple_seqs="Multiple names for seq #",
810                     no_header="First quality sequence has no header",
811                     num_fields="Must have 4 fields in line",
812                     reads_paired="SAM flag indicates reads already paired on line",
813                     sam_flag="Bad SAM flag on line",
814                     sam_headers="SAM headers on line",
815                     sam_min_columns="Need 11 columns on line",
816                     two_mate_names="Mate name already seen, line",
817                     wrong_seq_len="Size differs from length of seq #" )
818        print "Skipped %d invalid lines: "
819        msg = ""
820        for k, v in skipped_lines.items():
821            if v[0]:
822                # v[0] is the number of times the error occurred
823                # v[1] is the position of the line or sequence in the file
824                # v[2] is the name of the sequence or the text of the line
825                msg += "(%d)%s %d:%s. " % ( v[0], msgs[k], v[1], v[2] )
826        print msg
827
# Standard script entry point.
if __name__=="__main__": __main__()
Note: See TracBrowser for help on using the repository browser.