#!/usr/bin/env python

"""
Runs the Lastz paired read alignment process.
Written for Lastz v. 1.02.00.

# Author(s): based on various scripts written by Bob Harris (rsharris@bx.psu.edu),
# then tweaked to this form by Greg Von Kuster (greg@bx.psu.edu)

This tool takes the following input:
a. A collection of 454 paired end reads ( a fasta file )
b. A linker sequence ( a very small fasta file )
c. A reference genome ( nib, 2bit or fasta )

and uses the following process:
1. Split reads into mates:  the input to this step is the read file XXX.fasta, and the output is three
   files: XXX.short.fasta, XXX.long.fasta and XXX.mapping.  The mapping file records the information necessary
   to convert mate coordinates back into the original read, which is needed later in the process.

2. Align short mates to the reference: this runs lastz against every chromosome.  The input is XXX.short.fasta
   and the reference genome, and the output is a SAM file, XXX.short.sam.

3. Align long mates to the reference: this runs lastz against every chromosome.  The input is XXX.long.fasta
   and the reference genome, and the output is a SAM file, XXX.long.sam.

4. Combine, and convert mate coordinates back to read coordinates.  The input is XXX.mapping, XXX.short.sam and
   XXX.long.sam, and the output is XXX.sam.

usage: lastz_paired_reads_wrapper.py [options]
    --ref_name: The reference name to change all output matches to
    --ref_source: The reference is cached or from the history
    --source_select: Use pre-set or cached reference file
    --input1: The name of the reference file if using history or reference base name if using cached
    --input2: The reads file to align
    --input3: The sequencing linker file
    --input4: The base quality score 454 file
    --ref_sequences: The number of sequences in the reference file if using one from history
    --output: The name of the output file
    --lastz_seqs_file_dir: Directory of local lastz_seqs.loc file
"""
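
# A typical command line for this wrapper, as Galaxy would build it, looks roughly like
# the example below.  The paths and values are illustrative only; they are not taken
# from any particular installation:
#
#   python lastz_paired_reads_wrapper.py --ref_name=hg18 --ref_source=cached \
#       --input1=/galaxy/data/hg18/seq/hg18.2bit --input2=reads_454.fasta \
#       --input3=linker.fasta --input4=reads_454.qual \
#       --output=output.sam --lastz_seqs_file_dir=/galaxy/tool-data
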
import optparse, os, subprocess, shutil, sys, tempfile, time
from string import maketrans

from galaxy import eggs
import pkg_resources
pkg_resources.require( 'bx-python' )
from bx.seq.twobit import *
from bx.seq.fasta import FastaReader
from galaxy.util.bunch import Bunch
from galaxy.util import string_as_bool

# Column indexes for SAM required fields
SAM_QNAME_COLUMN = 0
SAM_FLAG_COLUMN  = 1
SAM_RNAME_COLUMN = 2
SAM_POS_COLUMN   = 3
SAM_MAPQ_COLUMN  = 4
SAM_CIGAR_COLUMN = 5
SAM_MRNM_COLUMN  = 6
SAM_MPOS_COLUMN  = 7
SAM_ISIZE_COLUMN = 8
SAM_SEQ_COLUMN   = 9
SAM_QUAL_COLUMN  = 10
SAM_MIN_COLUMNS  = 11
# SAM bit-encoded flags
BAM_FPAIRED      =    1    # the read is paired in sequencing, no matter whether it is mapped in a pair
BAM_FPROPER_PAIR =    2    # the read is mapped in a proper pair
BAM_FUNMAP       =    4    # the read itself is unmapped; conflicts with BAM_FPROPER_PAIR
BAM_FMUNMAP      =    8    # the mate is unmapped
BAM_FREVERSE     =   16    # the read is mapped to the reverse strand
BAM_FMREVERSE    =   32    # the mate is mapped to the reverse strand
BAM_FREAD1       =   64    # this is read1
BAM_FREAD2       =  128    # this is read2
BAM_FSECONDARY   =  256    # not primary alignment
BAM_FQCFAIL      =  512    # QC failure
BAM_FDUP         = 1024    # optical or PCR duplicate
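
# Flag arithmetic example (see paired_mate_unmapper below): a left mate is reported
# as read2 of a pair, so its flag gains BAM_FPAIRED + BAM_FREAD2; a left mate that
# lastz aligned to the reverse strand therefore carries 16 + 1 + 128 = 145, which is
# the flag shown for F2YP0BU02HXV58 in that function's sample output.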

# Keep track of all created temporary files so they can be deleted
global tmp_file_names
tmp_file_names = []
# The values in the skipped_lines dict are lists consisting of:
# - the number of skipped lines for that error
# If not a sequence error:
# - the 1st line number on which the error was found
# - the text of the 1st line on which the error was found
# If a sequence error:
# - the number of the sequence in the file
# - the sequence name on which the error occurred
# We may need to improve dealing with file position and text since
# much of it comes from temporary files that are created from the
# inputs, and not the inputs themselves, so this could be confusing
# to the user.
global skipped_lines
skipped_lines = dict( bad_interval=[ 0, 0, '' ],
                      inconsistent_read_lengths=[ 0, 0, '' ],
                      inconsistent_reads=[ 0, 0, '' ],
                      inconsistent_sizes=[ 0, 0, '' ],
                      missing_mate=[ 0, 0, '' ],
                      missing_quals=[ 0, 0, '' ],
                      missing_seq=[ 0, 0, '' ],
                      multiple_seqs=[ 0, 0, '' ],
                      no_header=[ 0, 0, '' ],
                      num_fields=[ 0, 0, '' ],
                      reads_paired=[ 0, 0, '' ],
                      sam_flag=[ 0, 0, '' ],
                      sam_headers=[ 0, 0, '' ],
                      sam_min_columns=[ 0, 0, '' ],
                      two_mate_names=[ 0, 0, '' ],
                      wrong_seq_len=[ 0, 0, '' ] )
global total_skipped_lines
total_skipped_lines = 0

def stop_err( msg ):
    sys.stderr.write( "%s" % msg )
    sys.exit()

def skip_line( error_key, position, text ):
    global total_skipped_lines
    if not skipped_lines[ error_key ][2]:
        skipped_lines[ error_key ][1] = position
        skipped_lines[ error_key ][2] = text
    skipped_lines[ error_key ][0] += 1
    total_skipped_lines += 1

def get_tmp_file_name( dir=None, suffix=None ):
    """
    Return a unique temporary file name that can be managed.  The
    file must be manually removed after use.
    """
    if dir and suffix:
        tmp_fd, tmp_name = tempfile.mkstemp( dir=dir, suffix=suffix )
    elif dir:
        tmp_fd, tmp_name = tempfile.mkstemp( dir=dir )
    elif suffix:
        tmp_fd, tmp_name = tempfile.mkstemp( suffix=suffix )
    else:
        tmp_fd, tmp_name = tempfile.mkstemp()
    os.close( tmp_fd )
    tmp_file_names.append( tmp_name )
    return tmp_name

def run_command( command ):
    proc = subprocess.Popen( args=command, shell=True, stderr=subprocess.PIPE )
    stderr = proc.stderr.read()
    proc.wait()
    if stderr:
        stop_err( stderr )

def split_paired_reads( input2, combined_linker_file_name ):
    """
    Given a fasta file of allegedly paired end reads ( input2 ), and a list of intervals
    showing where the linker is on each read ( combined_linker_file_name ), split the reads into left and right
    halves.

    The input intervals look like this.  Note that they may include multiple intervals for the same read
    ( which should overlap ), and we use the union of them as the linker interval.  Non-overlaps are
    reported to the user, and those reads are not processed.  Starts are origin zero.

        #name          strand start len size
        FG3OYDA05FTEES +        219  42  283
        FG3OYDA05FVOLL +        263  41  416
        FG3OYDA05FFL7J +         81  42  421
        FG3OYDA05FOQWE +         55  42  332
        FG3OYDA05FV4DW +        297  42  388
        FG3OYDA05FWAQV +        325  42  419
        FG3OYDA05FVLGA +         90  42  367
        FG3OYDA05FWJ71 +         58  42  276

    The output gives each half-sequence on a separate line, like this.  This allows easy sorting of the
    sequences by length, after the fact.

        219 FG3OYDA05FTEES_L TTTAGTTACACTTAACTCACTTCCATCCTCTAAATACGTGATTACCTTTC...
        22  FG3OYDA05FTEES_R CCTTCCTTAAGTCCTAAAACTG
    """
    # Bob says these should be hard-coded.
    seq_len_lower_threshold = 17
    short_mate_cutoff = 50
    # We need to pass the name of this file back to the caller.
    tmp_mates_file_name = get_tmp_file_name( suffix='mates.txt' )
    mates_file = file( tmp_mates_file_name, 'w+b' )
    # Read the linker intervals
    combined_linker_file = file( combined_linker_file_name, 'rb' )
    read_to_linker_dict = {}
    i = 0
    for i, line in enumerate( combined_linker_file ):
        line = line.strip()
        if line.startswith( "#" ):
            continue
        if line.find( '#' ) >= 0:
            line = line.split( "#", 1 )[0].rstrip()
        fields = line.split()
        if len( fields ) != 4:
            skip_line( 'num_fields', i+1, line )
            continue
        name, start, length, size = fields
        start = int( start )
        length = int( length )
        size = int( size )
        end = start + length
        if end > size:
            skip_line( 'bad_interval', i+1, line )
            continue
        if name not in read_to_linker_dict:
            read_to_linker_dict[ name ] = ( start, end, size )
            continue
        if read_to_linker_dict[ name ] == None:
            # Read previously marked as non-overlapping intervals, so skip this sequence - see below
            continue
        ( s, e, sz ) = read_to_linker_dict[ name ]
        if sz != size:
            skip_line( 'inconsistent_sizes', i+1, name )
            continue
        if s > end or e < start:
            # Non-overlapping intervals, so skip this sequence
            read_to_linker_dict[ name ] = None
            continue
        read_to_linker_dict[ name ] = ( min( s, start ), max( e, end ), size )
    combined_linker_file.close()
    # We need to pass the name of this file back to the caller.
    tmp_mates_mapping_file_name = get_tmp_file_name( suffix='mates.mapping' )
    mates_mapping_file = file( tmp_mates_mapping_file_name, 'w+b' )
    # Process the sequences
    seqs = 0
    fasta_reader = FastaReader( file( input2, 'rb' ) )
    while True:
        seq = fasta_reader.next()
        if not seq:
            break
        seqs += 1
        if seq.name not in read_to_linker_dict:
            if seq.length > seq_len_lower_threshold:
                mates_file.write( "%-3d %s   %s\n" % ( seq.length, seq.name, seq.text ) )
            read_to_linker_dict[ seq.name ] = ""
            continue
        if read_to_linker_dict[ seq.name ] == "":
            skip_line( 'multiple_seqs', seqs, seq.name )
            continue
        if read_to_linker_dict[ seq.name ] == None:
            # Read previously marked as non-overlapping intervals, so skip this sequence - see above
            continue
        ( start, end, size ) = read_to_linker_dict[ seq.name ]
        if seq.length != size:
            skip_line( 'wrong_seq_len', seqs, seq.name )
            continue
        left = seq.text[ :start ]
        right = seq.text[ end: ]
        left_is_small = len( left ) <= seq_len_lower_threshold
        right_is_small = len( right ) <= seq_len_lower_threshold
        if left_is_small and right_is_small:
            continue
        if not left_is_small:
            mates_file.write( "%-3d %s %s\n" % ( len( left ), seq.name + "_L", left ) )
            mates_mapping_file.write( "%s %s %s %s\n" % ( seq.name + "_L", seq.name, 0, size - start ) )
        if not right_is_small:
            mates_file.write( "%-3d %s %s\n" % ( len( right ), seq.name + "_R", right ) )
            mates_mapping_file.write( "%s %s %s %s\n" % ( seq.name + "_R", seq.name, end, 0 ) )
        read_to_linker_dict[ seq.name ] = ""
    mates_file.close()
    mates_mapping_file.close()
    # Create temporary files for short and long mates
    tmp_mates_short_file_name = get_tmp_file_name( suffix='mates.short' )
    tmp_mates_long_file_name = get_tmp_file_name( suffix='mates.long' )
    tmp_mates_short = open( tmp_mates_short_file_name, 'w+b' )
    tmp_mates_long = open( tmp_mates_long_file_name, 'w+b' )
    i = 0
    for i, line in enumerate( file( tmp_mates_file_name, 'rb' ) ):
        fields = line.split()
        seq_len = int( fields[0] )
        seq_name = fields[1]
        seq_text = fields[2]
        if seq_len <= short_mate_cutoff:
            tmp_mates_short.write( ">%s\n%s\n" % ( seq_name, seq_text ) )
        else:
            tmp_mates_long.write( ">%s\n%s\n" % ( seq_name, seq_text ) )
    tmp_mates_short.close()
    tmp_mates_long.close()
    return tmp_mates_mapping_file_name, tmp_mates_file_name, tmp_mates_short_file_name, tmp_mates_long_file_name
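# Worked example using the first read from the docstring above: FG3OYDA05FTEES has
# size 283 with the linker at 219..261, so the left half is 219 bases and the right
# half is 22 bases, and the mapping file receives the two lines
#     FG3OYDA05FTEES_L FG3OYDA05FTEES 0 64
#     FG3OYDA05FTEES_R FG3OYDA05FTEES 261 0
# (64 = 283 - 219 bases missing to the right of the left half; 261 bases missing to
# the left of the right half).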

def align_mates( input1, ref_source, ref_name, ref_sequences, tmp_mates_short_file_name, tmp_mates_long_file_name ):
    tmp_align_file_names = []
    if ref_source == 'history':
        # Reference is a fasta dataset from the history
        # Create temporary files to contain the output from lastz executions
        tmp_short_file_name = get_tmp_file_name( suffix='short_out' )
        tmp_align_file_names.append( tmp_short_file_name )
        tmp_long_file_name = get_tmp_file_name( suffix='long_out' )
        tmp_align_file_names.append( tmp_long_file_name )
        seqs = 0
        fasta_reader = FastaReader( open( input1 ) )
        while True:
            # Read the next sequence from the reference dataset.  Note that if the reference contains
            # a small number of chromosomes this loop is ok, but in many cases the genome has a bunch
            # of small straggler scaffolds and contigs and it is a computational waste to do each one
            # of these in its own run.  There is an I/O down side to running by subsets (even if they are
            # one sequence per subset), compared to splitting the reference into sizes of 250 mb.  With
            # the subset action, lastz still has to read and parse the entire file for every run (this
            # is true for fasta, but for .2bit files it can access each sequence directly within the file,
            # so the overhead is minimal).
            """
            :> output_file  (this creates the output file, empty)
            while there are more sequences to align
                find the next sequences that add up to 250M, put their names in farf.names
                lastz ${refFile}[subset=farf.names][multi][unmask] ${matesPath}/${matesFile} ...
                      >> output_file
            """
            seq = fasta_reader.next()
            if not seq:
                break
            seqs += 1
            # Create a temporary file to contain the current sequence as input to lastz.
            # We're doing this a bit differently here since we could be generating a huge
            # number of temporary files.
            tmp_in_fd, tmp_in_file_name = tempfile.mkstemp( suffix='seq_%d_in' % seqs )
            tmp_in_file = os.fdopen( tmp_in_fd, 'w+b' )
            tmp_in_file.write( '>%s\n%s\n' % ( seq.name, seq.text ) )
            tmp_in_file.close()
            # Align short mates
            command = 'lastz %s[unmask]%s %s ' % ( tmp_in_file_name, ref_name, tmp_mates_short_file_name )
            command += 'Z=1 --seed=1111111011111 --notrans --maxwordcount=90% --match=1,3 O=1 E=3 X=15 K=10 Y=12 L=18 --ambiguousn --noytrim --identity=95 --coverage=80 --continuity=95 --format=softsam- '
            command += '>> %s' % tmp_short_file_name
            run_command( command )
            # Align long mates
            command = 'lastz %s[unmask]%s %s ' % ( tmp_in_file_name, ref_name, tmp_mates_long_file_name )
            command += 'Z=15 W=13 --notrans --exact=18 --maxwordcount=90% --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --noytrim --identity=95 --coverage=90 --continuity=95 --format=softsam- '
            command += '>> %s' % tmp_long_file_name
            run_command( command )
            # Remove the temporary file that contains the current sequence
            os.remove( tmp_in_file_name )
    else:
        # Reference is a locally cached 2bit file, split lastz calls across number of chroms in 2bit file
        tbf = TwoBitFile( open( input1, 'rb' ) )
        for chrom in tbf.keys():
            # Align short mates
            tmp_short_file_name = get_tmp_file_name( suffix='short_vs_%s' % chrom )
            tmp_align_file_names.append( tmp_short_file_name )
            command = 'lastz %s/%s[unmask]%s %s ' % ( input1, chrom, ref_name, tmp_mates_short_file_name )
            command += 'Z=1 --seed=1111111011111 --notrans --maxwordcount=90% --match=1,3 O=1 E=3 X=15 K=10 Y=12 L=18 --ambiguousn --noytrim --identity=95 --coverage=80 --continuity=95 --format=softsam- '
            command += '> %s' % tmp_short_file_name
            run_command( command )
            # Align long mates
            tmp_long_file_name = get_tmp_file_name( suffix='long_vs_%s' % chrom )
            tmp_align_file_names.append( tmp_long_file_name )
            command = 'lastz %s/%s[unmask]%s %s ' % ( input1, chrom, ref_name, tmp_mates_long_file_name )
            command += 'Z=15 W=13 --notrans --exact=18 --maxwordcount=90% --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --noytrim --identity=95 --coverage=90 --continuity=95 --format=softsam- '
            command += '> %s' % tmp_long_file_name
            run_command( command )
    return tmp_align_file_names
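
# The comment inside align_mates() above describes batching reference sequences into
# roughly 250 Mb subsets instead of running lastz once per sequence.  A minimal sketch
# of that idea follows; it is illustrative only, is not called anywhere in this
# wrapper, and assumes you already have ( name, length ) pairs for the reference.
def batch_reference_names( names_and_lengths, batch_bp=250000000 ):
    # Yield lists of sequence names whose combined length stays near batch_bp;
    # each list could be written to a file and passed to lastz via [subset=...].
    batch = []
    batch_len = 0
    for name, length in names_and_lengths:
        if batch and batch_len + length > batch_bp:
            yield batch
            batch = []
            batch_len = 0
        batch.append( name )
        batch_len += length
    if batch:
        yield batch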

def paired_mate_unmapper( input2, input4, tmp_mates_mapping_file_name, tmp_align_file_name_list, output ):
    """
    Given a SAM file corresponding to alignments of *subsegments* of paired 'reads' to a reference sequence,
    convert the positions on the subsegments to positions on the reads.  Also (optionally) add quality values.

    The input file is in SAM format, as shown below.  Each line represents the alignment of a part of a read
    to a reference sequence.  Read pairs are indicated by suffixes in their names.  Normally, the suffixes _L
    and _R indicate the left and right mates of reads (this can be overridden with the --left and --right
    options).  Reads that were not mates have no suffix.

        (SAM header lines omitted)
        F2YP0BU02G7LK5_R 16 chr21 15557360 255 40M          * 0 0 ATTTTATTCTCTTTGAAGCAATTGTGAATGGGAGTTTACT           *
        F2YP0BU02HXV58_L 16 chr21 15952091 255 40M6S        * 0 0 GCAAATTGTGCTGCTTTAAACATGCGTGTGCAAGTATCTTtttcat     *
        F2YP0BU02HREML_R 0  chr21 16386077 255 33M5S        * 0 0 CCAAAGTTCTGGGATTACAGGCGTGAGCCATCGcgccc             *
        F2YP0BU02IOF1F_L 0  chr21 17567321 255 7S28M        * 0 0 taaagagAAGAATTCTCAACCCAGAATTTCATATC                *
        F2YP0BU02IKX84_R 16 chr21 18491628 255 22M1D18M9S   * 0 0 GTCTCTACCAAAAAATACAAAAATTAGCCGGGCGTGGTGGcatgtctgt  *
        F2YP0BU02GW5VA_L 16 chr21 20255344 255 6S32M        * 0 0 caagaaCAAACACATTCAAAAGCTAGTAGAAGGCAAGA             *
        F2YP0BU02JIMJ4_R 0  chr21 22383051 255 19M          * 0 0 CCCTTTATCATTTTTTATT                                *
        F2YP0BU02IXZGF_L 16 chr21 23094798 255 13M1I18M     * 0 0 GCAAGCTCCACTTCCCGGGTTCACGCCATTCT                   *
        F2YP0BU02IODR5_L 0  chr21 30935325 255 37M          * 0 0 GAAATAAAGGGTATTCAATTAGGAAAAGAGGAAGTCA              *
        F2YP0BU02IMZBL_L 16 chr21 31603486 255 28M1D1M      * 0 0 ATACAAAAATTAGCCGGGCACAGTGGCAG                      *
        F2YP0BU02JA9PR_L 16 chr21 31677159 255 23M          * 0 0 CACACCTGTAACCCCAGCACTTT                            *
        F2YP0BU02HKC61_R 0  chr21 31678718 255 40M          * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT           *
        F2YP0BU02HKC61_R 0  chr21 31678718 255 40M          * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT           *
        F2YP0BU02HVA88   16 chr21 31703558 255 1M1D35M8S    * 0 0 TGGGATTACAGGCGTGAGCTACCACACCCAGCCAGAgttcaaat       *
        F2YP0BU02JDCF1_L 0  chr21 31816600 255 38M          * 0 0 AGGAGAATCGCTTGAACCCAGGAGGCAGAGGTTGCGGT             *
        F2YP0BU02GZ1GO_R 0  chr21 33360122 255 6S38M        * 0 0 cctagaCTTCACACACACACACACACACACACACACACACACAC       *
        F2YP0BU02FX387_L 16 chr22 14786201 255 26M          * 0 0 TGGATGAAGCTGGAAACCATCATTCT                         *
        F2YP0BU02IF2NE_R 0  chr22 16960842 255 40M10S       * 0 0 TGGCATGCACCTGTAGTCTCAGCTACTTGGGAGGCTGAGGtgggaggatc *
        F2YP0BU02F4TVA   0  chr22 19200522 255 49M          * 0 0 CCTGGGAGGCGGAGGTTGCAGTGAGCCGAGATCACGCCATTGCACTCCA  *
        F2YP0BU02HKC61_R 16 chr22 29516998 255 8S32M        * 0 0 agacagagTCTTGCTTTGTCACCCAGGCTGGAGTGCAGTG           *
        F2YP0BU02FS4EM_R 0  chr22 30159364 255 29M          * 0 0 CTCCTGCCTCAGCCTCCCGAGTAGTTGGG                      *
        F2YP0BU02G197P_L 0  chr22 32044496 255 40M10S       * 0 0 TTGTTGGACATTTGGGTTGGTTCCAAGTCTTTGCTATTGTgaataatgcc *
        F2YP0BU02FIING   16 chr22 45959944 255 3M1I11M1I26M * 0 0 AGCTATGGTACTGGCTATGAAAGCAGACACATAGACCAATGG         *
        F2YP0BU02GUB9L_L 16 chr22 49198404 255 16M1I20M     * 0 0 CACCACGCTCGGCTAATTTTTGTATTTTTAGTAGAGA              *

    The user must provide a mapping file (which might better be called an unmapping file).  This file is usually
    created by split_paired_reads, and tells us how to map the subsegments back to original coordinates in a single
    read (this means the left and right mates were part of a single read).  The mapping file contains four columns.
    The first two give the mate's name (including the suffix) and the read name.  The last two columns describe how
    much of the full original sequence is missing from the mate.  For example, in the read below, the left mate is
    missing 71 on the right (42 for the linker and 29 for the right half).  The right mate is missing 339 on the left.

        left half:  TTTCAACATATGCAAATCAATAAATGTAATCCAGCATATAAACAGAACCA
                    AAGACAAAAACCACATGATTATCTCAATAGATGCAGAAAAGGCCTTCGGC
                    AAAATTCAACAAAACTCCATGCTAAAACTCTCAATAAGGTATTGATGGGA
                    CATGCCGCATAATAATAAGACATATCTATGACAAACCCACAGCCAATATC
                    ATGCTGAATGCACAAAAATTGGAAGCATTCCCTTTGAAAACTGGCACAAG
                    ACTGGGATGCCCTCTCTCACAACTCCTATTCAACATAGTGTTGGAAG
        linker:     CGTAATAACTTCGTATAGCATACATTATACGAAGTCATACGA
        right half: CTCCTGCCTCAGCCTCCCGAGTAGTTGGG

        mate_name        read_name      offset_to_start offset_from_end
        F2YP0BU02FS4EM_L F2YP0BU02FS4EM         0              71
        F2YP0BU02FS4EM_R F2YP0BU02FS4EM       339               0

    The user can also specify a quality scores file, which should look something like this.  Quality values are presumed
    to be PHRED scores, written in space-delimited decimal.

        >F2YP0BU02FS4EM
        38 38 38 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 38 21 21 21 40
        40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 33
        32 32 40 40 40 21 21 18 18 21 34 34 31 40 40 40 40 40 40 40 40 40 40 40 40
        40 40 40 40 40 40 40 40 40 40 40 32 32 32 32 40 40 40 40 40 40 40 34 34 35
        31 31 28 28 33 33 33 36 36 36 17 17 17 19 26 36 36 36 40 40 40 40 40 33 34
        34 34 39 39 39 40 40 40 40 40 33 33 34 34 40 40 40 40 40 40 40 39 39 39 40
        40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
        40 40 40 40 40 40 40 39 39 39 39 39 39 40 40 40 39 39 39 40 40 40 40 40 40
        40 40 40 40 40 40 40 40 40 40 40 40 40 26 26 26 26 26 40 40 38 38 37 35 33
        36 40 19 17 17 17 17 19 19 23 30 20 20 20 23 35 40 36 36 36 36 36 36 36 36
        39 40 34 20 27 27 35 39 40 37 40 40 40 40 40 40 40 40 40 40 34 34 35 39 40
        40 40 40 40 40 40 39 39 39 40 40 40 40 36 36 32 32 28 28 29 30 36 40 30 26
        26 26 34 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39
        40 39 35 34 34 40 40 40 40 30 30 30 35 40 40 40 40 40 39 39 36 40 40 40 40
        39 39 39 39 30 30 28 35 35 39 40 40 40 40 40 35 35 35
        >F2YP0BU02G197P
        40 40 40 40 40 40 40 40 40 40 39 39 39 39 39 39 40 40 40 40 40 40 40 40 40
        40 40 40 40 26 26 26 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
        40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
        40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
        40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 34 34 34 40 40
        40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40
        40 40 40 40 40 40 40 34 34 34 34 40 40 40 40 34 34 34 34 40 40 40 40 40 40
        40 40 40 40 40 39 39 39 34 34 34 34 40 40 40 40 39 39 25 25 26 39 40 40 40
        40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
        33 33 33 33 40 35 21 21 21 30 38 40 40 40 40 40 40 40 40 35 35 30 30 30 40
        40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
        40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40
        40 40 40 39 39 39 40 40
        >F2YP0BU02FIING
        32 32 32 25 25 25 25 24 25 30 31 30 27 27 27 28 28 21 19 19 13 13 13 14 19
        19 17 19 16 16 25 28 22 21 17 17 18 25 24 25 25 25

    The output file is also SAM:

        (SAM header lines omitted)
        F2YP0BU02G7LK5 81  chr21 15557360 255 40M303H        * 0 0 ATTTTATTCTCTTTGAAGCAATTGTGAATGGGAGTTTACT           D>>>>IIIIIIHHG???IIIIIIIIIHHHFFEIH999HII
        F2YP0BU02HXV58 145 chr21 15952091 255 226H40M6S      * 0 0 GCAAATTGTGCTGCTTTAAACATGCGTGTGCAAGTATCTTtttcat     AA===DDDDAAAAD???:::ABBBBBAAA:888ECF;F>>>?8??@
        F2YP0BU02HREML 65  chr21 16386077 255 320H33M5S      * 0 0 CCAAAGTTCTGGGATTACAGGCGTGAGCCATCGcgccc             HH???HHIIIHFHIIIIIIICDDHHIIIIIIHHHHHHH
        F2YP0BU02IOF1F 129 chr21 17567321 255 7S28M409H      * 0 0 taaagagAAGAATTCTCAACCCAGAATTTCATATC                4100<<A>4113:<EFGGGFFFHHHHHHDFFFFED
        F2YP0BU02IKX84 81  chr21 18491628 255 22M1D18M9S341H * 0 0 GTCTCTACCAAAAAATACAAAAATTAGCCGGGCGTGGTGGcatgtctgt  ;;;=7@.55------?2?11112GGB=CCCCDIIIIIIIIIHHHHHHII
        F2YP0BU02GW5VA 145 chr21 20255344 255 286H6S32M      * 0 0 caagaaCAAACACATTCAAAAGCTAGTAGAAGGCAAGA             IIIIIIIHHHIIIIIIICCCCIIIIIIIIIIIIIIIII
        F2YP0BU02JIMJ4 65  chr21 22383051 255 208H19M        * 0 0 CCCTTTATCATTTTTTATT                                555544E?GE113344I22
        F2YP0BU02IXZGF 145 chr21 23094798 255 291H13M1I18M   * 0 0 GCAAGCTCCACTTCCCGGGTTCACGCCATTCT                   IIIIIIIIIIIGG;;;GGHIIIIIGGGIIIII
        F2YP0BU02IODR5 129 chr21 30935325 255 37M154H        * 0 0 GAAATAAAGGGTATTCAATTAGGAAAAGAGGAAGTCA              6...7/--..,30;9<<>@BFFFAAAAHIIIIIH@@@
        F2YP0BU02IMZBL 145 chr21 31603486 255 342H28M1D1M    * 0 0 ATACAAAAATTAGCCGGGCACAGTGGCAG                      BB1552222<<>9==8;;?AA=??A???A
        F2YP0BU02JA9PR 145 chr21 31677159 255 229H23M        * 0 0 CACACCTGTAACCCCAGCACTTT                            IIIIIIIIIIICCCCIIIIIHHH
        F2YP0BU02HKC61 65  chr21 31678718 255 300H40M        * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT           AA@BD:::==AAA@A?8888:<90004<>>?><<<<4442
        F2YP0BU02HKC61 65  chr21 31678718 255 300H40M        * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT           AA@BD:::==AAA@A?8888:<90004<>>?><<<<4442
        F2YP0BU02HVA88 16  chr21 31703558 255 1M1D35M8S      * 0 0 TGGGATTACAGGCGTGAGCTACCACACCCAGCCAGAgttcaaat       >8888DFFHHGFHHHH@@?@?DDC96666HIIIFFFFFFFFFFF
        F2YP0BU02JDCF1 129 chr21 31816600 255 38M103H        * 0 0 AGGAGAATCGCTTGAACCCAGGAGGCAGAGGTTGCGGT             IIIIIIIIIIIHHHIIHHHIIIIIIIIIIIIIIIIIII
        F2YP0BU02GZ1GO 65  chr21 33360122 255 76H6S38M       * 0 0 cctagaCTTCACACACACACACACACACACACACACACACACAC       BBBBD?:688CFFFFFFFFFFFFFFFFFFFFFFFFFFDDBBB51
        F2YP0BU02FX387 145 chr22 14786201 255 201H26M        * 0 0 TGGATGAAGCTGGAAACCATCATTCT                         IIHHHHHHHHHHHHHFFFFFFFFFFF
        F2YP0BU02IF2NE 65  chr22 16960842 255 209H40M10S     * 0 0 TGGCATGCACCTGTAGTCTCAGCTACTTGGGAGGCTGAGGtgggaggatc BAAADDDDFDDDDDDBBA889<A?4444000@<>AA?9444;;8>77<7-
        F2YP0BU02F4TVA 0   chr22 19200522 255 49M            * 0 0 CCTGGGAGGCGGAGGTTGCAGTGAGCCGAGATCACGCCATTGCACTCCA  FFF???FFFFFIIIIIIIIIIIIIIIIIIIIIIIHHIIFHFFFGDDB=5
        F2YP0BU02HKC61 81  chr22 29516998 255 8S32M300H      * 0 0 agacagagTCTTGCTTTGTCACCCAGGCTGGAGTGCAGTG           2444<<<<>?>><40009<:8888?A@AAA==:::DB@AA
        F2YP0BU02FS4EM 65  chr22 30159364 255 339H29M        * 0 0 CTCCTGCCTCAGCCTCCCGAGTAGTTGGG                      IIIIHHEIIIIHHHH??=DDHIIIIIDDD
        F2YP0BU02G197P 129 chr22 32044496 255 40M10S258H     * 0 0 TTGTTGGACATTTGGGTTGGTTCCAAGTCTTTGCTATTGTgaataatgcc IIIIIIIIIIHHHHHHIIIIIIIIIIIII;;;IIIIIIIIIIIIIIIIII
        F2YP0BU02FIING 16  chr22 45959944 255 3M1I11M1I26M   * 0 0 AGCTATGGTACTGGCTATGAAAGCAGACACATAGACCAATGG         :::9:32267=:114244/...446==<<<?@?:9::::AAA
        F2YP0BU02GUB9L 145 chr22 49198404 255 176H16M1I20M   * 0 0 CACCACGCTCGGCTAATTTTTGTATTTTTAGTAGAGA              IIIIIIIIIHAAC;<</////@4F5778;IIIIIIII
    """
    left_suffix       = "_L"
    right_suffix      = "_R"
    # Read the mapping
    mate_to_read_dict = {}
    i = 0
    for i, line in enumerate( file( tmp_mates_mapping_file_name, 'rb' ) ):
        line = line.strip()
        if not line.startswith( "#" ):
            fields = line.split()
            if len( fields ) != 4:
                skip_line( "num_fields", i+1, line )
                continue
            mate_name, read_name, s_offset, e_offset = fields
            if mate_name in mate_to_read_dict:
                skip_line( 'two_mate_names', i+1, mate_name )
                continue
            mate_to_read_dict[ mate_name ] = ( read_name, int( s_offset ), int( e_offset ) )
    # Read sequence data
    read_to_nucs_dict = {}
    seqs = 0
    fasta_reader = FastaReader( file( input2, 'rb' ) )
    while True:
        seq = fasta_reader.next()
        if not seq:
            break
        seqs += 1
        seq_text_upper = seq.text.upper()
        if seq.name in read_to_nucs_dict:
            if seq_text_upper != read_to_nucs_dict[ seq.name ]:
                skip_line( 'inconsistent_reads', seqs, seq.name )
            continue
        read_to_nucs_dict[ seq.name ] = seq_text_upper
    # Read quality data
    def quality_sequences( f ):
        seq_name  = None
        seq_quals = None
        line_number = 0
        for line in f:
            line_number += 1
            line = line.strip()
            if line.startswith( ">" ):
                if seq_name != None:
                    yield ( seq_name, seq_quals, seq_line )
                seq_name  = sequence_name( line )
                seq_line  = line_number
                seq_quals = []
            elif seq_name is None:
                skip_line( 'no_header', line_number, line )
                continue
            else:
                seq_quals += [ int( q ) for q in line.split() ]
        if seq_name is not None:
            yield ( seq_name, seq_quals, seq_line )
    def sequence_name( s ):
        s = s[ 1: ].strip()
        if not s:
            return ""
        else:
            return s.split()[ 0 ]
    read_to_quals_dict = {}
    # TODO: should we use Dan's fastaNamedReader here?
    for seq_name, quals, line_number in quality_sequences( file( input4 ) ):
        quals = samify_phred_scores( quals )
        if seq_name in read_to_quals_dict:
            if quals != read_to_quals_dict[ seq_name ]:
                skip_line( 'inconsistent_reads', line_number, seq_name )
            continue
        if len( quals ) != len( read_to_nucs_dict[ seq_name ] ):
            skip_line( 'inconsistent_read_lengths', line_number, seq_name )
            continue
        read_to_quals_dict[ seq_name ] = quals
    # Process the SAM file
    tmp_align_file_names = ' '.join( tmp_align_file_name_list )
    combined_chrom_file_name = get_tmp_file_name( suffix='combined_chrom' )
    command = 'cat %s | grep -v "^@" | sort -k 1 > %s' % ( tmp_align_file_names, combined_chrom_file_name )
    run_command( command )
    fout = file( output, 'w+b' )
    has_non_header = False
    i = 0
    for i, line in enumerate( file( combined_chrom_file_name, 'rb' ) ):
        line = line.strip()
        if line.startswith( "@" ):
            if has_non_header:
                skip_line( 'sam_headers', i+1, line )
                continue
            fout.write( "%s\n" % line )
            continue
        has_non_header = True
        fields = line.split()
        num_fields = len( fields )
        if num_fields < SAM_MIN_COLUMNS:
            skip_line( 'sam_min_columns', i+1, line )
            continue
        # Set flags for mates
        try:
            flag = int( fields[ SAM_FLAG_COLUMN ] )
        except ValueError:
            skip_line( 'sam_flag', i+1, line )
            continue
        if not( flag & ( BAM_FPAIRED + BAM_FREAD1 + BAM_FREAD2 ) == 0 ):
            skip_line( 'reads_paired', i+1, line )
            continue
        mate_name = fields[ SAM_QNAME_COLUMN ]
        unmap_it = False
        half = None
        if mate_name.endswith( left_suffix ):
            flag += BAM_FPAIRED + BAM_FREAD2
            fields[ SAM_FLAG_COLUMN ] = "%d" % flag
            unmap_it = True
            half = "L"
        elif mate_name.endswith( right_suffix ):
            flag += BAM_FPAIRED + BAM_FREAD1
            fields[ SAM_FLAG_COLUMN ] = "%d" % flag
            unmap_it = True
            half = "R"
        on_plus_strand = ( flag & BAM_FREVERSE == 0 )
        # Convert position from mate to read by adding clipping to cigar
        if not unmap_it:
            read_name = mate_name
        else:
            try:
                read_name, s_offset, e_offset = mate_to_read_dict[ mate_name ]
            except KeyError:
                skip_line( 'missing_mate', i+1, mate_name )
                continue
            cigar = fields[ SAM_CIGAR_COLUMN ]
            cigar_prefix = None
            cigar_suffix = None
            if half == "L":
                if on_plus_strand:
                    if s_offset > 0:
                        cigar_prefix = ( s_offset, "S" )
                    if e_offset > 0:
                        cigar_suffix = ( e_offset, "H" )
                else:
                    if e_offset > 0:
                        cigar_prefix = ( e_offset, "H" )
                    if s_offset > 0:
                        cigar_suffix = ( s_offset, "S" )
            elif half == "R":
                if on_plus_strand:
                    if s_offset > 0:
                        cigar_prefix = ( s_offset, "H" )
                    if e_offset > 0:
                        cigar_suffix = ( e_offset, "S" )
                else:
                    if e_offset > 0:
                        cigar_prefix = ( e_offset, "S" )
                    if s_offset > 0:
                        cigar_suffix = ( s_offset, "H" )
            else:
                if on_plus_strand:
                    if s_offset > 0:
                        cigar_prefix = ( s_offset, "S" )
                    if e_offset > 0:
                        cigar_suffix = ( e_offset, "S" )
                else:
                    if e_offset > 0:
                        cigar_prefix = ( e_offset, "S" )
                    if s_offset > 0:
                        cigar_suffix = ( s_offset, "S" )
            if cigar_prefix != None:
                count, op = cigar_prefix
                cigar = prefix_cigar( "%d%s" % ( count, op ), cigar )
                if op == "S":
                    refPos = int( fields[ SAM_POS_COLUMN ] ) - count
                    fields[ SAM_POS_COLUMN ] = "%d" % refPos
            if cigar_suffix != None:
                count, op = cigar_suffix
                cigar = suffix_cigar( cigar, "%d%s" % ( count, op ) )
            fields[ SAM_QNAME_COLUMN ] = read_name
            fields[ SAM_CIGAR_COLUMN ] = cigar
        # Fetch sequence and quality values, and flip/clip them
        if read_name not in read_to_nucs_dict:
            skip_line( 'missing_seq', i+1, read_name )
            continue
        nucs = read_to_nucs_dict[ read_name ]
        if not on_plus_strand:
            nucs = reverse_complement( nucs )
        quals = None
        if read_to_quals_dict != None:
            if read_name not in read_to_quals_dict:
                skip_line( 'missing_quals', i+1, read_name )
                continue
            quals = read_to_quals_dict[ read_name ]
            if not on_plus_strand:
                quals = reverse_string( quals )
        cigar = split_cigar( fields[ SAM_CIGAR_COLUMN ] )
        nucs, quals = clip_for_cigar( cigar, nucs, quals )
        fields[ SAM_SEQ_COLUMN ] = nucs
        if quals != None:
            fields[ SAM_QUAL_COLUMN ] = quals
        # Output the line
        fout.write( "%s\n" % "\t".join( fields ) )
    fout.close()
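# Worked example from the docstring above: the right mate F2YP0BU02FS4EM_R maps on the
# plus strand with s_offset=339, so a 339H prefix is added to its 29M cigar giving
# 339H29M, the qname loses its _R suffix, and the flag 0 gains BAM_FPAIRED + BAM_FREAD1
# to become 65; hard clips do not shift POS, so 30159364 is unchanged.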

def prefix_cigar( prefix, cigar ):
    ix = 0
    while cigar[ ix ].isdigit():
        ix += 1
    if cigar[ ix ] != prefix[ -1 ]:
        return prefix + cigar
    count = int( prefix[ :-1 ] ) + int( cigar[ :ix ] )
    return "%d%s%s" % ( count, prefix[ -1 ], cigar[ ix+1: ] )
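# For example, prefix_cigar( "5S", "3S40M" ) returns "8S40M" (matching leading
# operators are merged), while prefix_cigar( "5H", "3S40M" ) returns "5H3S40M".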

def suffix_cigar( cigar, suffix ):
    if cigar[ -1 ] != suffix[ -1 ]:
        return cigar + suffix
    ix = len( cigar ) - 2
    while cigar[ ix ].isdigit():
        ix -= 1
    ix += 1
    count = int( cigar[ ix:-1 ] ) + int( suffix[ :-1 ] )
    return "%s%d%s" % ( cigar[ :ix ], count, suffix[ -1 ] )
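# For example, suffix_cigar( "40M3S", "5S" ) returns "40M8S", while
# suffix_cigar( "40M", "6H" ) returns "40M6H".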

def split_cigar( text ):
    fields = []
    field  = []
    for ch in text:
        if ch not in "MIDHS":
            field += ch
            continue
        if field == []:
            raise ValueError
        fields += [ ( int( "".join( field ) ), ch ) ]
        field = []
    if field != []:
        raise ValueError
    return fields
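# For example, split_cigar( "22M1D18M9S" ) returns [ (22,'M'), (1,'D'), (18,'M'), (9,'S') ];
# a cigar that does not strictly alternate counts and operators raises ValueError.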

def clip_for_cigar( cigar, nucs, quals ):
    # Hard clip prefix
    count, op = cigar[ 0 ]
    if op == "H":
        nucs = nucs[ count: ]
        if quals != None:
            quals = quals[ count: ]
        count, op = cigar[ 1 ]
    # Soft clip prefix
    if op == "S":
        nucs = nucs[ :count ].lower() + nucs[ count: ]
    # Hard clip suffix
    count, op = cigar[ -1 ]
    if op == "H":
        nucs = nucs[ :-count ]
        if quals != None:
            quals = quals[ :-count ]
        count, op = cigar[ -2 ]
    # Soft clip suffix
    if op == "S":
        nucs = nucs[ :-count ] + nucs[ -count: ].lower()
    return nucs, quals
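# In clip_for_cigar(), hard-clipped ("H") bases are removed from the sequence and
# quality strings entirely, while soft-clipped ("S") bases are kept but lower-cased,
# matching the softsam- lastz output convention visible in the examples above.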

def samify_phred_scores( quals ):
    """
    Convert a decimal list of phred base-quality scores to a sam quality string.
    Note that if a quality is outside the dynamic range of sam's ability to
    represent it, we clip the value to the max allowed.  SAM quality scores
    range from chr(33) to chr(126).
    """
    if min( quals ) >= 0 and max( quals ) <= 126-33:
        return "".join( [ chr( 33 + q ) for q in quals ] )
    else:
        return "".join( [ chr( max( 33, min( 126, 33+q ) ) ) for q in quals ] )
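# For example, samify_phred_scores( [ 38, 40, 17 ] ) returns "GI2" (chr(33 + q) for
# each score); scores outside 0..93 are clamped into the printable SAM range.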

def reverse_complement( nucs ):
    complementMap = maketrans( "ACGTacgt", "TGCAtgca" )
    return nucs[ ::-1 ].translate( complementMap )
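# For example, reverse_complement( "AAACGT" ) returns "ACGTTT"; characters other
# than ACGT/acgt (such as N) are reversed but left untranslated.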

def reverse_string( s ):
    return s[ ::-1 ]

def __main__():
    # Parse command line
    # input1: a reference genome ( 2bit or fasta )
    # input2: a collection of 454 paired end reads ( a fasta file )
    # input3: a linker sequence ( a very small fasta file )
    # input4: a base quality score 454 file ( qual454 )
    parser = optparse.OptionParser()
    parser.add_option( '', '--ref_name', dest='ref_name', help='The reference name to change all output matches to' )
    parser.add_option( '', '--ref_source', dest='ref_source', help='The reference is cached or from the history' )
    parser.add_option( '', '--ref_sequences', dest='ref_sequences', help='Number of sequences in the reference dataset' )
    parser.add_option( '', '--source_select', dest='source_select', help='Use pre-set or cached reference file' )
    parser.add_option( '', '--input1', dest='input1', help='The name of the reference file if using history or reference base name if using cached' )
    parser.add_option( '', '--input2', dest='input2', help='The 454 reads file to align' )
    parser.add_option( '', '--input3', dest='input3', help='The sequencing linker file' )
    parser.add_option( '', '--input4', dest='input4', help='The base quality score 454 file' )
    parser.add_option( '', '--output', dest='output', help='The output file' )
    parser.add_option( '', '--lastz_seqs_file_dir', dest='lastz_seqs_file_dir', help='Directory of local lastz_seqs.loc file' )

    ( options, args ) = parser.parse_args()
    if options.ref_name != 'None':
        ref_name = '[nickname=%s]' % options.ref_name
    else:
        ref_name = ''
    if options.ref_source == 'history':
        # Reference is a fasta dataset from the history
        try:
            # Ensure there is at least 1 sequence in the dataset ( this may not be necessary ).
            error_msg = "The reference dataset is missing metadata.  Click the pencil icon in the history item and 'auto-detect' the metadata attributes."
            ref_sequences = int( options.ref_sequences )
            if ref_sequences < 1:
                stop_err( error_msg )
        except:
            stop_err( error_msg )
    else:
        ref_sequences = 0
    tmp_w12_name = get_tmp_file_name( suffix='vs_linker.W12' )
    tmp_T1_name = get_tmp_file_name( suffix='vs_linker.T1' )
    # Run lastz twice ( with different options ) on the linker sequence and paired end reads,
    # looking for the linker ( each run finds some matches that the other doesn't ).
    command = 'lastz %s %s W=12 --notrans --exact=18 --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --coverage=85 --format=general-:name2,zstart2+,length2,size2 > %s' % \
        ( options.input3, options.input2, tmp_w12_name )
    run_command( command )
    command = 'lastz %s %s T=1 --match=1,2 O=1 E=2 X=15 K=10 Y=15 L=18 --ambiguousn --coverage=85 --format=general-:name2,zstart2+,length2,size2 > %s' % \
        ( options.input3, options.input2, tmp_T1_name )
    run_command( command )
    # Combine the alignment output from the two lastz runs
    tmp_combined_linker_file_name = get_tmp_file_name( suffix='vs_linker' )
    command = 'cat %s %s | sort -u > %s' % ( tmp_w12_name, tmp_T1_name, tmp_combined_linker_file_name )
    run_command( command )
    # Use the alignment info to split reads into left and right mates
    tmp_mates_mapping_file_name, tmp_mates_file_name, tmp_mates_short_file_name, tmp_mates_long_file_name = split_paired_reads( options.input2, tmp_combined_linker_file_name )
    # Align mates to the reference - tmp_align_file_name_list is a list of file names created by align_mates()
    tmp_align_file_name_list = align_mates( options.input1, options.ref_source, ref_name, ref_sequences, tmp_mates_short_file_name, tmp_mates_long_file_name )
    # Combine and convert mate coordinates back to read coordinates
    paired_mate_unmapper( options.input2, options.input4, tmp_mates_mapping_file_name, tmp_align_file_name_list, options.output )
    # Delete all temporary files
    for file_name in tmp_file_names:
        os.remove( file_name )
    # Report any invalid lines found in the input data
    if total_skipped_lines:
        msgs = dict( bad_interval="Bad interval in line",
                     inconsistent_read_lengths="Inconsistent read/quality lengths for seq #",
                     inconsistent_reads="Inconsistent reads for seq #",
                     inconsistent_sizes="Inconsistent sizes for seq #",
                     missing_mate="Mapping file does not include mate on line",
                     missing_quals="Missing quality values for name on line",
                     missing_seq="Missing sequence for name on line",
                     multiple_seqs="Multiple names for seq #",
                     no_header="First quality sequence has no header",
                     num_fields="Must have 4 fields in line",
                     reads_paired="SAM flag indicates reads already paired on line",
                     sam_flag="Bad SAM flag on line",
                     sam_headers="SAM headers on line",
                     sam_min_columns="Need 11 columns on line",
                     two_mate_names="Mate name already seen, line",
                     wrong_seq_len="Size differs from length of seq #" )
        print "Skipped %d invalid lines: " % total_skipped_lines
        msg = ""
        for k, v in skipped_lines.items():
            if v[0]:
                # v[0] is the number of times the error occurred
                # v[1] is the position of the line or sequence in the file
                # v[2] is the name of the sequence or the text of the line
                msg += "(%d)%s %d:%s. " % ( v[0], msgs[k], v[1], v[2] )
        print msg

if __name__=="__main__": __main__()