1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | """ |
---|
4 | Runs Lastz paired read alignment process |
---|
5 | Written for Lastz v. 1.02.00. |
---|
6 | |
---|
7 | # Author(s): based on various scripts written by Bob Harris (rsharris@bx.psu.edu), |
---|
8 | # then tweaked to this form by Greg Von Kuster (greg@bx.psu.edu) |
---|
9 | |
---|
10 | This tool takes the following input: |
---|
11 | a. A collection of 454 paired end reads ( a fasta file ) |
---|
12 | b. A linker sequence ( a very small fasta file ) |
---|
13 | c. A reference genome ( nib, 2bit or fasta ) |
---|
14 | |
---|
15 | and uses the following process: |
---|
16 | 1. Split reads into mates: the input to this step is the read file XXX.fasta, and the output is three |
---|
17 | files; XXX.short.fasta, XXX.long.fasta and XXX.mapping. The mapping file records the information necessary |
---|
18 | to convert mate coordinates back into the original read, which is needed later in the process. |
---|
19 | |
---|
20 | 2. Align short mates to the reference: this runs lastz against every chromosome. The input is XXX.short.fasta |
---|
21 | and the reference genome, and the output is a SAM file, XXX.short.sam. |
---|
22 | |
---|
23 | 3. Align long mates to the reference: this runs lastz against every chromosome. The input is XXX.long.fasta |
---|
24 | and the reference genome, and the output is a SAM file, XXX.long.sam. |
---|
25 | |
---|
26 | 4. Combine, and convert mate coordinates back to read coordinates. The input is XXX.mapping, XXX.short.sam and |
---|
27 | XXX.long.sam, and the output is XXX.sam. |
---|
28 | |
---|
29 | usage: lastz_paired_reads_wrapper.py [options] |
---|
30 | --ref_name: The reference name to change all output matches to |
---|
31 | --ref_source: The reference is cached or from the history |
---|
32 | --source_select: Use pre-set or cached reference file |
---|
33 | --input1: The name of the reference file if using history or reference base name if using cached |
---|
34 | --input2: The reads file to align |
---|
35 | --input3: The sequencing linker file |
---|
36 | --input4: The base quality score 454 file |
---|
37 | --ref_sequences: The number of sequences in the reference file if using one from history |
---|
38 | --output: The name of the output file |
---|
39 | --lastz_seqs_file_dir: Directory of local lastz_seqs.loc file |
---|
40 | """ |
---|
41 | import optparse, os, subprocess, shutil, sys, tempfile, time |
---|
42 | from string import maketrans |
---|
43 | |
---|
44 | from galaxy import eggs |
---|
45 | import pkg_resources |
---|
46 | pkg_resources.require( 'bx-python' ) |
---|
47 | from bx.seq.twobit import * |
---|
48 | from bx.seq.fasta import FastaReader |
---|
49 | from galaxy.util.bunch import Bunch |
---|
50 | from galaxy.util import string_as_bool |
---|
51 | |
---|
# Column indexes for SAM required fields
# (see the SAM format specification; columns are tab-separated, 0-based here)
SAM_QNAME_COLUMN = 0
SAM_FLAG_COLUMN = 1
SAM_RNAME_COLUMN = 2
SAM_POS_COLUMN = 3
SAM_MAPQ_COLUMN = 4
SAM_CIGAR_COLUMN = 5
SAM_MRNM_COLUMN = 6
SAM_MPOS_COLUMN = 7
SAM_ISIZE_COLUMN = 8
SAM_SEQ_COLUMN = 9
SAM_QUAL_COLUMN = 10
# A valid SAM alignment line has at least this many fields.
SAM_MIN_COLUMNS = 11
# SAM bit-encoded flags (same names and values as samtools' BAM_F* constants)
BAM_FPAIRED = 1 # the read is paired in sequencing, no matter whether it is mapped in a pair
BAM_FPROPER_PAIR = 2 # the read is mapped in a proper pair
BAM_FUNMAP = 4 # the read itself is unmapped; conflictive with BAM_FPROPER_PAIR
BAM_FMUNMAP = 8 # the mate is unmapped
BAM_FREVERSE = 16 # the read is mapped to the reverse strand
BAM_FMREVERSE = 32 # the mate is mapped to the reverse strand
BAM_FREAD1 = 64 # this is read1
BAM_FREAD2 = 128 # this is read2
BAM_FSECONDARY = 256 # not primary alignment
BAM_FQCFAIL = 512 # QC failure
BAM_FDUP = 1024 # optical or PCR duplicate

# Keep track of all created temporary files so they can be deleted
# NOTE: 'global' at module level is a no-op; kept from the original source.
global tmp_file_names
tmp_file_names = []
# The values in the skipped_lines dict are tuples consisting of:
# - the number of skipped lines for that error
# If not a sequence error:
# - the 1st line number on which the error was found
# - the text of the 1st line on which the error was found
# If a sequence error:
# - The number of the sequence in the file
# - the sequence name on which the error occurred
# We may need to improve dealing with file position and text as
# much of it comes from temporary files that are created from the
# inputs, and not the inputs themselves, so this could be confusing
# to the user.
global skipped_lines
skipped_lines = dict( bad_interval=( 0, 0, '' ),
                      inconsistent_read_lengths=( 0, 0, '' ),
                      inconsistent_reads=( 0, 0, '' ),
                      inconsistent_sizes=( 0, 0, '' ),
                      missing_mate=( 0, 0, '' ),
                      missing_quals=( 0, 0, '' ),
                      missing_seq=( 0, 0, '' ),
                      multiple_seqs=( 0, 0, '' ),
                      no_header=( 0, 0, '' ),
                      num_fields=( 0, 0, '' ),
                      reads_paired=( 0, 0, '' ),
                      sam_flag=( 0, 0, '' ),
                      sam_headers=( 0, 0, '' ),
                      sam_min_columns=( 0, 0, '' ),
                      two_mate_names=( 0, 0, '' ),
                      wrong_seq_len=( 0, 0, '' ) )
# Running total of all skipped lines across every error category.
global total_skipped_lines
total_skipped_lines = 0
---|
112 | |
---|
def stop_err( msg ):
    """
    Write msg to stderr and terminate with a non-zero exit status.

    Bug fix: the original called sys.exit() with no argument, which exits
    with status 0 and makes the failure invisible to shell callers that
    check the return code.
    """
    sys.stderr.write( "%s" % msg )
    sys.exit( 1 )
---|
116 | |
---|
def skip_line( error_key, position, text ):
    """
    Record one skipped input line in the global skipped_lines dict under
    error_key, remembering the position and text of the first occurrence,
    and bump the global total_skipped_lines counter.

    Bug fixes relative to the original:
    - the values in skipped_lines are tuples, which are immutable; the old
      code attempted item assignment on them (TypeError).  The tuple is now
      rebuilt and stored wholesale.
    - total_skipped_lines was incremented without a 'global' declaration,
      which would raise UnboundLocalError on the first call.
    """
    global total_skipped_lines
    count, first_position, first_text = skipped_lines[ error_key ]
    if not first_text:
        # First time this kind of error is seen: remember where.
        first_position = position
        first_text = text
    skipped_lines[ error_key ] = ( count + 1, first_position, first_text )
    total_skipped_lines += 1
---|
123 | |
---|
def get_tmp_file_name( dir=None, suffix=None ):
    """
    Return a unique temporary file name that can be managed.  The
    file must be manually removed after use.

    The name is appended to the module-level tmp_file_names list so the
    caller can delete all temporaries in one sweep.

    Bug fix: the original had no fallback branch, so calling this with
    neither dir nor suffix raised NameError because tmp_fd was never bound.
    """
    if dir and suffix:
        tmp_fd, tmp_name = tempfile.mkstemp( dir=dir, suffix=suffix )
    elif dir:
        tmp_fd, tmp_name = tempfile.mkstemp( dir=dir )
    elif suffix:
        tmp_fd, tmp_name = tempfile.mkstemp( suffix=suffix )
    else:
        tmp_fd, tmp_name = tempfile.mkstemp()
    # mkstemp returns an open descriptor; close it so only the name is kept.
    os.close( tmp_fd )
    tmp_file_names.append( tmp_name )
    return tmp_name
---|
138 | |
---|
def run_command( command ):
    """
    Run command through the shell; abort via stop_err() if the child wrote
    anything to its standard error stream.

    Bug fix: the original called proc.wait() before reading stderr, which
    can deadlock when the child fills the stderr pipe buffer (and it also
    called wait() twice).  communicate() drains the pipe while waiting.
    """
    proc = subprocess.Popen( args=command, shell=True, stderr=subprocess.PIPE )
    stderr = proc.communicate()[1]
    if stderr:
        stop_err( stderr )
---|
146 | |
---|
def split_paired_reads( input2, combined_linker_file_name ):
    """
    Given a fasta file of allegedly paired end reads ( input2 ), and a list of intervals
    showing where the linker is on each read ( combined_linker_file_name ), split the reads into left and right
    halves.

    The input intervals look like this.  Note that they may include multiple intervals for the same read
    ( which should overlap ), and we use the union of them as the linker interval.  Non-overlaps are
    reported to the user, and those reads are not processed.  Starts are origin zero.

        #name           strand start len size
        FG3OYDA05FTEES  +      219   42  283
        FG3OYDA05FVOLL  +      263   41  416

    The output gives each half-sequence on a separate line, like this.  This allows easy sorting of the
    sequences by length, after the fact.

        219 FG3OYDA05FTEES_L TTTAGTTACACTTAACTCACTTCCATCCTCTAAATACGTGATTACCTTTC...
        22  FG3OYDA05FTEES_R CCTTCCTTAAGTCCTAAAACTG

    Returns a 4-tuple of temporary file names:
    ( mates mapping file, all-mates file, short-mates fasta, long-mates fasta ).

    Bug fixes relative to the original:
    - 'bad_interval' errors were recorded with ``skip_line[ 'bad_interval' ] += 1``,
      which indexes the skip_line *function* and raises TypeError; they are now
      reported through skip_line() like every other error category.
    - the linker file was closed a second time after the fasta pass (dead
      code); the duplicate close has been removed.
    """
    # Bob says these should be hard-coded.
    seq_len_lower_threshold = 17
    short_mate_cutoff = 50
    # We need to pass the name of this file back to the caller.
    tmp_mates_file_name = get_tmp_file_name( suffix='mates.txt' )
    mates_file = open( tmp_mates_file_name, "w+b" )
    # Read the linker intervals
    combined_linker_file = open( combined_linker_file_name, "rb" )
    read_to_linker_dict = {}
    for i, line in enumerate( combined_linker_file ):
        line = line.strip()
        if line.startswith( "#" ):
            continue
        # Strip trailing comments.
        if line.find( '#' ) >= 0:
            line = line.split( "#", 1 )[0].rstrip()
        fields = line.split()
        if len( fields ) != 4:
            skip_line( 'num_fields', i+1, line )
            continue
        name, start, length, size = fields
        start = int( start )
        length = int( length )
        size = int( size )
        end = start + length
        if end > size:
            # The linker interval runs off the end of the read.
            skip_line( 'bad_interval', i+1, line )
            continue
        if name not in read_to_linker_dict:
            read_to_linker_dict[ name ] = ( start, end, size )
            continue
        if read_to_linker_dict[ name ] is None:
            # Read previously marked as non-overlapping intervals, so skip this sequence - see below
            continue
        ( s, e, sz ) = read_to_linker_dict[ name ]
        if sz != size:
            skip_line( 'inconsistent_sizes', i+1, name )
            continue
        if s > end or e < start:
            # Non-overlapping intervals, so skip this sequence
            read_to_linker_dict[ name ] = None
            continue
        # Use the union of the overlapping intervals as the linker interval.
        read_to_linker_dict[ name ] = ( min( s, start ), max( e, end ), size )
    combined_linker_file.close()
    # We need to pass the name of this file back to the caller.
    tmp_mates_mapping_file_name = get_tmp_file_name( suffix='mates.mapping' )
    mates_mapping_file = open( tmp_mates_mapping_file_name, 'w+b' )
    # Process the sequences
    seqs = 0
    fasta_reader = FastaReader( open( input2, 'rb' ) )
    while True:
        seq = fasta_reader.next()
        if not seq:
            break
        seqs += 1
        if seq.name not in read_to_linker_dict:
            # No linker interval for this read: keep the whole read (if it
            # is long enough) as an unpaired mate.
            if seq.length > seq_len_lower_threshold:
                mates_file.write( "%-3d %s %s\n" % ( seq.length, seq.name, seq.text ) )
            read_to_linker_dict[ seq.name ] = ""
            continue
        if read_to_linker_dict[ seq.name ] == "":
            skip_line( 'multiple_seqs', seqs, seq.name )
            continue
        if read_to_linker_dict[ seq.name ] is None:
            # Read previously marked as non-overlapping intervals, so skip this sequence - see above
            continue
        ( start, end, size ) = read_to_linker_dict[ seq.name ]
        if seq.length != size:
            skip_line( 'wrong_seq_len', seqs, seq.name )
            continue
        left = seq.text[ :start ]
        right = seq.text[ end: ]
        left_is_small = len( left ) <= seq_len_lower_threshold
        right_is_small = len( right ) <= seq_len_lower_threshold
        if left_is_small and right_is_small:
            continue
        if not left_is_small:
            mates_file.write( "%-3d %s %s\n" % ( len( left ), seq.name + "_L", left ) )
            mates_mapping_file.write( "%s %s %s %s\n" % ( seq.name + "_L", seq.name, 0, size - start ) )
        if not right_is_small:
            mates_file.write( "%-3d %s %s\n" % ( len( right ), seq.name + "_R", right ) )
            mates_mapping_file.write( "%s %s %s %s\n" % ( seq.name + "_R", seq.name, end, 0 ) )
        # Mark the read as processed so duplicate records are detected.
        read_to_linker_dict[ seq.name ] = ""
    mates_file.close()
    mates_mapping_file.close()
    # Create temporary files for short and long mates
    tmp_mates_short_file_name = get_tmp_file_name( suffix='mates.short' )
    tmp_mates_long_file_name = get_tmp_file_name( suffix='mates.long' )
    tmp_mates_short = open( tmp_mates_short_file_name, 'w+b' )
    tmp_mates_long = open( tmp_mates_long_file_name, 'w+b' )
    # Partition mates by length so short and long mates can be aligned with
    # different lastz settings later.
    for line in open( tmp_mates_file_name, 'rb' ):
        fields = line.split()
        seq_len = int( fields[0] )
        seq_name = fields[1]
        seq_text = fields[2]
        if seq_len <= short_mate_cutoff:
            tmp_mates_short.write( ">%s\n%s\n" % ( seq_name, seq_text ) )
        else:
            tmp_mates_long.write( ">%s\n%s\n" % ( seq_name, seq_text ) )
    tmp_mates_short.close()
    tmp_mates_long.close()
    return tmp_mates_mapping_file_name, tmp_mates_file_name, tmp_mates_short_file_name, tmp_mates_long_file_name
---|
277 | |
---|
def align_mates( input1, ref_source, ref_name, ref_sequences, tmp_mates_short_file_name, tmp_mates_long_file_name ):
    """
    Run lastz to align the short and long mate fasta files against the
    reference, returning the list of temporary output file names produced.

    input1: reference file -- a fasta dataset from the history when
        ref_source == 'history', otherwise a locally cached .2bit file.
    ref_source: 'history' selects the per-sequence fasta strategy; any
        other value selects the per-chromosome 2bit strategy.
    ref_name: text appended to the lastz target specifier in the command
        line; may be empty.  # NOTE(review): exact semantics depend on the
        caller -- confirm.
    ref_sequences: number of sequences in a history reference; not used by
        this function.
    tmp_mates_short_file_name, tmp_mates_long_file_name: fasta files of
        short and long mates (as produced by split_paired_reads()).

    Short and long mates are aligned with different lastz parameter sets;
    all output is written in softsam- format.  The caller is responsible
    for removing the returned temporary files.
    """
    tmp_align_file_names = []
    if ref_source == 'history':
        # Reference is a fasta dataset from the history
        # Create temporary files to contain the output from lastz executions
        tmp_short_file_name = get_tmp_file_name( suffix='short_out' )
        tmp_align_file_names.append( tmp_short_file_name )
        tmp_long_file_name = get_tmp_file_name( suffix='long_out' )
        tmp_align_file_names.append( tmp_long_file_name )
        seqs = 0
        fasta_reader = FastaReader( open( input1 ) )
        while True:
            # Read the next sequence from the reference dataset.  Note that if the reference contains
            # a small number of chromosomes this loop is ok, but in many cases the genome has a bunch
            # of small straggler scaffolds and contigs and it is a computational waste to do each one
            # of these in its own run.  There is an I/O down side to running by subsets (even if they are
            # one sequence per subset), compared to splitting the reference into sizes of 250 mb.  With
            # the subset action, lastz still has to read and parse the entire file for every run (this
            # is true for fasta, but for .2bit files it can access each sequence directly within the file,
            # so the overhead is minimal).
            """
            :> output_file (this creates the output file, empty)
            while there are more sequences to align
                find the next sequences that add up to 250M, put their names in farf.names
                lastz ${refFile}[subset=farf.names][multi][unmask] ${matesPath}/${matesFile} ...
                  >> output_file
            """
            seq = fasta_reader.next()
            if not seq:
                break
            seqs += 1
            # Create a temporary file to contain the current sequence as input to lastz.
            # We're doing this a bit differently here since we could be generating a huge
            # number of temporary files.
            tmp_in_fd, tmp_in_file_name = tempfile.mkstemp( suffix='seq_%d_in' % seqs )
            tmp_in_file = os.fdopen( tmp_in_fd, 'w+b' )
            tmp_in_file.write( '>%s\n%s\n' % ( seq.name, seq.text ) )
            tmp_in_file.close()
            # Align short mates
            command = 'lastz %s[unmask]%s %s ' % ( tmp_in_file_name, ref_name, tmp_mates_short_file_name )
            command += 'Z=1 --seed=1111111011111 --notrans --maxwordcount=90% --match=1,3 O=1 E=3 X=15 K=10 Y=12 L=18 --ambiguousn --noytrim --identity=95 --coverage=80 --continuity=95 --format=softsam- '
            command += '>> %s' % tmp_short_file_name
            run_command( command )
            # Align long mates
            command = 'lastz %s[unmask]%s %s ' % ( tmp_in_file_name, ref_name, tmp_mates_long_file_name )
            command += 'Z=15 W=13 --notrans --exact=18 --maxwordcount=90% --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --noytrim --identity=95 --coverage=90 --continuity=95 --format=softsam- '
            command += '>> %s' % tmp_long_file_name
            run_command( command )
            # Remove the temporary file that contains the current sequence
            os.remove( tmp_in_file_name )
    else:
        # Reference is a locally cached 2bit file, split lastz calls across number of chroms in 2bit file
        tbf = TwoBitFile( open( input1, 'rb' ) )
        for chrom in tbf.keys():
            # Align short mates
            tmp_short_file_name = get_tmp_file_name( suffix='short_vs_%s' % chrom )
            tmp_align_file_names.append( tmp_short_file_name )
            command = 'lastz %s/%s[unmask]%s %s ' % ( input1, chrom, ref_name, tmp_mates_short_file_name )
            command += 'Z=1 --seed=1111111011111 --notrans --maxwordcount=90% --match=1,3 O=1 E=3 X=15 K=10 Y=12 L=18 --ambiguousn --noytrim --identity=95 --coverage=80 --continuity=95 --format=softsam- '
            command += '> %s' % tmp_short_file_name
            run_command( command )
            # Align long mates
            tmp_long_file_name = get_tmp_file_name( suffix='long_vs_%s' % chrom )
            tmp_align_file_names.append( tmp_long_file_name )
            command = 'lastz %s/%s[unmask]%s %s ' % ( input1, chrom, ref_name, tmp_mates_long_file_name )
            command += 'Z=15 W=13 --notrans --exact=18 --maxwordcount=90% --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --noytrim --identity=95 --coverage=90 --continuity=95 --format=softsam- '
            command += '> %s' % tmp_long_file_name
            run_command( command )
    return tmp_align_file_names
---|
347 | |
---|
348 | def paired_mate_unmapper( input2, input4, tmp_mates_mapping_file_name, tmp_align_file_name_list, output ): |
---|
349 | """ |
---|
350 | Given a SAM file corresponding to alignments of *subsegments* of paired 'reads' to a reference sequence, |
---|
351 | convert the positions on the subsegments to positions on the reads. Also (optionally) add quality values. |
---|
352 | |
---|
353 | The input file is in SAM format, as shown below. Each line represents the alignment of a part of a read |
---|
354 | to a reference sequence. Read pairs are indicated by suffixes in their names. Normally, the suffixes _L |
---|
355 | and _R indicate the left and right mates of reads (this can be overridden with the --left and --right |
---|
356 | options). Reads that were not mates have no suffix. |
---|
357 | |
---|
358 | (SAM header lines omitted) |
---|
359 | F2YP0BU02G7LK5_R 16 chr21 15557360 255 40M * 0 0 ATTTTATTCTCTTTGAAGCAATTGTGAATGGGAGTTTACT * |
---|
360 | F2YP0BU02HXV58_L 16 chr21 15952091 255 40M6S * 0 0 GCAAATTGTGCTGCTTTAAACATGCGTGTGCAAGTATCTTtttcat * |
---|
361 | F2YP0BU02HREML_R 0 chr21 16386077 255 33M5S * 0 0 CCAAAGTTCTGGGATTACAGGCGTGAGCCATCGcgccc * |
---|
362 | F2YP0BU02IOF1F_L 0 chr21 17567321 255 7S28M * 0 0 taaagagAAGAATTCTCAACCCAGAATTTCATATC * |
---|
363 | F2YP0BU02IKX84_R 16 chr21 18491628 255 22M1D18M9S * 0 0 GTCTCTACCAAAAAATACAAAAATTAGCCGGGCGTGGTGGcatgtctgt * |
---|
364 | F2YP0BU02GW5VA_L 16 chr21 20255344 255 6S32M * 0 0 caagaaCAAACACATTCAAAAGCTAGTAGAAGGCAAGA * |
---|
365 | F2YP0BU02JIMJ4_R 0 chr21 22383051 255 19M * 0 0 CCCTTTATCATTTTTTATT * |
---|
366 | F2YP0BU02IXZGF_L 16 chr21 23094798 255 13M1I18M * 0 0 GCAAGCTCCACTTCCCGGGTTCACGCCATTCT * |
---|
367 | F2YP0BU02IODR5_L 0 chr21 30935325 255 37M * 0 0 GAAATAAAGGGTATTCAATTAGGAAAAGAGGAAGTCA * |
---|
368 | F2YP0BU02IMZBL_L 16 chr21 31603486 255 28M1D1M * 0 0 ATACAAAAATTAGCCGGGCACAGTGGCAG * |
---|
369 | F2YP0BU02JA9PR_L 16 chr21 31677159 255 23M * 0 0 CACACCTGTAACCCCAGCACTTT * |
---|
370 | F2YP0BU02HKC61_R 0 chr21 31678718 255 40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT * |
---|
371 | F2YP0BU02HKC61_R 0 chr21 31678718 255 40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT * |
---|
372 | F2YP0BU02HVA88 16 chr21 31703558 255 1M1D35M8S * 0 0 TGGGATTACAGGCGTGAGCTACCACACCCAGCCAGAgttcaaat * |
---|
373 | F2YP0BU02JDCF1_L 0 chr21 31816600 255 38M * 0 0 AGGAGAATCGCTTGAACCCAGGAGGCAGAGGTTGCGGT * |
---|
374 | F2YP0BU02GZ1GO_R 0 chr21 33360122 255 6S38M * 0 0 cctagaCTTCACACACACACACACACACACACACACACACACAC * |
---|
375 | F2YP0BU02FX387_L 16 chr22 14786201 255 26M * 0 0 TGGATGAAGCTGGAAACCATCATTCT * |
---|
376 | F2YP0BU02IF2NE_R 0 chr22 16960842 255 40M10S * 0 0 TGGCATGCACCTGTAGTCTCAGCTACTTGGGAGGCTGAGGtgggaggatc * |
---|
377 | F2YP0BU02F4TVA 0 chr22 19200522 255 49M * 0 0 CCTGGGAGGCGGAGGTTGCAGTGAGCCGAGATCACGCCATTGCACTCCA * |
---|
378 | F2YP0BU02HKC61_R 16 chr22 29516998 255 8S32M * 0 0 agacagagTCTTGCTTTGTCACCCAGGCTGGAGTGCAGTG * |
---|
379 | F2YP0BU02FS4EM_R 0 chr22 30159364 255 29M * 0 0 CTCCTGCCTCAGCCTCCCGAGTAGTTGGG * |
---|
380 | F2YP0BU02G197P_L 0 chr22 32044496 255 40M10S * 0 0 TTGTTGGACATTTGGGTTGGTTCCAAGTCTTTGCTATTGTgaataatgcc * |
---|
381 | F2YP0BU02FIING 16 chr22 45959944 255 3M1I11M1I26M * 0 0 AGCTATGGTACTGGCTATGAAAGCAGACACATAGACCAATGG * |
---|
382 | F2YP0BU02GUB9L_L 16 chr22 49198404 255 16M1I20M * 0 0 CACCACGCTCGGCTAATTTTTGTATTTTTAGTAGAGA * |
---|
383 | |
---|
384 | The user must provide a mapping file (which might better be called an unmapping file). This file is usually |
---|
385 | created by split_paired_reads, and tells us how to map the subsegments back to original coordinates in a single |
---|
386 | read (this means the left and right mates were part of a single read). The mapping file contains four columns. |
---|
387 | The first two give the mates's name (including the suffix) and the read name. The last two columns describe how |
---|
388 | much of the full original sequence is missing from the mate. For example, in the read below, the left mate is |
---|
389 | missing 63 on the right (42 for the linker and 21 for the right half). The right mate is missing 339 on the left. |
---|
390 | |
---|
391 | left half: TTTCAACATATGCAAATCAATAAATGTAATCCAGCATATAAACAGAACCA |
---|
392 | AAGACAAAAACCACATGATTATCTCAATAGATGCAGAAAAGGCCTTCGGC |
---|
393 | AAAATTCAACAAAACTCCATGCTAAAACTCTCAATAAGGTATTGATGGGA |
---|
394 | CATGCCGCATAATAATAAGACATATCTATGACAAACCCACAGCCAATATC |
---|
395 | ATGCTGAATGCACAAAAATTGGAAGCATTCCCTTTGAAAACTGGCACAAG |
---|
396 | ACTGGGATGCCCTCTCTCACAACTCCTATTCAACATAGTGTTGGAAG |
---|
397 | linker: CGTAATAACTTCGTATAGCATACATTATACGAAGTCATACGA |
---|
398 | right half: CTCCTGCCTCAGCCTCCCGAG |
---|
399 | |
---|
400 | mate_name read_name offset_to_start offset_from_end |
---|
401 | F2YP0BU02FS4EM_L F2YP0BU02FS4EM 0 71 |
---|
402 | F2YP0BU02FS4EM_R F2YP0BU02FS4EM 339 0 |
---|
403 | |
---|
404 | The user can also specify a quality scores file, which should look something like this. Quality values are presumed |
---|
405 | to be PHRED scores, written in space-delimited decimal. |
---|
406 | |
---|
407 | >F2YP0BU02FS4EM |
---|
408 | 38 38 38 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 38 21 21 21 40 |
---|
409 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 33 |
---|
410 | 32 32 40 40 40 21 21 18 18 21 34 34 31 40 40 40 40 40 40 40 40 40 40 40 40 |
---|
411 | 40 40 40 40 40 40 40 40 40 40 40 32 32 32 32 40 40 40 40 40 40 40 34 34 35 |
---|
412 | 31 31 28 28 33 33 33 36 36 36 17 17 17 19 26 36 36 36 40 40 40 40 40 33 34 |
---|
413 | 34 34 39 39 39 40 40 40 40 40 33 33 34 34 40 40 40 40 40 40 40 39 39 39 40 |
---|
414 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 |
---|
415 | 40 40 40 40 40 40 40 39 39 39 39 39 39 40 40 40 39 39 39 40 40 40 40 40 40 |
---|
416 | 40 40 40 40 40 40 40 40 40 40 40 40 40 26 26 26 26 26 40 40 38 38 37 35 33 |
---|
417 | 36 40 19 17 17 17 17 19 19 23 30 20 20 20 23 35 40 36 36 36 36 36 36 36 36 |
---|
418 | 39 40 34 20 27 27 35 39 40 37 40 40 40 40 40 40 40 40 40 40 34 34 35 39 40 |
---|
419 | 40 40 40 40 40 40 39 39 39 40 40 40 40 36 36 32 32 28 28 29 30 36 40 30 26 |
---|
420 | 26 26 34 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 |
---|
421 | 40 39 35 34 34 40 40 40 40 30 30 30 35 40 40 40 40 40 39 39 36 40 40 40 40 |
---|
422 | 39 39 39 39 30 30 28 35 35 39 40 40 40 40 40 35 35 35 |
---|
423 | >F2YP0BU02G197P |
---|
424 | 40 40 40 40 40 40 40 40 40 40 39 39 39 39 39 39 40 40 40 40 40 40 40 40 40 |
---|
425 | 40 40 40 40 26 26 26 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 |
---|
426 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 |
---|
427 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 |
---|
428 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 34 34 34 40 40 |
---|
429 | 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 |
---|
430 | 40 40 40 40 40 40 40 34 34 34 34 40 40 40 40 34 34 34 34 40 40 40 40 40 40 |
---|
431 | 40 40 40 40 40 39 39 39 34 34 34 34 40 40 40 40 39 39 25 25 26 39 40 40 40 |
---|
432 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 |
---|
433 | 33 33 33 33 40 35 21 21 21 30 38 40 40 40 40 40 40 40 40 35 35 30 30 30 40 |
---|
434 | 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 |
---|
435 | 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 |
---|
436 | 40 40 40 39 39 39 40 40 |
---|
437 | >F2YP0BU02FIING |
---|
438 | 32 32 32 25 25 25 25 24 25 30 31 30 27 27 27 28 28 21 19 19 13 13 13 14 19 |
---|
439 | 19 17 19 16 16 25 28 22 21 17 17 18 25 24 25 25 25 |
---|
440 | |
---|
441 | The output file is also SAM: |
---|
442 | |
---|
443 | (SAM header lines omitted) |
---|
444 | F2YP0BU02G7LK5 81 chr21 15557360 255 40M303H * 0 0 ATTTTATTCTCTTTGAAGCAATTGTGAATGGGAGTTTACT D>>>>IIIIIIHHG???IIIIIIIIIHHHFFEIH999HII |
---|
445 | F2YP0BU02HXV58 145 chr21 15952091 255 226H40M6S * 0 0 GCAAATTGTGCTGCTTTAAACATGCGTGTGCAAGTATCTTtttcat AA===DDDDAAAAD???:::ABBBBBAAA:888ECF;F>>>?8??@ |
---|
446 | F2YP0BU02HREML 65 chr21 16386077 255 320H33M5S * 0 0 CCAAAGTTCTGGGATTACAGGCGTGAGCCATCGcgccc HH???HHIIIHFHIIIIIIICDDHHIIIIIIHHHHHHH |
---|
447 | F2YP0BU02IOF1F 129 chr21 17567321 255 7S28M409H * 0 0 taaagagAAGAATTCTCAACCCAGAATTTCATATC 4100<<A>4113:<EFGGGFFFHHHHHHDFFFFED |
---|
448 | F2YP0BU02IKX84 81 chr21 18491628 255 22M1D18M9S341H * 0 0 GTCTCTACCAAAAAATACAAAAATTAGCCGGGCGTGGTGGcatgtctgt ;;;=7@.55------?2?11112GGB=CCCCDIIIIIIIIIHHHHHHII |
---|
449 | F2YP0BU02GW5VA 145 chr21 20255344 255 286H6S32M * 0 0 caagaaCAAACACATTCAAAAGCTAGTAGAAGGCAAGA IIIIIIIHHHIIIIIIICCCCIIIIIIIIIIIIIIIII |
---|
450 | F2YP0BU02JIMJ4 65 chr21 22383051 255 208H19M * 0 0 CCCTTTATCATTTTTTATT 555544E?GE113344I22 |
---|
451 | F2YP0BU02IXZGF 145 chr21 23094798 255 291H13M1I18M * 0 0 GCAAGCTCCACTTCCCGGGTTCACGCCATTCT IIIIIIIIIIIGG;;;GGHIIIIIGGGIIIII |
---|
452 | F2YP0BU02IODR5 129 chr21 30935325 255 37M154H * 0 0 GAAATAAAGGGTATTCAATTAGGAAAAGAGGAAGTCA 6...7/--..,30;9<<>@BFFFAAAAHIIIIIH@@@ |
---|
453 | F2YP0BU02IMZBL 145 chr21 31603486 255 342H28M1D1M * 0 0 ATACAAAAATTAGCCGGGCACAGTGGCAG BB1552222<<>9==8;;?AA=??A???A |
---|
454 | F2YP0BU02JA9PR 145 chr21 31677159 255 229H23M * 0 0 CACACCTGTAACCCCAGCACTTT IIIIIIIIIIICCCCIIIIIHHH |
---|
455 | F2YP0BU02HKC61 65 chr21 31678718 255 300H40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT AA@BD:::==AAA@A?8888:<90004<>>?><<<<4442 |
---|
456 | F2YP0BU02HKC61 65 chr21 31678718 255 300H40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT AA@BD:::==AAA@A?8888:<90004<>>?><<<<4442 |
---|
457 | F2YP0BU02HVA88 16 chr21 31703558 255 1M1D35M8S * 0 0 TGGGATTACAGGCGTGAGCTACCACACCCAGCCAGAgttcaaat >8888DFFHHGFHHHH@@?@?DDC96666HIIIFFFFFFFFFFF |
---|
458 | F2YP0BU02JDCF1 129 chr21 31816600 255 38M103H * 0 0 AGGAGAATCGCTTGAACCCAGGAGGCAGAGGTTGCGGT IIIIIIIIIIIHHHIIHHHIIIIIIIIIIIIIIIIIII |
---|
459 | F2YP0BU02GZ1GO 65 chr21 33360122 255 76H6S38M * 0 0 cctagaCTTCACACACACACACACACACACACACACACACACAC BBBBD?:688CFFFFFFFFFFFFFFFFFFFFFFFFFFDDBBB51 |
---|
460 | F2YP0BU02FX387 145 chr22 14786201 255 201H26M * 0 0 TGGATGAAGCTGGAAACCATCATTCT IIHHHHHHHHHHHHHFFFFFFFFFFF |
---|
461 | F2YP0BU02IF2NE 65 chr22 16960842 255 209H40M10S * 0 0 TGGCATGCACCTGTAGTCTCAGCTACTTGGGAGGCTGAGGtgggaggatc BAAADDDDFDDDDDDBBA889<A?4444000@<>AA?9444;;8>77<7- |
---|
462 | F2YP0BU02F4TVA 0 chr22 19200522 255 49M * 0 0 CCTGGGAGGCGGAGGTTGCAGTGAGCCGAGATCACGCCATTGCACTCCA FFF???FFFFFIIIIIIIIIIIIIIIIIIIIIIIHHIIFHFFFGDDB=5 |
---|
463 | F2YP0BU02HKC61 81 chr22 29516998 255 8S32M300H * 0 0 agacagagTCTTGCTTTGTCACCCAGGCTGGAGTGCAGTG 2444<<<<>?>><40009<:8888?A@AAA==:::DB@AA |
---|
464 | F2YP0BU02FS4EM 65 chr22 30159364 255 339H29M * 0 0 CTCCTGCCTCAGCCTCCCGAGTAGTTGGG IIIIHHEIIIIHHHH??=DDHIIIIIDDD |
---|
465 | F2YP0BU02G197P 129 chr22 32044496 255 40M10S258H * 0 0 TTGTTGGACATTTGGGTTGGTTCCAAGTCTTTGCTATTGTgaataatgcc IIIIIIIIIIHHHHHHIIIIIIIIIIIII;;;IIIIIIIIIIIIIIIIII |
---|
466 | F2YP0BU02FIING 16 chr22 45959944 255 3M1I11M1I26M * 0 0 AGCTATGGTACTGGCTATGAAAGCAGACACATAGACCAATGG :::9:32267=:114244/...446==<<<?@?:9::::AAA |
---|
467 | F2YP0BU02GUB9L 145 chr22 49198404 255 176H16M1I20M * 0 0 CACCACGCTCGGCTAATTTTTGTATTTTTAGTAGAGA IIIIIIIIIHAAC;<</////@4F5778;IIIIIIII |
---|
468 | |
---|
469 | """ |
---|
470 | left_suffix = "_L" |
---|
471 | right_suffix = "_R" |
---|
472 | # Read the mapping |
---|
473 | mate_to_read_dict = {} |
---|
474 | i = 0 |
---|
475 | for i, line in enumerate( file( tmp_mates_mapping_file_name, 'rb' ) ): |
---|
476 | line = line.strip() |
---|
477 | if not line.startswith( "#" ): |
---|
478 | fields = line.split() |
---|
479 | if len( fields ) != 4: |
---|
480 | skip_line( "num_fields", i+1, line ) |
---|
481 | continue |
---|
482 | mate_name, read_name, s_offset, e_offset = fields |
---|
483 | if mate_name in mate_to_read_dict: |
---|
484 | skip_line( 'two_mate_names', i+1, mate_name ) |
---|
485 | continue |
---|
486 | mate_to_read_dict[ mate_name ] = ( read_name, int( s_offset ), int( e_offset ) ) |
---|
487 | # Read sequence data |
---|
488 | read_to_nucs_dict = {} |
---|
489 | seqs = 0 |
---|
490 | fasta_reader = FastaReader( file( input2, 'rb' ) ) |
---|
491 | while True: |
---|
492 | seq = fasta_reader.next() |
---|
493 | if not seq: |
---|
494 | break |
---|
495 | seqs += 1 |
---|
496 | seq_text_upper = seq.text.upper() |
---|
497 | if seq.name in read_to_nucs_dict: |
---|
498 | if seq_text_upper != read_to_nucs_dict[ seq.name ]: |
---|
499 | skip_line( 'inconsistent_reads', seqs, seq.name ) |
---|
500 | continue |
---|
501 | read_to_nucs_dict[ seq.name ] = seq_text_upper |
---|
502 | # Read quality data |
---|
503 | def quality_sequences( f ): |
---|
504 | seq_name = None |
---|
505 | seq_quals = None |
---|
506 | line_number = 0 |
---|
507 | for line in f: |
---|
508 | line_number += 1 |
---|
509 | line = line.strip() |
---|
510 | if line.startswith( ">" ): |
---|
511 | if seq_name != None: |
---|
512 | yield ( seq_name, seq_quals, seq_line ) |
---|
513 | seq_name = sequence_name( line ) |
---|
514 | seq_line = line_number |
---|
515 | seq_quals = [] |
---|
516 | elif seq_name is None: |
---|
517 | skip_line( 'no_header', line_number, line ) |
---|
518 | continue |
---|
519 | else: |
---|
520 | seq_quals += [ int( q ) for q in line.split() ] |
---|
521 | if seq_name is not None: |
---|
522 | yield ( seq_name, seq_quals, seq_line ) |
---|
523 | def sequence_name( s ): |
---|
524 | s = s[ 1: ].strip() |
---|
525 | if not s: |
---|
526 | return "" |
---|
527 | else: |
---|
528 | return s.split()[ 0 ] |
---|
529 | read_to_quals_dict = {} |
---|
530 | # TODO: should we use Dan's fastaNamedReader here? |
---|
531 | for seq_name, quals, line_number in quality_sequences( file( input4 ) ): |
---|
532 | quals = samify_phred_scores( quals ) |
---|
533 | if seq_name in read_to_quals_dict: |
---|
534 | if quals != read_to_quals_dict[ seq_name ]: |
---|
535 | skip_line( 'inconsistent_reads', line_number, seq_name ) |
---|
536 | continue |
---|
537 | if len( quals ) != len( read_to_nucs_dict[ seq_name ] ): |
---|
538 | skip_line( 'inconsistent_read_lengths', line_number, seq_name ) |
---|
539 | continue |
---|
540 | read_to_quals_dict[ seq_name ] = quals |
---|
541 | # process the SAM file |
---|
542 | tmp_align_file_names = ' '.join( tmp_align_file_name_list ) |
---|
543 | combined_chrom_file_name = get_tmp_file_name( suffix='combined_chrom' ) |
---|
544 | command = 'cat %s | grep -v "^@" | sort -k 1 > %s' % ( tmp_align_file_names, combined_chrom_file_name ) |
---|
545 | run_command( command ) |
---|
546 | fout = file( output, 'w+b' ) |
---|
547 | has_non_header = False |
---|
548 | i = 0 |
---|
549 | for i, line in enumerate( file( combined_chrom_file_name, 'rb' ) ): |
---|
550 | line = line.strip() |
---|
551 | if line.startswith( "@" ): |
---|
552 | if has_non_header: |
---|
553 | skip_line( 'sam_headers', i+1, line ) |
---|
554 | continue |
---|
555 | fout.write( "%s\n" % line ) |
---|
556 | continue |
---|
557 | has_non_header = True |
---|
558 | fields = line.split() |
---|
559 | num_fields = len( fields ) |
---|
560 | if num_fields < SAM_MIN_COLUMNS: |
---|
561 | skip_line( 'sam_min_columns', i+1, line ) |
---|
562 | continue |
---|
563 | # Set flags for mates |
---|
564 | try: |
---|
565 | flag = int( fields[ SAM_FLAG_COLUMN ] ) |
---|
566 | except ValueError: |
---|
567 | skip_line( 'sam_flag', i+1, line ) |
---|
568 | continue |
---|
569 | if not( flag & ( BAM_FPAIRED + BAM_FREAD1 + BAM_FREAD2 ) == 0 ): |
---|
570 | skip_line( 'reads_paired', i+1, line ) |
---|
571 | continue |
---|
572 | mate_name = fields[ SAM_QNAME_COLUMN ] |
---|
573 | unmap_it = False |
---|
574 | half = None |
---|
575 | if mate_name.endswith( left_suffix ): |
---|
576 | flag += BAM_FPAIRED + BAM_FREAD2 |
---|
577 | fields[ SAM_FLAG_COLUMN ] = "%d" % flag |
---|
578 | unmap_it = True |
---|
579 | half = "L" |
---|
580 | elif mate_name.endswith( right_suffix ): |
---|
581 | flag += BAM_FPAIRED + BAM_FREAD1 |
---|
582 | fields[ SAM_FLAG_COLUMN ] = "%d" % flag |
---|
583 | unmap_it = True |
---|
584 | half = "R" |
---|
585 | on_plus_strand = ( flag & BAM_FREVERSE == 0 ) |
---|
586 | # Convert position from mate to read by adding clipping to cigar |
---|
587 | if not unmap_it: |
---|
588 | read_name = mate_name |
---|
589 | else: |
---|
590 | try: |
---|
591 | read_name, s_offset, e_offset = mate_to_read_dict[ mate_name ] |
---|
592 | except KeyError: |
---|
593 | skip_line( 'missing_mate', i+1, mate_name ) |
---|
594 | continue |
---|
595 | cigar = fields[ SAM_CIGAR_COLUMN ] |
---|
596 | cigar_prefix = None |
---|
597 | cigar_suffix = None |
---|
598 | if half == "L": |
---|
599 | if on_plus_strand: |
---|
600 | if s_offset > 0: |
---|
601 | cigar_prefix = ( s_offset, "S" ) |
---|
602 | if e_offset > 0: |
---|
603 | cigar_suffix = ( e_offset, "H" ) |
---|
604 | else: |
---|
605 | if e_offset > 0: |
---|
606 | cigar_prefix = ( e_offset, "H" ) |
---|
607 | if s_offset > 0: |
---|
608 | cigar_suffix = ( s_offset, "S" ) |
---|
609 | elif half == "R": |
---|
610 | if on_plus_strand: |
---|
611 | if s_offset > 0: |
---|
612 | cigar_prefix = ( s_offset, "H" ) |
---|
613 | if e_offset > 0: |
---|
614 | cigar_suffix = ( e_offset, "S" ) |
---|
615 | else: |
---|
616 | if e_offset > 0: |
---|
617 | cigar_prefix = ( e_offset, "S" ) |
---|
618 | if s_offset > 0: |
---|
619 | cigar_suffix = ( s_offset, "H" ) |
---|
620 | else: |
---|
621 | if on_plus_strand: |
---|
622 | if s_offset > 0: |
---|
623 | cigar_prefix = ( s_offset, "S" ) |
---|
624 | if e_offset > 0: |
---|
625 | cigar_suffix = ( e_offset, "S" ) |
---|
626 | else: |
---|
627 | if e_offset > 0: |
---|
628 | cigar_prefix = ( e_offset, "S" ) |
---|
629 | if s_offset > 0: |
---|
630 | cigar_suffix = ( s_offset, "S" ) |
---|
631 | if cigar_prefix != None: |
---|
632 | count, op = cigar_prefix |
---|
633 | cigar = prefix_cigar( "%d%s" % ( count, op ), cigar ) |
---|
634 | if op == "S": |
---|
635 | refPos = int( fields[ SAM_POS_COLUMN ] ) - count |
---|
636 | fields[ SAM_POS_COLUMN ] = "%d" % refPos |
---|
637 | if cigar_suffix != None: |
---|
638 | count, op = cigar_suffix |
---|
639 | cigar = suffix_cigar( cigar,"%d%s" % ( count, op) ) |
---|
640 | fields[ SAM_QNAME_COLUMN ] = read_name |
---|
641 | fields[ SAM_CIGAR_COLUMN ] = cigar |
---|
642 | # Fetch sequence and quality values, and flip/clip them |
---|
643 | if read_name not in read_to_nucs_dict: |
---|
644 | skip_line( 'missing_seq', i+1, read_name ) |
---|
645 | continue |
---|
646 | nucs = read_to_nucs_dict[ read_name ] |
---|
647 | if not on_plus_strand: |
---|
648 | nucs = reverse_complement( nucs ) |
---|
649 | quals = None |
---|
650 | if read_to_quals_dict != None: |
---|
651 | if read_name not in read_to_quals_dict: |
---|
652 | skip_line( 'missing_quals', i+1, read_name ) |
---|
653 | continue |
---|
654 | quals = read_to_quals_dict[ read_name ] |
---|
655 | if not on_plus_strand: |
---|
656 | quals = reverse_string( quals ) |
---|
657 | cigar = split_cigar( fields[ SAM_CIGAR_COLUMN ] ) |
---|
658 | nucs, quals = clip_for_cigar( cigar, nucs, quals ) |
---|
659 | fields[ SAM_SEQ_COLUMN ] = nucs |
---|
660 | if quals != None: |
---|
661 | fields[ SAM_QUAL_COLUMN ] = quals |
---|
662 | # Output the line |
---|
663 | fout.write( "%s\n" % "\t".join( fields ) ) |
---|
664 | fout.close() |
---|
665 | |
---|
def prefix_cigar( prefix, cigar ):
    """Prepend a single cigar operation (e.g. "5S") to a cigar string,
    merging the counts when the cigar already begins with the same op."""
    digits = 0
    while cigar[ digits ].isdigit():
        digits += 1
    if prefix[ -1 ] != cigar[ digits ]:
        # Different leading operation; just concatenate.
        return prefix + cigar
    # Same operation; collapse the two runs into one.
    merged = int( prefix[ :-1 ] ) + int( cigar[ :digits ] )
    return "%d%s%s" % ( merged, prefix[ -1 ], cigar[ digits+1: ] )
---|
674 | |
---|
def suffix_cigar( cigar, suffix ):
    """Append a single cigar operation (e.g. "5S") to a cigar string,
    merging the counts when the cigar already ends with the same op."""
    if cigar[ -1 ] != suffix[ -1 ]:
        # Different trailing operation; just concatenate.
        return cigar + suffix
    # Walk left past the digits of the final run.
    pos = len( cigar ) - 2
    while cigar[ pos ].isdigit():
        pos -= 1
    pos += 1
    # Same operation; collapse the two runs into one.
    merged = int( cigar[ pos:-1 ] ) + int( suffix[ :-1 ] )
    return "%s%d%s" % ( cigar[ :pos ], merged, suffix[ -1 ] )
---|
684 | |
---|
def split_cigar( text ):
    """Parse a cigar string into a list of ( count, op ) tuples.

    Recognized operations are M, I, D, H and S.  Raises ValueError when an
    operation has no preceding count, when the string ends mid-count, or
    when a count contains a non-digit character.
    """
    ops = []
    digits = ""
    for ch in text:
        if ch in "MIDHS":
            if not digits:
                # An operation letter with no count in front of it.
                raise ValueError
            ops.append( ( int( digits ), ch ) )
            digits = ""
        else:
            digits += ch
    if digits:
        # Trailing count with no operation letter.
        raise ValueError
    return ops
---|
699 | |
---|
def clip_for_cigar( cigar, nucs, quals ):
    """Apply the clipping operations at either end of a parsed cigar to a
    read sequence.

    Hard clips ("H") remove the clipped bases (and the matching quality
    characters, when quals is not None); soft clips ("S") keep the bases
    but mark them in lowercase.  A hard clip at an end may be immediately
    followed by a soft clip at the same end.
    """
    # Leading clips.
    n, kind = cigar[ 0 ]
    if kind == "H":
        # Drop hard-clipped bases/quals, then check the next op for a soft clip.
        nucs = nucs[ n: ]
        if quals != None:
            quals = quals[ n: ]
        n, kind = cigar[ 1 ]
    if kind == "S":
        # Lowercase the soft-clipped prefix.
        nucs = nucs[ :n ].lower() + nucs[ n: ]
    # Trailing clips (mirror image of the above).
    n, kind = cigar[ -1 ]
    if kind == "H":
        nucs = nucs[ :-n ]
        if quals != None:
            quals = quals[ :-n ]
        n, kind = cigar[ -2 ]
    if kind == "S":
        # Lowercase the soft-clipped suffix.
        nucs = nucs[ :-n ] + nucs[ -n: ].lower()
    return nucs, quals
---|
722 | |
---|
def samify_phred_scores( quals ):
    """
    Convert a decimal list of phred base-quality scores to a sam quality string.

    SAM encodes each score q as chr( 33 + q ), so representable values run
    from chr(33) ('!') to chr(126) ('~').  Scores outside that dynamic range
    are clipped to the nearest representable value.  An empty score list
    yields an empty string.
    """
    if not quals:
        # Guard: min()/max() below raise ValueError on an empty sequence.
        return ""
    if min( quals ) >= 0 and max( quals ) <= 126 - 33:
        # Fast path: every score is in range, no clipping needed.
        return "".join( [ chr( 33 + q ) for q in quals ] )
    else:
        return "".join( [ chr( max( 33, min( 126, 33 + q ) ) ) for q in quals ] )
---|
734 | |
---|
def reverse_complement( nucs ):
    """Return the reverse complement of a nucleotide string.  Case is
    preserved, and characters outside ACGT/acgt (e.g. N) pass through
    unchanged."""
    pairs = { "A": "T", "C": "G", "G": "C", "T": "A",
              "a": "t", "c": "g", "g": "c", "t": "a" }
    return "".join( [ pairs.get( ch, ch ) for ch in nucs[ ::-1 ] ] )
---|
738 | |
---|
def reverse_string( s ):
    """Return s with its characters in reverse order."""
    return "".join( reversed( s ) )
---|
741 | |
---|
742 | def __main__(): |
---|
743 | # Parse command line |
---|
744 | # input1: a reference genome ( 2bit or fasta ) |
---|
745 | # input2: a collection of 454 paired end reads ( a fasta file ) |
---|
746 | # input3: a linker sequence ( a very small fasta file ) |
---|
747 | # input4: a base quality score 454 file ( qual454 ) |
---|
748 | parser = optparse.OptionParser() |
---|
749 | parser.add_option( '', '--ref_name', dest='ref_name', help='The reference name to change all output matches to' ) |
---|
750 | parser.add_option( '', '--ref_source', dest='ref_source', help='The reference is cached or from the history' ) |
---|
751 | parser.add_option( '', '--ref_sequences', dest='ref_sequences', help='Number of sequences in the reference dataset' ) |
---|
752 | parser.add_option( '', '--source_select', dest='source_select', help='Use pre-set or cached reference file' ) |
---|
753 | parser.add_option( '', '--input1', dest='input1', help='The name of the reference file if using history or reference base name if using cached' ) |
---|
754 | parser.add_option( '', '--input2', dest='input2', help='The 454 reads file to align' ) |
---|
755 | parser.add_option( '', '--input3', dest='input3', help='The sequencing linker file' ) |
---|
756 | parser.add_option( '', '--input4', dest='input4', help='The base quality score 454 file' ) |
---|
757 | parser.add_option( '', '--output', dest='output', help='The output file' ) |
---|
758 | parser.add_option( '', '--lastz_seqs_file_dir', dest='lastz_seqs_file_dir', help='Directory of local lastz_seqs.loc file' ) |
---|
759 | |
---|
760 | ( options, args ) = parser.parse_args() |
---|
761 | if options.ref_name != 'None': |
---|
762 | ref_name = '[nickname=%s]' % options.ref_name |
---|
763 | else: |
---|
764 | ref_name = '' |
---|
765 | if options.ref_source == 'history': |
---|
766 | # Reference is a fasta dataset from the history |
---|
767 | try: |
---|
768 | # Ensure there is at least 1 sequence in the dataset ( this may not be necessary ). |
---|
769 | error_msg = "The reference dataset is missing metadata, click the pencil icon in the history item and 'auto-detect' the metadata attributes." |
---|
770 | ref_sequences = int( options.ref_sequences ) |
---|
771 | if ref_sequences < 1: |
---|
772 | stop_err( error_msg ) |
---|
773 | except: |
---|
774 | stop_err( error_msg ) |
---|
775 | else: |
---|
776 | ref_sequences = 0 |
---|
777 | tmp_w12_name = get_tmp_file_name( suffix='vs_linker.W12' ) |
---|
778 | tmp_T1_name = get_tmp_file_name( suffix='vs_linker.T1' ) |
---|
779 | # Run lastz twice ( with different options ) on the linker sequence and paired end reads, |
---|
780 | # looking for the linker ( each run finds some the other doesn't ) |
---|
781 | command = 'lastz %s %s W=12 --notrans --exact=18 --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --coverage=85 --format=general-:name2,zstart2+,length2,size2 > %s' % \ |
---|
782 | ( options.input3, options.input2, tmp_w12_name ) |
---|
783 | run_command( command ) |
---|
784 | command = 'lastz %s %s T=1 --match=1,2 O=1 E=2 X=15 K=10 Y=15 L=18 --ambiguousn --coverage=85 --format=general-:name2,zstart2+,length2,size2 > %s' % \ |
---|
785 | ( options.input3, options.input2, tmp_T1_name ) |
---|
786 | run_command( command ) |
---|
787 | # Combine the alignment output from the two lastz runs |
---|
788 | tmp_combined_linker_file_name = get_tmp_file_name( suffix='vs_linker' ) |
---|
789 | command = 'cat %s %s | sort -u > %s' % ( tmp_w12_name, tmp_T1_name, tmp_combined_linker_file_name ) |
---|
790 | run_command( command ) |
---|
791 | # Use the alignment info to split reads into left and right mates |
---|
792 | tmp_mates_mapping_file_name, tmp_mates_file_name, tmp_mates_short_file_name, tmp_mates_long_file_name = split_paired_reads( options.input2, tmp_combined_linker_file_name ) |
---|
793 | # Align mates to the reference - tmp_align_file_names is a list of file names created by align_mates() |
---|
794 | tmp_align_file_name_list = align_mates( options.input1, options.ref_source, ref_name, ref_sequences, tmp_mates_short_file_name, tmp_mates_long_file_name ) |
---|
795 | # Combine and convert mate coordinates back to read coordinates |
---|
796 | paired_mate_unmapper( options.input2, options.input4, tmp_mates_mapping_file_name, tmp_align_file_name_list, options.output ) |
---|
797 | # Delete all temporary files |
---|
798 | for file_name in tmp_file_names: |
---|
799 | os.remove( file_name ) |
---|
800 | # Handle any invalid lines in the input data |
---|
801 | if total_skipped_lines: |
---|
802 | msgs = dict( bad_interval="Bad interval in line", |
---|
803 | inconsistent_read_lengths="Inconsistent read/quality lengths for seq #", |
---|
804 | inconsistent_reads="Inconsistent reads for seq #", |
---|
805 | inconsistent_sizes="Inconsistent sizes for seq #", |
---|
806 | missing_mate="Mapping file does not include mate on line", |
---|
807 | missing_quals="Missing quality values for name on line", |
---|
808 | missing_seq="Missing sequence for name on line", |
---|
809 | multiple_seqs="Multiple names for seq #", |
---|
810 | no_header="First quality sequence has no header", |
---|
811 | num_fields="Must have 4 fields in line", |
---|
812 | reads_paired="SAM flag indicates reads already paired on line", |
---|
813 | sam_flag="Bad SAM flag on line", |
---|
814 | sam_headers="SAM headers on line", |
---|
815 | sam_min_columns="Need 11 columns on line", |
---|
816 | two_mate_names="Mate name already seen, line", |
---|
817 | wrong_seq_len="Size differs from length of seq #" ) |
---|
818 | print "Skipped %d invalid lines: " |
---|
819 | msg = "" |
---|
820 | for k, v in skipped_lines.items(): |
---|
821 | if v[0]: |
---|
822 | # v[0] is the number of times the error occurred |
---|
823 | # v[1] is the position of the line or sequence in the file |
---|
824 | # v[2] is the name of the sequence or the text of the line |
---|
825 | msg += "(%d)%s %d:%s. " % ( v[0], msgs[k], v[1], v[2] ) |
---|
826 | print msg |
---|
827 | |
---|
if __name__ == "__main__":
    __main__()
---|