1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | """ |
---|
4 | Runs Lastz paired read alignment process |
---|
5 | Written for Lastz v. 1.02.00. |
---|
6 | |
---|
7 | # Author(s): based on various scripts written by Bob Harris (rsharris@bx.psu.edu), |
---|
8 | # then tweaked to this form by Greg Von Kuster (greg@bx.psu.edu) |
---|
9 | |
---|
10 | This tool takes the following input: |
---|
11 | a. A collection of 454 paired end reads ( a fasta file ) |
---|
12 | b. A linker sequence ( a very small fasta file ) |
---|
13 | c. A reference genome ( nib, 2bit or fasta ) |
---|
14 | |
---|
15 | and uses the following process: |
---|
16 | 1. Split reads into mates: the input to this step is the read file XXX.fasta, and the output is three |
---|
17 | files; XXX.short.fasta, XXX.long.fasta and XXX.mapping. The mapping file records the information necessary |
---|
18 | to convert mate coordinates back into the original read, which is needed later in the process. |
---|
19 | |
---|
20 | 2. Align short mates to the reference: this runs lastz against every chromosome. The input is XXX.short.fasta |
---|
21 | and the reference genome, and the output is a SAM file, XXX.short.sam. |
---|
22 | |
---|
23 | 3. Align long mates to the reference: this runs lastz against every chromosome. The input is XXX.long.fasta |
---|
24 | and the reference genome, and the output is a SAM file, XXX.long.sam. |
---|
25 | |
---|
26 | 4. Combine, and convert mate coordinates back to read coordinates. The input is XXX.mapping, XXX.short.sam and |
---|
27 | XXX.long.sam, and the output is XXX.sam. |
---|
28 | |
---|
29 | usage: lastz_paired_reads_wrapper.py [options] |
---|
30 | --ref_name: The reference name to change all output matches to |
---|
31 | --ref_source: The reference is cached or from the history |
---|
32 | --source_select: Use pre-set or cached reference file |
---|
33 | --input1: The name of the reference file if using history or reference base name if using cached |
---|
34 | --input2: The reads file to align |
---|
35 | --input3: The sequencing linker file |
---|
36 | --input4: The base quality score 454 file |
---|
37 | --ref_sequences: The number of sequences in the reference file if using one from history |
---|
38 | --output: The name of the output file |
---|
39 | --lastz_seqs_file_dir: Directory of local lastz_seqs.loc file |
---|
40 | """ |
---|
41 | import optparse, os, subprocess, shutil, sys, tempfile, time |
---|
42 | from string import maketrans |
---|
43 | |
---|
44 | from galaxy import eggs |
---|
45 | import pkg_resources |
---|
46 | pkg_resources.require( 'bx-python' ) |
---|
47 | from bx.seq.twobit import * |
---|
48 | from bx.seq.fasta import FastaReader |
---|
49 | from galaxy.util.bunch import Bunch |
---|
50 | from galaxy.util import string_as_bool |
---|
51 | |
---|
# Column indexes for SAM required fields
# (see the SAM format specification; columns are tab-separated, 0-based here)
SAM_QNAME_COLUMN = 0
SAM_FLAG_COLUMN = 1
SAM_RNAME_COLUMN = 2
SAM_POS_COLUMN = 3
SAM_MAPQ_COLUMN = 4
SAM_CIGAR_COLUMN = 5
SAM_MRNM_COLUMN = 6
SAM_MPOS_COLUMN = 7
SAM_ISIZE_COLUMN = 8
SAM_SEQ_COLUMN = 9
SAM_QUAL_COLUMN = 10
# A valid SAM alignment line has at least this many fields.
SAM_MIN_COLUMNS = 11
# SAM bit-encoded flags (same names and values as samtools' BAM_F* constants)
BAM_FPAIRED = 1 # the read is paired in sequencing, no matter whether it is mapped in a pair
BAM_FPROPER_PAIR = 2 # the read is mapped in a proper pair
BAM_FUNMAP = 4 # the read itself is unmapped; conflictive with BAM_FPROPER_PAIR
BAM_FMUNMAP = 8 # the mate is unmapped
BAM_FREVERSE = 16 # the read is mapped to the reverse strand
BAM_FMREVERSE = 32 # the mate is mapped to the reverse strand
BAM_FREAD1 = 64 # this is read1
BAM_FREAD2 = 128 # this is read2
BAM_FSECONDARY = 256 # not primary alignment
BAM_FQCFAIL = 512 # QC failure
BAM_FDUP = 1024 # optical or PCR duplicate

# Keep track of all created temporary files so they can be deleted
# NOTE: 'global' at module level is a no-op; kept from the original source.
global tmp_file_names
tmp_file_names = []
# The values in the skipped_lines dict are tuples consisting of:
# - the number of skipped lines for that error
# If not a sequence error:
# - the 1st line number on which the error was found
# - the text of the 1st line on which the error was found
# If a sequence error:
# - The number of the sequence in the file
# - the sequence name on which the error occurred
# We may need to improve dealing with file position and text as
# much of it comes from temporary files that are created from the
# inputs, and not the inputs themselves, so this could be confusing
# to the user.
global skipped_lines
skipped_lines = dict( bad_interval=( 0, 0, '' ),
                      inconsistent_read_lengths=( 0, 0, '' ),
                      inconsistent_reads=( 0, 0, '' ),
                      inconsistent_sizes=( 0, 0, '' ),
                      missing_mate=( 0, 0, '' ),
                      missing_quals=( 0, 0, '' ),
                      missing_seq=( 0, 0, '' ),
                      multiple_seqs=( 0, 0, '' ),
                      no_header=( 0, 0, '' ),
                      num_fields=( 0, 0, '' ),
                      reads_paired=( 0, 0, '' ),
                      sam_flag=( 0, 0, '' ),
                      sam_headers=( 0, 0, '' ),
                      sam_min_columns=( 0, 0, '' ),
                      two_mate_names=( 0, 0, '' ),
                      wrong_seq_len=( 0, 0, '' ) )
# Running total of all skipped lines across every error category.
global total_skipped_lines
total_skipped_lines = 0
---|
112 | |
---|
def stop_err( msg ):
    """
    Write msg to stderr and terminate with a non-zero exit status.

    Bug fix: the original called sys.exit() with no argument, which exits
    with status 0 and makes the failure invisible to shell callers that
    check the return code.
    """
    sys.stderr.write( "%s" % msg )
    sys.exit( 1 )
---|
116 | |
---|
def skip_line( error_key, position, text ):
    """
    Record one skipped input line in the global skipped_lines dict under
    error_key, remembering the position and text of the first occurrence,
    and bump the global total_skipped_lines counter.

    Bug fixes relative to the original:
    - the values in skipped_lines are tuples, which are immutable; the old
      code attempted item assignment on them (TypeError).  The tuple is now
      rebuilt and stored wholesale.
    - total_skipped_lines was incremented without a 'global' declaration,
      which would raise UnboundLocalError on the first call.
    """
    global total_skipped_lines
    count, first_position, first_text = skipped_lines[ error_key ]
    if not first_text:
        # First time this kind of error is seen: remember where.
        first_position = position
        first_text = text
    skipped_lines[ error_key ] = ( count + 1, first_position, first_text )
    total_skipped_lines += 1
---|
123 | |
---|
def get_tmp_file_name( dir=None, suffix=None ):
    """
    Return a unique temporary file name that can be managed.  The
    file must be manually removed after use.

    The name is appended to the module-level tmp_file_names list so the
    caller can delete all temporaries in one sweep.

    Bug fix: the original had no fallback branch, so calling this with
    neither dir nor suffix raised NameError because tmp_fd was never bound.
    """
    if dir and suffix:
        tmp_fd, tmp_name = tempfile.mkstemp( dir=dir, suffix=suffix )
    elif dir:
        tmp_fd, tmp_name = tempfile.mkstemp( dir=dir )
    elif suffix:
        tmp_fd, tmp_name = tempfile.mkstemp( suffix=suffix )
    else:
        tmp_fd, tmp_name = tempfile.mkstemp()
    # mkstemp returns an open descriptor; close it so only the name is kept.
    os.close( tmp_fd )
    tmp_file_names.append( tmp_name )
    return tmp_name
---|
138 | |
---|
def run_command( command ):
    """
    Run command through the shell; abort via stop_err() if the child wrote
    anything to its standard error stream.

    Bug fix: the original called proc.wait() before reading stderr, which
    can deadlock when the child fills the stderr pipe buffer (and it also
    called wait() twice).  communicate() drains the pipe while waiting.
    """
    proc = subprocess.Popen( args=command, shell=True, stderr=subprocess.PIPE )
    stderr = proc.communicate()[1]
    if stderr:
        stop_err( stderr )
---|
146 | |
---|
def split_paired_reads( input2, combined_linker_file_name ):
    """
    Given a fasta file of allegedly paired end reads ( input2 ), and a list of intervals
    showing where the linker is on each read ( combined_linker_file_name ), split the reads into left and right
    halves.

    The input intervals look like this.  Note that they may include multiple intervals for the same read
    ( which should overlap ), and we use the union of them as the linker interval.  Non-overlaps are
    reported to the user, and those reads are not processed.  Starts are origin zero.

        #name           strand start len size
        FG3OYDA05FTEES  +      219   42  283
        FG3OYDA05FVOLL  +      263   41  416

    The output gives each half-sequence on a separate line, like this.  This allows easy sorting of the
    sequences by length, after the fact.

        219 FG3OYDA05FTEES_L TTTAGTTACACTTAACTCACTTCCATCCTCTAAATACGTGATTACCTTTC...
        22  FG3OYDA05FTEES_R CCTTCCTTAAGTCCTAAAACTG

    Returns a 4-tuple of temporary file names:
    ( mates mapping file, all-mates file, short-mates fasta, long-mates fasta ).

    Bug fixes relative to the original:
    - 'bad_interval' errors were recorded with ``skip_line[ 'bad_interval' ] += 1``,
      which indexes the skip_line *function* and raises TypeError; they are now
      reported through skip_line() like every other error category.
    - the linker file was closed a second time after the fasta pass (dead
      code); the duplicate close has been removed.
    """
    # Bob says these should be hard-coded.
    seq_len_lower_threshold = 17
    short_mate_cutoff = 50
    # We need to pass the name of this file back to the caller.
    tmp_mates_file_name = get_tmp_file_name( suffix='mates.txt' )
    mates_file = open( tmp_mates_file_name, "w+b" )
    # Read the linker intervals
    combined_linker_file = open( combined_linker_file_name, "rb" )
    read_to_linker_dict = {}
    for i, line in enumerate( combined_linker_file ):
        line = line.strip()
        if line.startswith( "#" ):
            continue
        # Strip trailing comments.
        if line.find( '#' ) >= 0:
            line = line.split( "#", 1 )[0].rstrip()
        fields = line.split()
        if len( fields ) != 4:
            skip_line( 'num_fields', i+1, line )
            continue
        name, start, length, size = fields
        start = int( start )
        length = int( length )
        size = int( size )
        end = start + length
        if end > size:
            # The linker interval runs off the end of the read.
            skip_line( 'bad_interval', i+1, line )
            continue
        if name not in read_to_linker_dict:
            read_to_linker_dict[ name ] = ( start, end, size )
            continue
        if read_to_linker_dict[ name ] is None:
            # Read previously marked as non-overlapping intervals, so skip this sequence - see below
            continue
        ( s, e, sz ) = read_to_linker_dict[ name ]
        if sz != size:
            skip_line( 'inconsistent_sizes', i+1, name )
            continue
        if s > end or e < start:
            # Non-overlapping intervals, so skip this sequence
            read_to_linker_dict[ name ] = None
            continue
        # Use the union of the overlapping intervals as the linker interval.
        read_to_linker_dict[ name ] = ( min( s, start ), max( e, end ), size )
    combined_linker_file.close()
    # We need to pass the name of this file back to the caller.
    tmp_mates_mapping_file_name = get_tmp_file_name( suffix='mates.mapping' )
    mates_mapping_file = open( tmp_mates_mapping_file_name, 'w+b' )
    # Process the sequences
    seqs = 0
    fasta_reader = FastaReader( open( input2, 'rb' ) )
    while True:
        seq = fasta_reader.next()
        if not seq:
            break
        seqs += 1
        if seq.name not in read_to_linker_dict:
            # No linker interval for this read: keep the whole read (if it
            # is long enough) as an unpaired mate.
            if seq.length > seq_len_lower_threshold:
                mates_file.write( "%-3d %s %s\n" % ( seq.length, seq.name, seq.text ) )
            read_to_linker_dict[ seq.name ] = ""
            continue
        if read_to_linker_dict[ seq.name ] == "":
            skip_line( 'multiple_seqs', seqs, seq.name )
            continue
        if read_to_linker_dict[ seq.name ] is None:
            # Read previously marked as non-overlapping intervals, so skip this sequence - see above
            continue
        ( start, end, size ) = read_to_linker_dict[ seq.name ]
        if seq.length != size:
            skip_line( 'wrong_seq_len', seqs, seq.name )
            continue
        left = seq.text[ :start ]
        right = seq.text[ end: ]
        left_is_small = len( left ) <= seq_len_lower_threshold
        right_is_small = len( right ) <= seq_len_lower_threshold
        if left_is_small and right_is_small:
            continue
        if not left_is_small:
            mates_file.write( "%-3d %s %s\n" % ( len( left ), seq.name + "_L", left ) )
            mates_mapping_file.write( "%s %s %s %s\n" % ( seq.name + "_L", seq.name, 0, size - start ) )
        if not right_is_small:
            mates_file.write( "%-3d %s %s\n" % ( len( right ), seq.name + "_R", right ) )
            mates_mapping_file.write( "%s %s %s %s\n" % ( seq.name + "_R", seq.name, end, 0 ) )
        # Mark the read as processed so duplicate records are detected.
        read_to_linker_dict[ seq.name ] = ""
    mates_file.close()
    mates_mapping_file.close()
    # Create temporary files for short and long mates
    tmp_mates_short_file_name = get_tmp_file_name( suffix='mates.short' )
    tmp_mates_long_file_name = get_tmp_file_name( suffix='mates.long' )
    tmp_mates_short = open( tmp_mates_short_file_name, 'w+b' )
    tmp_mates_long = open( tmp_mates_long_file_name, 'w+b' )
    # Partition mates by length so short and long mates can be aligned with
    # different lastz settings later.
    for line in open( tmp_mates_file_name, 'rb' ):
        fields = line.split()
        seq_len = int( fields[0] )
        seq_name = fields[1]
        seq_text = fields[2]
        if seq_len <= short_mate_cutoff:
            tmp_mates_short.write( ">%s\n%s\n" % ( seq_name, seq_text ) )
        else:
            tmp_mates_long.write( ">%s\n%s\n" % ( seq_name, seq_text ) )
    tmp_mates_short.close()
    tmp_mates_long.close()
    return tmp_mates_mapping_file_name, tmp_mates_file_name, tmp_mates_short_file_name, tmp_mates_long_file_name
---|
277 | |
---|
def align_mates( input1, ref_source, ref_name, ref_sequences, tmp_mates_short_file_name, tmp_mates_long_file_name ):
    """
    Run lastz to align the short and long mate fasta files against the
    reference, returning the list of temporary output file names produced.

    input1: reference file -- a fasta dataset from the history when
        ref_source == 'history', otherwise a locally cached .2bit file.
    ref_source: 'history' selects the per-sequence fasta strategy; any
        other value selects the per-chromosome 2bit strategy.
    ref_name: text appended to the lastz target specifier in the command
        line; may be empty.  # NOTE(review): exact semantics depend on the
        caller -- confirm.
    ref_sequences: number of sequences in a history reference; not used by
        this function.
    tmp_mates_short_file_name, tmp_mates_long_file_name: fasta files of
        short and long mates (as produced by split_paired_reads()).

    Short and long mates are aligned with different lastz parameter sets;
    all output is written in softsam- format.  The caller is responsible
    for removing the returned temporary files.
    """
    tmp_align_file_names = []
    if ref_source == 'history':
        # Reference is a fasta dataset from the history
        # Create temporary files to contain the output from lastz executions
        tmp_short_file_name = get_tmp_file_name( suffix='short_out' )
        tmp_align_file_names.append( tmp_short_file_name )
        tmp_long_file_name = get_tmp_file_name( suffix='long_out' )
        tmp_align_file_names.append( tmp_long_file_name )
        seqs = 0
        fasta_reader = FastaReader( open( input1 ) )
        while True:
            # Read the next sequence from the reference dataset.  Note that if the reference contains
            # a small number of chromosomes this loop is ok, but in many cases the genome has a bunch
            # of small straggler scaffolds and contigs and it is a computational waste to do each one
            # of these in its own run.  There is an I/O down side to running by subsets (even if they are
            # one sequence per subset), compared to splitting the reference into sizes of 250 mb.  With
            # the subset action, lastz still has to read and parse the entire file for every run (this
            # is true for fasta, but for .2bit files it can access each sequence directly within the file,
            # so the overhead is minimal).
            """
            :> output_file (this creates the output file, empty)
            while there are more sequences to align
                find the next sequences that add up to 250M, put their names in farf.names
                lastz ${refFile}[subset=farf.names][multi][unmask] ${matesPath}/${matesFile} ...
                  >> output_file
            """
            seq = fasta_reader.next()
            if not seq:
                break
            seqs += 1
            # Create a temporary file to contain the current sequence as input to lastz.
            # We're doing this a bit differently here since we could be generating a huge
            # number of temporary files.
            tmp_in_fd, tmp_in_file_name = tempfile.mkstemp( suffix='seq_%d_in' % seqs )
            tmp_in_file = os.fdopen( tmp_in_fd, 'w+b' )
            tmp_in_file.write( '>%s\n%s\n' % ( seq.name, seq.text ) )
            tmp_in_file.close()
            # Align short mates
            command = 'lastz %s[unmask]%s %s ' % ( tmp_in_file_name, ref_name, tmp_mates_short_file_name )
            command += 'Z=1 --seed=1111111011111 --notrans --maxwordcount=90% --match=1,3 O=1 E=3 X=15 K=10 Y=12 L=18 --ambiguousn --noytrim --identity=95 --coverage=80 --continuity=95 --format=softsam- '
            command += '>> %s' % tmp_short_file_name
            run_command( command )
            # Align long mates
            command = 'lastz %s[unmask]%s %s ' % ( tmp_in_file_name, ref_name, tmp_mates_long_file_name )
            command += 'Z=15 W=13 --notrans --exact=18 --maxwordcount=90% --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --noytrim --identity=95 --coverage=90 --continuity=95 --format=softsam- '
            command += '>> %s' % tmp_long_file_name
            run_command( command )
            # Remove the temporary file that contains the current sequence
            os.remove( tmp_in_file_name )
    else:
        # Reference is a locally cached 2bit file, split lastz calls across number of chroms in 2bit file
        tbf = TwoBitFile( open( input1, 'rb' ) )
        for chrom in tbf.keys():
            # Align short mates
            tmp_short_file_name = get_tmp_file_name( suffix='short_vs_%s' % chrom )
            tmp_align_file_names.append( tmp_short_file_name )
            command = 'lastz %s/%s[unmask]%s %s ' % ( input1, chrom, ref_name, tmp_mates_short_file_name )
            command += 'Z=1 --seed=1111111011111 --notrans --maxwordcount=90% --match=1,3 O=1 E=3 X=15 K=10 Y=12 L=18 --ambiguousn --noytrim --identity=95 --coverage=80 --continuity=95 --format=softsam- '
            command += '> %s' % tmp_short_file_name
            run_command( command )
            # Align long mates
            tmp_long_file_name = get_tmp_file_name( suffix='long_vs_%s' % chrom )
            tmp_align_file_names.append( tmp_long_file_name )
            command = 'lastz %s/%s[unmask]%s %s ' % ( input1, chrom, ref_name, tmp_mates_long_file_name )
            command += 'Z=15 W=13 --notrans --exact=18 --maxwordcount=90% --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --noytrim --identity=95 --coverage=90 --continuity=95 --format=softsam- '
            command += '> %s' % tmp_long_file_name
            run_command( command )
    return tmp_align_file_names
---|
347 | |
---|
348 | def paired_mate_unmapper( input2, input4, tmp_mates_mapping_file_name, tmp_align_file_name_list, output ): |
---|
349 | """ |
---|
350 | Given a SAM file corresponding to alignments of *subsegments* of paired 'reads' to a reference sequence, |
---|
351 | convert the positions on the subsegments to positions on the reads. Also (optionally) add quality values. |
---|
352 | |
---|
353 | The input file is in SAM format, as shown below. Each line represents the alignment of a part of a read |
---|
354 | to a reference sequence. Read pairs are indicated by suffixes in their names. Normally, the suffixes _L |
---|
355 | and _R indicate the left and right mates of reads (this can be overridden with the --left and --right |
---|
356 | options). Reads that were not mates have no suffix. |
---|
357 | |
---|
358 | (SAM header lines omitted) |
---|
359 | F2YP0BU02G7LK5_R 16 chr21 15557360 255 40M * 0 0 ATTTTATTCTCTTTGAAGCAATTGTGAATGGGAGTTTACT * |
---|
360 | F2YP0BU02HXV58_L 16 chr21 15952091 255 40M6S * 0 0 GCAAATTGTGCTGCTTTAAACATGCGTGTGCAAGTATCTTtttcat * |
---|
361 | F2YP0BU02HREML_R 0 chr21 16386077 255 33M5S * 0 0 CCAAAGTTCTGGGATTACAGGCGTGAGCCATCGcgccc * |
---|
362 | F2YP0BU02IOF1F_L 0 chr21 17567321 255 7S28M * 0 0 taaagagAAGAATTCTCAACCCAGAATTTCATATC * |
---|
363 | F2YP0BU02IKX84_R 16 chr21 18491628 255 22M1D18M9S * 0 0 GTCTCTACCAAAAAATACAAAAATTAGCCGGGCGTGGTGGcatgtctgt * |
---|
364 | F2YP0BU02GW5VA_L 16 chr21 20255344 255 6S32M * 0 0 caagaaCAAACACATTCAAAAGCTAGTAGAAGGCAAGA * |
---|
365 | F2YP0BU02JIMJ4_R 0 chr21 22383051 255 19M * 0 0 CCCTTTATCATTTTTTATT * |
---|
366 | F2YP0BU02IXZGF_L 16 chr21 23094798 255 13M1I18M * 0 0 GCAAGCTCCACTTCCCGGGTTCACGCCATTCT * |
---|
367 | F2YP0BU02IODR5_L 0 chr21 30935325 255 37M * 0 0 GAAATAAAGGGTATTCAATTAGGAAAAGAGGAAGTCA * |
---|
368 | F2YP0BU02IMZBL_L 16 chr21 31603486 255 28M1D1M * 0 0 ATACAAAAATTAGCCGGGCACAGTGGCAG * |
---|
369 | F2YP0BU02JA9PR_L 16 chr21 31677159 255 23M * 0 0 CACACCTGTAACCCCAGCACTTT * |
---|
370 | F2YP0BU02HKC61_R 0 chr21 31678718 255 40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT * |
---|
371 | F2YP0BU02HKC61_R 0 chr21 31678718 255 40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT * |
---|
372 | F2YP0BU02HVA88 16 chr21 31703558 255 1M1D35M8S * 0 0 TGGGATTACAGGCGTGAGCTACCACACCCAGCCAGAgttcaaat * |
---|
373 | F2YP0BU02JDCF1_L 0 chr21 31816600 255 38M * 0 0 AGGAGAATCGCTTGAACCCAGGAGGCAGAGGTTGCGGT * |
---|
374 | F2YP0BU02GZ1GO_R 0 chr21 33360122 255 6S38M * 0 0 cctagaCTTCACACACACACACACACACACACACACACACACAC * |
---|
375 | F2YP0BU02FX387_L 16 chr22 14786201 255 26M * 0 0 TGGATGAAGCTGGAAACCATCATTCT * |
---|
376 | F2YP0BU02IF2NE_R 0 chr22 16960842 255 40M10S * 0 0 TGGCATGCACCTGTAGTCTCAGCTACTTGGGAGGCTGAGGtgggaggatc * |
---|
377 | F2YP0BU02F4TVA 0 chr22 19200522 255 49M * 0 0 CCTGGGAGGCGGAGGTTGCAGTGAGCCGAGATCACGCCATTGCACTCCA * |
---|
378 | F2YP0BU02HKC61_R 16 chr22 29516998 255 8S32M * 0 0 agacagagTCTTGCTTTGTCACCCAGGCTGGAGTGCAGTG * |
---|
379 | F2YP0BU02FS4EM_R 0 chr22 30159364 255 29M * 0 0 CTCCTGCCTCAGCCTCCCGAGTAGTTGGG * |
---|
380 | F2YP0BU02G197P_L 0 chr22 32044496 255 40M10S * 0 0 TTGTTGGACATTTGGGTTGGTTCCAAGTCTTTGCTATTGTgaataatgcc * |
---|
381 | F2YP0BU02FIING 16 chr22 45959944 255 3M1I11M1I26M * 0 0 AGCTATGGTACTGGCTATGAAAGCAGACACATAGACCAATGG * |
---|
382 | F2YP0BU02GUB9L_L 16 chr22 49198404 255 16M1I20M * 0 0 CACCACGCTCGGCTAATTTTTGTATTTTTAGTAGAGA * |
---|
383 | |
---|
384 | The user must provide a mapping file (which might better be called an unmapping file). This file is usually |
---|
385 | created by split_paired_reads, and tells us how to map the subsegments back to original coordinates in a single |
---|
386 | read (this means the left and right mates were part of a single read). The mapping file contains four columns. |
---|
387 | The first two give the mates's name (including the suffix) and the read name. The last two columns describe how |
---|
388 | much of the full original sequence is missing from the mate. For example, in the read below, the left mate is |
---|
389 | missing 63 on the right (42 for the linker and 21 for the right half). The right mate is missing 339 on the left. |
---|
390 | |
---|
391 | left half: TTTCAACATATGCAAATCAATAAATGTAATCCAGCATATAAACAGAACCA |
---|
392 | AAGACAAAAACCACATGATTATCTCAATAGATGCAGAAAAGGCCTTCGGC |
---|
393 | AAAATTCAACAAAACTCCATGCTAAAACTCTCAATAAGGTATTGATGGGA |
---|
394 | CATGCCGCATAATAATAAGACATATCTATGACAAACCCACAGCCAATATC |
---|
395 | ATGCTGAATGCACAAAAATTGGAAGCATTCCCTTTGAAAACTGGCACAAG |
---|
396 | ACTGGGATGCCCTCTCTCACAACTCCTATTCAACATAGTGTTGGAAG |
---|
397 | linker: CGTAATAACTTCGTATAGCATACATTATACGAAGTCATACGA |
---|
398 | right half: CTCCTGCCTCAGCCTCCCGAG |
---|
399 | |
---|
400 | mate_name read_name offset_to_start offset_from_end |
---|
401 | F2YP0BU02FS4EM_L F2YP0BU02FS4EM 0 71 |
---|
402 | F2YP0BU02FS4EM_R F2YP0BU02FS4EM 339 0 |
---|
403 | |
---|
404 | The user can also specify a quality scores file, which should look something like this. Quality values are presumed |
---|
405 | to be PHRED scores, written in space-delimited decimal. |
---|
406 | |
---|
407 | >F2YP0BU02FS4EM |
---|
408 | 38 38 38 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 38 21 21 21 40 |
---|
409 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 33 |
---|
410 | 32 32 40 40 40 21 21 18 18 21 34 34 31 40 40 40 40 40 40 40 40 40 40 40 40 |
---|
411 | 40 40 40 40 40 40 40 40 40 40 40 32 32 32 32 40 40 40 40 40 40 40 34 34 35 |
---|
412 | 31 31 28 28 33 33 33 36 36 36 17 17 17 19 26 36 36 36 40 40 40 40 40 33 34 |
---|
413 | 34 34 39 39 39 40 40 40 40 40 33 33 34 34 40 40 40 40 40 40 40 39 39 39 40 |
---|
414 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 |
---|
415 | 40 40 40 40 40 40 40 39 39 39 39 39 39 40 40 40 39 39 39 40 40 40 40 40 40 |
---|
416 | 40 40 40 40 40 40 40 40 40 40 40 40 40 26 26 26 26 26 40 40 38 38 37 35 33 |
---|
417 | 36 40 19 17 17 17 17 19 19 23 30 20 20 20 23 35 40 36 36 36 36 36 36 36 36 |
---|
418 | 39 40 34 20 27 27 35 39 40 37 40 40 40 40 40 40 40 40 40 40 34 34 35 39 40 |
---|
419 | 40 40 40 40 40 40 39 39 39 40 40 40 40 36 36 32 32 28 28 29 30 36 40 30 26 |
---|
420 | 26 26 34 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 |
---|
421 | 40 39 35 34 34 40 40 40 40 30 30 30 35 40 40 40 40 40 39 39 36 40 40 40 40 |
---|
422 | 39 39 39 39 30 30 28 35 35 39 40 40 40 40 40 35 35 35 |
---|
423 | >F2YP0BU02G197P |
---|
424 | 40 40 40 40 40 40 40 40 40 40 39 39 39 39 39 39 40 40 40 40 40 40 40 40 40 |
---|
425 | 40 40 40 40 26 26 26 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 |
---|
426 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 |
---|
427 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 |
---|
428 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 34 34 34 40 40 |
---|
429 | 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 |
---|
430 | 40 40 40 40 40 40 40 34 34 34 34 40 40 40 40 34 34 34 34 40 40 40 40 40 40 |
---|
431 | 40 40 40 40 40 39 39 39 34 34 34 34 40 40 40 40 39 39 25 25 26 39 40 40 40 |
---|
432 | 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 |
---|
433 | 33 33 33 33 40 35 21 21 21 30 38 40 40 40 40 40 40 40 40 35 35 30 30 30 40 |
---|
434 | 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 |
---|
435 | 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 |
---|
436 | 40 40 40 39 39 39 40 40 |
---|
437 | >F2YP0BU02FIING |
---|
438 | 32 32 32 25 25 25 25 24 25 30 31 30 27 27 27 28 28 21 19 19 13 13 13 14 19 |
---|
439 | 19 17 19 16 16 25 28 22 21 17 17 18 25 24 25 25 25 |
---|
440 | |
---|
441 | The output file is also SAM: |
---|
442 | |
---|
443 | (SAM header lines omitted) |
---|
444 | F2YP0BU02G7LK5 81 chr21 15557360 255 40M303H * 0 0 ATTTTATTCTCTTTGAAGCAATTGTGAATGGGAGTTTACT D>>>>IIIIIIHHG???IIIIIIIIIHHHFFEIH999HII |
---|
445 | F2YP0BU02HXV58 145 chr21 15952091 255 226H40M6S * 0 0 GCAAATTGTGCTGCTTTAAACATGCGTGTGCAAGTATCTTtttcat AA===DDDDAAAAD???:::ABBBBBAAA:888ECF;F>>>?8??@ |
---|
446 | F2YP0BU02HREML 65 chr21 16386077 255 320H33M5S * 0 0 CCAAAGTTCTGGGATTACAGGCGTGAGCCATCGcgccc HH???HHIIIHFHIIIIIIICDDHHIIIIIIHHHHHHH |
---|
447 | F2YP0BU02IOF1F 129 chr21 17567321 255 7S28M409H * 0 0 taaagagAAGAATTCTCAACCCAGAATTTCATATC 4100<<A>4113:<EFGGGFFFHHHHHHDFFFFED |
---|
448 | F2YP0BU02IKX84 81 chr21 18491628 255 22M1D18M9S341H * 0 0 GTCTCTACCAAAAAATACAAAAATTAGCCGGGCGTGGTGGcatgtctgt ;;;=7@.55------?2?11112GGB=CCCCDIIIIIIIIIHHHHHHII |
---|
449 | F2YP0BU02GW5VA 145 chr21 20255344 255 286H6S32M * 0 0 caagaaCAAACACATTCAAAAGCTAGTAGAAGGCAAGA IIIIIIIHHHIIIIIIICCCCIIIIIIIIIIIIIIIII |
---|
450 | F2YP0BU02JIMJ4 65 chr21 22383051 255 208H19M * 0 0 CCCTTTATCATTTTTTATT 555544E?GE113344I22 |
---|
451 | F2YP0BU02IXZGF 145 chr21 23094798 255 291H13M1I18M * 0 0 GCAAGCTCCACTTCCCGGGTTCACGCCATTCT IIIIIIIIIIIGG;;;GGHIIIIIGGGIIIII |
---|
452 | F2YP0BU02IODR5 129 chr21 30935325 255 37M154H * 0 0 GAAATAAAGGGTATTCAATTAGGAAAAGAGGAAGTCA 6...7/--..,30;9<<>@BFFFAAAAHIIIIIH@@@ |
---|
453 | F2YP0BU02IMZBL 145 chr21 31603486 255 342H28M1D1M * 0 0 ATACAAAAATTAGCCGGGCACAGTGGCAG BB1552222<<>9==8;;?AA=??A???A |
---|
454 | F2YP0BU02JA9PR 145 chr21 31677159 255 229H23M * 0 0 CACACCTGTAACCCCAGCACTTT IIIIIIIIIIICCCCIIIIIHHH |
---|
455 | F2YP0BU02HKC61 65 chr21 31678718 255 300H40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT AA@BD:::==AAA@A?8888:<90004<>>?><<<<4442 |
---|
456 | F2YP0BU02HKC61 65 chr21 31678718 255 300H40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT AA@BD:::==AAA@A?8888:<90004<>>?><<<<4442 |
---|
457 | F2YP0BU02HVA88 16 chr21 31703558 255 1M1D35M8S * 0 0 TGGGATTACAGGCGTGAGCTACCACACCCAGCCAGAgttcaaat >8888DFFHHGFHHHH@@?@?DDC96666HIIIFFFFFFFFFFF |
---|
458 | F2YP0BU02JDCF1 129 chr21 31816600 255 38M103H * 0 0 AGGAGAATCGCTTGAACCCAGGAGGCAGAGGTTGCGGT IIIIIIIIIIIHHHIIHHHIIIIIIIIIIIIIIIIIII |
---|
459 | F2YP0BU02GZ1GO 65 chr21 33360122 255 76H6S38M * 0 0 cctagaCTTCACACACACACACACACACACACACACACACACAC BBBBD?:688CFFFFFFFFFFFFFFFFFFFFFFFFFFDDBBB51 |
---|
460 | F2YP0BU02FX387 145 chr22 14786201 255 201H26M * 0 0 TGGATGAAGCTGGAAACCATCATTCT IIHHHHHHHHHHHHHFFFFFFFFFFF |
---|
461 | F2YP0BU02IF2NE 65 chr22 16960842 255 209H40M10S * 0 0 TGGCATGCACCTGTAGTCTCAGCTACTTGGGAGGCTGAGGtgggaggatc BAAADDDDFDDDDDDBBA889<A?4444000@<>AA?9444;;8>77<7- |
---|
462 | F2YP0BU02F4TVA 0 chr22 19200522 255 49M * 0 0 CCTGGGAGGCGGAGGTTGCAGTGAGCCGAGATCACGCCATTGCACTCCA FFF???FFFFFIIIIIIIIIIIIIIIIIIIIIIIHHIIFHFFFGDDB=5 |
---|
463 | F2YP0BU02HKC61 81 chr22 29516998 255 8S32M300H * 0 0 agacagagTCTTGCTTTGTCACCCAGGCTGGAGTGCAGTG 2444<<<<>?>><40009<:8888?A@AAA==:::DB@AA |
---|
464 | F2YP0BU02FS4EM 65 chr22 30159364 255 339H29M * 0 0 CTCCTGCCTCAGCCTCCCGAGTAGTTGGG IIIIHHEIIIIHHHH??=DDHIIIIIDDD |
---|
465 | F2YP0BU02G197P 129 chr22 32044496 255 40M10S258H * 0 0 TTGTTGGACATTTGGGTTGGTTCCAAGTCTTTGCTATTGTgaataatgcc IIIIIIIIIIHHHHHHIIIIIIIIIIIII;;;IIIIIIIIIIIIIIIIII |
---|
466 | F2YP0BU02FIING 16 chr22 45959944 255 3M1I11M1I26M * 0 0 AGCTATGGTACTGGCTATGAAAGCAGACACATAGACCAATGG :::9:32267=:114244/...446==<<<?@?:9::::AAA |
---|
467 | F2YP0BU02GUB9L 145 chr22 49198404 255 176H16M1I20M * 0 0 CACCACGCTCGGCTAATTTTTGTATTTTTAGTAGAGA IIIIIIIIIHAAC;<</////@4F5778;IIIIIIII |
---|
468 | |
---|
469 | """ |
---|
470 | left_suffix = "_L" |
---|
471 | right_suffix = "_R" |
---|
472 | # Read the mapping |
---|
473 | mate_to_read_dict = {} |
---|
474 | i = 0 |
---|
475 | for i, line in enumerate( file( tmp_mates_mapping_file_name, 'rb' ) ): |
---|
476 | line = line.strip() |
---|
477 | if not line.startswith( "#" ): |
---|
478 | fields = line.split() |
---|
479 | if len( fields ) != 4: |
---|
480 | skip_line( "num_fields", i+1, line ) |
---|
481 | continue |
---|
482 | mate_name, read_name, s_offset, e_offset = fields |
---|
483 | if mate_name in mate_to_read_dict: |
---|
484 | skip_line( 'two_mate_names', i+1, mate_name ) |
---|
485 | continue |
---|
486 | mate_to_read_dict[ mate_name ] = ( read_name, int( s_offset ), int( e_offset ) ) |
---|
487 | # Read sequence data |
---|
488 | read_to_nucs_dict = {} |
---|
489 | seqs = 0 |
---|
490 | fasta_reader = FastaReader( file( input2, 'rb' ) ) |
---|
491 | while True: |
---|
492 | seq = fasta_reader.next() |
---|
493 | if not seq: |
---|
494 | break |
---|
495 | seqs += 1 |
---|
496 | seq_text_upper = seq.text.upper() |
---|
497 | if seq.name in read_to_nucs_dict: |
---|
498 | if seq_text_upper != read_to_nucs_dict[ seq.name ]: |
---|
499 | skip_line( 'inconsistent_reads', seqs, seq.name ) |
---|
500 | continue |
---|
501 | read_to_nucs_dict[ seq.name ] = seq_text_upper |
---|
502 | # Read quality data |
---|
503 | def quality_sequences( f ): |
---|
504 | seq_name = None |
---|
505 | seq_quals = None |
---|
506 | line_number = 0 |
---|
507 | for line in f: |
---|
508 | line_number += 1 |
---|
509 | line = line.strip() |
---|
510 | if line.startswith( ">" ): |
---|
511 | if seq_name != None: |
---|
512 | yield ( seq_name, seq_quals, seq_line ) |
---|
513 | seq_name = sequence_name( line ) |
---|
514 | seq_line = line_number |
---|
515 | seq_quals = [] |
---|
516 | elif seq_name is None: |
---|
517 | skip_line( 'no_header', line_number, line ) |
---|
518 | continue |
---|
519 | else: |
---|
520 | seq_quals += [ int( q ) for q in line.split() ] |
---|
521 | if seq_name is not None: |
---|
522 | yield ( seq_name, seq_quals, seq_line ) |
---|
523 | def sequence_name( s ): |
---|
524 | s = s[ 1: ].strip() |
---|
525 | if not s: |
---|
526 | return "" |
---|
527 | else: |
---|
528 | return s.split()[ 0 ] |
---|
529 | read_to_quals_dict = {} |
---|
530 | # TODO: should we use Dan's fastaNamedReader here? |
---|
531 | for seq_name, quals, line_number in quality_sequences( file( input4 ) ): |
---|
532 | quals = samify_phred_scores( quals ) |
---|
533 | if seq_name in read_to_quals_dict: |
---|
534 | if quals != read_to_quals_dict[ seq_name ]: |
---|
535 | skip_line( 'inconsistent_reads', line_number, seq_name ) |
---|
536 | continue |
---|
537 | if len( quals ) != len( read_to_nucs_dict[ seq_name ] ): |
---|
538 | skip_line( 'inconsistent_read_lengths', line_number, seq_name ) |
---|
539 | continue |
---|
540 | read_to_quals_dict[ seq_name ] = quals |
---|
541 | # process the SAM file |
---|
542 | tmp_align_file_names = ' '.join( tmp_align_file_name_list ) |
---|
543 | combined_chrom_file_name = get_tmp_file_name( suffix='combined_chrom' ) |
---|
544 | command = 'cat %s | grep -v "^@" | sort -k 1 > %s' % ( tmp_align_file_names, combined_chrom_file_name ) |
---|
545 | run_command( command ) |
---|
546 | fout = file( output, 'w+b' ) |
---|
547 | has_non_header = False |
---|
548 | i = 0 |
---|
549 | for i, line in enumerate( file( combined_chrom_file_name, 'rb' ) ): |
---|
550 | line = line.strip() |
---|
551 | if line.startswith( "@" ): |
---|
552 | if has_non_header: |
---|
553 | skip_line( 'sam_headers', i+1, line ) |
---|
554 | continue |
---|
555 | fout.write( "%s\n" % line ) |
---|
556 | continue |
---|
557 | has_non_header = True |
---|
558 | fields = line.split() |
---|
559 | num_fields = len( fields ) |
---|
560 | if num_fields < SAM_MIN_COLUMNS: |
---|
561 | skip_line( 'sam_min_columns', i+1, line ) |
---|
562 | continue |
---|
563 | # Set flags for mates |
---|
564 | try: |
---|
565 | flag = int( fields[ SAM_FLAG_COLUMN ] ) |
---|
566 | except ValueError: |
---|
567 | skip_line( 'sam_flag', i+1, line ) |
---|
568 | continue |
---|
569 | if not( flag & ( BAM_FPAIRED + BAM_FREAD1 + BAM_FREAD2 ) == 0 ): |
---|
570 | skip_line( 'reads_paired', i+1, line ) |
---|
571 | continue |
---|
572 | mate_name = fields[ SAM_QNAME_COLUMN ] |
---|
573 | unmap_it = False |
---|
574 | half = None |
---|
575 | if mate_name.endswith( left_suffix ): |
---|
576 | flag += BAM_FPAIRED + BAM_FREAD2 |
---|
577 | fields[ SAM_FLAG_COLUMN ] = "%d" % flag |
---|
578 | unmap_it = True |
---|
579 | half = "L" |
---|
580 | elif mate_name.endswith( right_suffix ): |
---|
581 | flag += BAM_FPAIRED + BAM_FREAD1 |
---|
582 | fields[ SAM_FLAG_COLUMN ] = "%d" % flag |
---|
583 | unmap_it = True |
---|
584 | half = "R" |
---|
585 | on_plus_strand = ( flag & BAM_FREVERSE == 0 ) |
---|
586 | # Convert position from mate to read by adding clipping to cigar |
---|
587 | if not unmap_it: |
---|
588 | read_name = mate_name |
---|
589 | else: |
---|
590 | try: |
---|
591 | read_name, s_offset, e_offset = mate_to_read_dict[ mate_name ] |
---|
592 | except KeyError: |
---|
593 | skip_line( 'missing_mate', i+1, mate_name ) |
---|
594 | continue |
---|
595 | cigar = fields[ SAM_CIGAR_COLUMN ] |
---|
596 | cigar_prefix = None |
---|
597 | cigar_suffix = None |
---|
598 | if half == "L": |
---|
599 | if on_plus_strand: |
---|
600 | if s_offset > 0: |
---|
601 | cigar_prefix = ( s_offset, "S" ) |
---|
602 | if e_offset > 0: |
---|
603 | cigar_suffix = ( e_offset, "H" ) |
---|
604 | else: |
---|
605 | if e_offset > 0: |
---|
606 | cigar_prefix = ( e_offset, "H" ) |
---|
607 | if s_offset > 0: |
---|
608 | cigar_suffix = ( s_offset, "S" ) |
---|
609 | elif half == "R": |
---|
610 | if on_plus_strand: |
---|
611 | if s_offset > 0: |
---|
612 | cigar_prefix = ( s_offset, "H" ) |
---|
613 | if e_offset > 0: |
---|
614 | cigar_suffix = ( e_offset, "S" ) |
---|
615 | else: |
---|
616 | if e_offset > 0: |
---|
617 | cigar_prefix = ( e_offset, "S" ) |
---|
618 | if s_offset > 0: |
---|
619 | cigar_suffix = ( s_offset, "H" ) |
---|
620 | else: |
---|
621 | if on_plus_strand: |
---|
622 | if s_offset > 0: |
---|
623 | cigar_prefix = ( s_offset, "S" ) |
---|
624 | if e_offset > 0: |
---|
625 | cigar_suffix = ( e_offset, "S" ) |
---|
626 | else: |
---|
627 | if e_offset > 0: |
---|
628 | cigar_prefix = ( e_offset, "S" ) |
---|
629 | if s_offset > 0: |
---|
630 | cigar_suffix = ( s_offset, "S" ) |
---|
631 | if cigar_prefix != None: |
---|
632 | count, op = cigar_prefix |
---|
633 | cigar = prefix_cigar( "%d%s" % ( count, op ), cigar ) |
---|
634 | if op == "S": |
---|
635 | refPos = int( fields[ SAM_POS_COLUMN ] ) - count |
---|
636 | fields[ SAM_POS_COLUMN ] = "%d" % refPos |
---|
637 | if cigar_suffix != None: |
---|
638 | count, op = cigar_suffix |
---|
639 | cigar = suffix_cigar( cigar,"%d%s" % ( count, op) ) |
---|
640 | fields[ SAM_QNAME_COLUMN ] = read_name |
---|
641 | fields[ SAM_CIGAR_COLUMN ] = cigar |
---|
642 | # Fetch sequence and quality values, and flip/clip them |
---|
643 | if read_name not in read_to_nucs_dict: |
---|
644 | skip_line( 'missing_seq', i+1, read_name ) |
---|
645 | continue |
---|
646 | nucs = read_to_nucs_dict[ read_name ] |
---|
647 | if not on_plus_strand: |
---|
648 | nucs = reverse_complement( nucs ) |
---|
649 | quals = None |
---|
650 | if read_to_quals_dict != None: |
---|
651 | if read_name not in read_to_quals_dict: |
---|
652 | skip_line( 'missing_quals', i+1, read_name ) |
---|
653 | continue |
---|
654 | quals = read_to_quals_dict[ read_name ] |
---|
655 | if not on_plus_strand: |
---|
656 | quals = reverse_string( quals ) |
---|
657 | cigar = split_cigar( fields[ SAM_CIGAR_COLUMN ] ) |
---|
658 | nucs, quals = clip_for_cigar( cigar, nucs, quals ) |
---|
659 | fields[ SAM_SEQ_COLUMN ] = nucs |
---|
660 | if quals != None: |
---|
661 | fields[ SAM_QUAL_COLUMN ] = quals |
---|
662 | # Output the line |
---|
663 | fout.write( "%s\n" % "\t".join( fields ) ) |
---|
664 | fout.close() |
---|
665 | |
---|
def prefix_cigar( prefix, cigar ):
    """Prepend a single cigar operation (e.g. "5S") to a cigar string,
    merging the counts when the cigar already begins with the same op."""
    digits = 0
    while cigar[ digits ].isdigit():
        digits += 1
    if prefix[ -1 ] != cigar[ digits ]:
        # Different leading operation; just concatenate.
        return prefix + cigar
    # Same operation; collapse the two runs into one.
    merged = int( prefix[ :-1 ] ) + int( cigar[ :digits ] )
    return "%d%s%s" % ( merged, prefix[ -1 ], cigar[ digits+1: ] )
---|
674 | |
---|
def suffix_cigar( cigar, suffix ):
    """Append a single cigar operation (e.g. "5S") to a cigar string,
    merging the counts when the cigar already ends with the same op."""
    if cigar[ -1 ] != suffix[ -1 ]:
        # Different trailing operation; just concatenate.
        return cigar + suffix
    # Walk left past the digits of the final run.
    pos = len( cigar ) - 2
    while cigar[ pos ].isdigit():
        pos -= 1
    pos += 1
    # Same operation; collapse the two runs into one.
    merged = int( cigar[ pos:-1 ] ) + int( suffix[ :-1 ] )
    return "%s%d%s" % ( cigar[ :pos ], merged, suffix[ -1 ] )
---|
684 | |
---|
def split_cigar( text ):
    """Parse a cigar string into a list of ( count, op ) tuples.

    Recognized operations are M, I, D, H and S.  Raises ValueError when an
    operation has no preceding count, when the string ends mid-count, or
    when a count contains a non-digit character.
    """
    ops = []
    digits = ""
    for ch in text:
        if ch in "MIDHS":
            if not digits:
                # An operation letter with no count in front of it.
                raise ValueError
            ops.append( ( int( digits ), ch ) )
            digits = ""
        else:
            digits += ch
    if digits:
        # Trailing count with no operation letter.
        raise ValueError
    return ops
---|
699 | |
---|
def clip_for_cigar( cigar, nucs, quals ):
    """Apply the clipping operations at either end of a parsed cigar to a
    read sequence.

    Hard clips ("H") remove the clipped bases (and the matching quality
    characters, when quals is not None); soft clips ("S") keep the bases
    but mark them in lowercase.  A hard clip at an end may be immediately
    followed by a soft clip at the same end.
    """
    # Leading clips.
    n, kind = cigar[ 0 ]
    if kind == "H":
        # Drop hard-clipped bases/quals, then check the next op for a soft clip.
        nucs = nucs[ n: ]
        if quals != None:
            quals = quals[ n: ]
        n, kind = cigar[ 1 ]
    if kind == "S":
        # Lowercase the soft-clipped prefix.
        nucs = nucs[ :n ].lower() + nucs[ n: ]
    # Trailing clips (mirror image of the above).
    n, kind = cigar[ -1 ]
    if kind == "H":
        nucs = nucs[ :-n ]
        if quals != None:
            quals = quals[ :-n ]
        n, kind = cigar[ -2 ]
    if kind == "S":
        # Lowercase the soft-clipped suffix.
        nucs = nucs[ :-n ] + nucs[ -n: ].lower()
    return nucs, quals
---|
722 | |
---|
def samify_phred_scores( quals ):
    """
    Convert a decimal list of phred base-quality scores to a sam quality string.

    SAM encodes each score q as chr( 33 + q ), so representable values run
    from chr(33) ('!') to chr(126) ('~').  Scores outside that dynamic range
    are clipped to the nearest representable value.  An empty score list
    yields an empty string.
    """
    if not quals:
        # Guard: min()/max() below raise ValueError on an empty sequence.
        return ""
    if min( quals ) >= 0 and max( quals ) <= 126 - 33:
        # Fast path: every score is in range, no clipping needed.
        return "".join( [ chr( 33 + q ) for q in quals ] )
    else:
        return "".join( [ chr( max( 33, min( 126, 33 + q ) ) ) for q in quals ] )
---|
734 | |
---|
def reverse_complement( nucs ):
    """Return the reverse complement of a nucleotide string.  Case is
    preserved, and characters outside ACGT/acgt (e.g. N) pass through
    unchanged."""
    pairs = { "A": "T", "C": "G", "G": "C", "T": "A",
              "a": "t", "c": "g", "g": "c", "t": "a" }
    return "".join( [ pairs.get( ch, ch ) for ch in nucs[ ::-1 ] ] )
---|
738 | |
---|
def reverse_string( s ):
    """Return s with its characters in reverse order."""
    return "".join( reversed( s ) )
---|
741 | |
---|
742 | def __main__(): |
---|
743 | # Parse command line |
---|
744 | # input1: a reference genome ( 2bit or fasta ) |
---|
745 | # input2: a collection of 454 paired end reads ( a fasta file ) |
---|
746 | # input3: a linker sequence ( a very small fasta file ) |
---|
747 | # input4: a base quality score 454 file ( qual454 ) |
---|
748 | parser = optparse.OptionParser() |
---|
749 | parser.add_option( '', '--ref_name', dest='ref_name', help='The reference name to change all output matches to' ) |
---|
750 | parser.add_option( '', '--ref_source', dest='ref_source', help='The reference is cached or from the history' ) |
---|
751 | parser.add_option( '', '--ref_sequences', dest='ref_sequences', help='Number of sequences in the reference dataset' ) |
---|
752 | parser.add_option( '', '--source_select', dest='source_select', help='Use pre-set or cached reference file' ) |
---|
753 | parser.add_option( '', '--input1', dest='input1', help='The name of the reference file if using history or reference base name if using cached' ) |
---|
754 | parser.add_option( '', '--input2', dest='input2', help='The 454 reads file to align' ) |
---|
755 | parser.add_option( '', '--input3', dest='input3', help='The sequencing linker file' ) |
---|
756 | parser.add_option( '', '--input4', dest='input4', help='The base quality score 454 file' ) |
---|
757 | parser.add_option( '', '--output', dest='output', help='The output file' ) |
---|
758 | parser.add_option( '', '--lastz_seqs_file_dir', dest='lastz_seqs_file_dir', help='Directory of local lastz_seqs.loc file' ) |
---|
759 | |
---|
760 | ( options, args ) = parser.parse_args() |
---|
761 | if options.ref_name != 'None': |
---|
762 | ref_name = '[nickname=%s]' % options.ref_name |
---|
763 | else: |
---|
764 | ref_name = '' |
---|
765 | if options.ref_source == 'history': |
---|
766 | # Reference is a fasta dataset from the history |
---|
767 | try: |
---|
768 | # Ensure there is at least 1 sequence in the dataset ( this may not be necessary ). |
---|
769 | error_msg = "The reference dataset is missing metadata, click the pencil icon in the history item and 'auto-detect' the metadata attributes." |
---|
770 | ref_sequences = int( options.ref_sequences ) |
---|
771 | if ref_sequences < 1: |
---|
772 | stop_err( error_msg ) |
---|
773 | except: |
---|
774 | stop_err( error_msg ) |
---|
775 | else: |
---|
776 | ref_sequences = 0 |
---|
777 | tmp_w12_name = get_tmp_file_name( suffix='vs_linker.W12' ) |
---|
778 | tmp_T1_name = get_tmp_file_name( suffix='vs_linker.T1' ) |
---|
779 | # Run lastz twice ( with different options ) on the linker sequence and paired end reads, |
---|
780 | # looking for the linker ( each run finds some the other doesn't ) |
---|
781 | command = 'lastz %s %s W=12 --notrans --exact=18 --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --coverage=85 --format=general-:name2,zstart2+,length2,size2 > %s' % \ |
---|
782 | ( options.input3, options.input2, tmp_w12_name ) |
---|
783 | run_command( command ) |
---|
784 | command = 'lastz %s %s T=1 --match=1,2 O=1 E=2 X=15 K=10 Y=15 L=18 --ambiguousn --coverage=85 --format=general-:name2,zstart2+,length2,size2 > %s' % \ |
---|
785 | ( options.input3, options.input2, tmp_T1_name ) |
---|
786 | run_command( command ) |
---|
787 | # Combine the alignment output from the two lastz runs |
---|
788 | tmp_combined_linker_file_name = get_tmp_file_name( suffix='vs_linker' ) |
---|
789 | command = 'cat %s %s | sort -u > %s' % ( tmp_w12_name, tmp_T1_name, tmp_combined_linker_file_name ) |
---|
790 | run_command( command ) |
---|
791 | # Use the alignment info to split reads into left and right mates |
---|
792 | tmp_mates_mapping_file_name, tmp_mates_file_name, tmp_mates_short_file_name, tmp_mates_long_file_name = split_paired_reads( options.input2, tmp_combined_linker_file_name ) |
---|
793 | # Align mates to the reference - tmp_align_file_names is a list of file names created by align_mates() |
---|
794 | tmp_align_file_name_list = align_mates( options.input1, options.ref_source, ref_name, ref_sequences, tmp_mates_short_file_name, tmp_mates_long_file_name ) |
---|
795 | # Combine and convert mate coordinates back to read coordinates |
---|
796 | paired_mate_unmapper( options.input2, options.input4, tmp_mates_mapping_file_name, tmp_align_file_name_list, options.output ) |
---|
797 | # Delete all temporary files |
---|
798 | for file_name in tmp_file_names: |
---|
799 | os.remove( file_name ) |
---|
800 | # Handle any invalid lines in the input data |
---|
801 | if total_skipped_lines: |
---|
802 | msgs = dict( bad_interval="Bad interval in line", |
---|
803 | inconsistent_read_lengths="Inconsistent read/quality lengths for seq #", |
---|
804 | inconsistent_reads="Inconsistent reads for seq #", |
---|
805 | inconsistent_sizes="Inconsistent sizes for seq #", |
---|
806 | missing_mate="Mapping file does not include mate on line", |
---|
807 | missing_quals="Missing quality values for name on line", |
---|
808 | missing_seq="Missing sequence for name on line", |
---|
809 | multiple_seqs="Multiple names for seq #", |
---|
810 | no_header="First quality sequence has no header", |
---|
811 | num_fields="Must have 4 fields in line", |
---|
812 | reads_paired="SAM flag indicates reads already paired on line", |
---|
813 | sam_flag="Bad SAM flag on line", |
---|
814 | sam_headers="SAM headers on line", |
---|
815 | sam_min_columns="Need 11 columns on line", |
---|
816 | two_mate_names="Mate name already seen, line", |
---|
817 | wrong_seq_len="Size differs from length of seq #" ) |
---|
818 | print "Skipped %d invalid lines: " |
---|
819 | msg = "" |
---|
820 | for k, v in skipped_lines.items(): |
---|
821 | if v[0]: |
---|
822 | # v[0] is the number of times the error occurred |
---|
823 | # v[1] is the position of the line or sequence in the file |
---|
824 | # v[2] is the name of the sequence or the text of the line |
---|
825 | msg += "(%d)%s %d:%s. " % ( v[0], msgs[k], v[1], v[2] ) |
---|
826 | print msg |
---|
827 | |
---|
if __name__ == "__main__":
    __main__()
---|