Context Navigation

lastz_paired_reads_wrapper.py @ 2

リビジョン 2, 42.8 KB (コミッタ: hatakeyama, 15 年前)
import galaxy-central

Rev	行番号
[2]	1	#!/usr/bin/env python
	2
	3	"""
	4	Runs Lastz paired read alignment process
	5	Written for Lastz v. 1.02.00.
	6
	7	# Author(s): based on various scripts written by Bob Harris (rsharris@bx.psu.edu),
	8	# then tweaked to this form by Greg Von Kuster (greg@bx.psu.edu)
	9
	10	This tool takes the following input:
	11	a. A collection of 454 paired end reads ( a fasta file )
	12	b. A linker sequence ( a very small fasta file )
	13	c. A reference genome ( nib, 2bit or fasta )
	14
	15	and uses the following process:
	16	1. Split reads into mates: the input to this step is the read file XXX.fasta, and the output is three
	17	files; XXX.short.fasta, XXX.long.fasta and XXX.mapping. The mapping file records the information necessary
	18	to convert mate coordinates back into the original read, which is needed later in the process.
	19
	20	2. Align short mates to the reference: this runs lastz against every chromosome. The input is XXX.short.fasta
	21	and the reference genome, and the output is a SAM file, XXX.short.sam.
	22
	23	3. Align long mates to the reference: this runs lastz against every chromosome. The input is XXX.long.fasta
	24	and the reference genome, and the output is a SAM file, XXX.long.sam.
	25
	26	4. Combine, and convert mate coordinates back to read coordinates. The input is XXX.mapping, XXX.short.sam and
	27	XXX.long.sam, and the output is XXX.sam.
	28
	29	usage: lastz_paired_reads_wrapper.py [options]
	30	--ref_name: The reference name to change all output matches to
	31	--ref_source: The reference is cached or from the history
	32	--source_select: Use pre-set or cached reference file
	33	--input1: The name of the reference file if using history or reference base name if using cached
	34	--input2: The reads file to align
	35	--input3: The sequencing linker file
	36	--input4: The base quality score 454 file
	37	--ref_sequences: The number of sequences in the reference file if using one from history
	38	--output: The name of the output file
	39	--lastz_seqs_file_dir: Directory of local lastz_seqs.loc file
	40	"""
	41	import optparse, os, subprocess, shutil, sys, tempfile, time
	42	from string import maketrans
	43
	44	from galaxy import eggs
	45	import pkg_resources
	46	pkg_resources.require( 'bx-python' )
	47	from bx.seq.twobit import *
	48	from bx.seq.fasta import FastaReader
	49	from galaxy.util.bunch import Bunch
	50	from galaxy.util import string_as_bool
	51
	52	# Column indexes for SAM required fields
	53	SAM_QNAME_COLUMN = 0
	54	SAM_FLAG_COLUMN = 1
	55	SAM_RNAME_COLUMN = 2
	56	SAM_POS_COLUMN = 3
	57	SAM_MAPQ_COLUMN = 4
	58	SAM_CIGAR_COLUMN = 5
	59	SAM_MRNM_COLUMN = 6
	60	SAM_MPOS_COLUMN = 7
	61	SAM_ISIZE_COLUMN = 8
	62	SAM_SEQ_COLUMN = 9
	63	SAM_QUAL_COLUMN = 10
	64	SAM_MIN_COLUMNS = 11
	65	# SAM bit-encoded flags
	66	BAM_FPAIRED = 1 # the read is paired in sequencing, no matter whether it is mapped in a pair
	67	BAM_FPROPER_PAIR = 2 # the read is mapped in a proper pair
	68	BAM_FUNMAP = 4 # the read itself is unmapped; conflictive with BAM_FPROPER_PAIR
	69	BAM_FMUNMAP = 8 # the mate is unmapped
	70	BAM_FREVERSE = 16 # the read is mapped to the reverse strand
	71	BAM_FMREVERSE = 32 # the mate is mapped to the reverse strand
	72	BAM_FREAD1 = 64 # this is read1
	73	BAM_FREAD2 = 128 # this is read2
	74	BAM_FSECONDARY = 256 # not primary alignment
	75	BAM_FQCFAIL = 512 # QC failure
	76	BAM_FDUP = 1024 # optical or PCR duplicate
	77
	78	# Keep track of all created temporary files so they can be deleted
	79	global tmp_file_names
	80	tmp_file_names = []
	81	# The values in the skipped_lines dict are tuples consisting of:
	82	# - the number of skipped lines for that error
	83	# If not a sequence error:
	84	# - the 1st line number on which the error was found
	85	# - the text of the 1st line on which the error was found
	86	# If a sequence error:
	87	# - The number of the sequence in the file
	88	# - the sequence name on which the error occurred
	89	# We may need to improve dealing with file position and text as
	90	# much of it comes from temporary files that are created from the
	91	# inputs, and not the inputs themselves, so this could be confusing
	92	# to the user.
	93	global skipped_lines
	94	skipped_lines = dict( bad_interval=( 0, 0, '' ),
	95	inconsistent_read_lengths=( 0, 0, '' ),
	96	inconsistent_reads=( 0, 0, '' ),
	97	inconsistent_sizes=( 0, 0, '' ),
	98	missing_mate=( 0, 0, '' ),
	99	missing_quals=( 0, 0, '' ),
	100	missing_seq=( 0, 0, '' ),
	101	multiple_seqs=( 0, 0, '' ),
	102	no_header=( 0, 0, '' ),
	103	num_fields=( 0, 0, '' ),
	104	reads_paired=( 0, 0, '' ),
	105	sam_flag=( 0, 0, '' ),
	106	sam_headers=( 0, 0, '' ),
	107	sam_min_columns=( 0, 0, '' ),
	108	two_mate_names=( 0, 0, '' ),
	109	wrong_seq_len=( 0, 0, '' ) )
	110	global total_skipped_lines
	111	total_skipped_lines = 0
	112
	113	def stop_err( msg ):
	114	sys.stderr.write( "%s" % msg )
	115	sys.exit()
	116
	117	def skip_line( error_key, position, text ):
	118	if not skipped_lines[ error_key ][2]:
	119	skipped_lines[ error_key ][1] = position
	120	skipped_lines[ error_key ][2] = text
	121	skipped_lines[ error_key ][0] += 1
	122	total_skipped_lines += 1
	123
	124	def get_tmp_file_name( dir=None, suffix=None ):
	125	"""
	126	Return a unique temporary file name that can be managed. The
	127	file must be manually removed after use.
	128	"""
	129	if dir and suffix:
	130	tmp_fd, tmp_name = tempfile.mkstemp( dir=dir, suffix=suffix )
	131	elif dir:
	132	tmp_fd, tmp_name = tempfile.mkstemp( dir=dir )
	133	elif suffix:
	134	tmp_fd, tmp_name = tempfile.mkstemp( suffix=suffix )
	135	os.close( tmp_fd )
	136	tmp_file_names.append( tmp_name )
	137	return tmp_name
	138
	139	def run_command( command ):
	140	proc = subprocess.Popen( args=command, shell=True, stderr=subprocess.PIPE, )
	141	proc.wait()
	142	stderr = proc.stderr.read()
	143	proc.wait()
	144	if stderr:
	145	stop_err( stderr )
	146
	147	def split_paired_reads( input2, combined_linker_file_name ):
	148	"""
	149	Given a fasta file of allegedly paired end reads ( input2 ), and a list of intervals
	150	showing where the linker is on each read ( combined_linker_file_name ), split the reads into left and right
	151	halves.
	152
	153	The input intervals look like this. Note that they may include multiple intervals for the same read
	154	( which should overlap ), and we use the union of them as the linker interval. Non-overlaps are
	155	reported to the user, and those reads are not processed. Starts are origin zero.
	156
	157	#name strand start len size
	158	FG3OYDA05FTEES + 219 42 283
	159	FG3OYDA05FVOLL + 263 41 416
	160	FG3OYDA05FFL7J + 81 42 421
	161	FG3OYDA05FOQWE + 55 42 332
	162	FG3OYDA05FV4DW + 297 42 388
	163	FG3OYDA05FWAQV + 325 42 419
	164	FG3OYDA05FVLGA + 90 42 367
	165	FG3OYDA05FWJ71 + 58 42 276
	166
	167	The output gives each half-sequence on a separate line, like this. This allows easy sorting of the
	168	sequences by length, after the fact.
	169
	170	219 FG3OYDA05FTEES_L TTTAGTTACACTTAACTCACTTCCATCCTCTAAATACGTGATTACCTTTC...
	171	22 FG3OYDA05FTEES_R CCTTCCTTAAGTCCTAAAACTG
	172	"""
	173	# Bob says these should be hard-coded.
	174	seq_len_lower_threshold = 17
	175	short_mate_cutoff = 50
	176	# We need to pass the name of this file back to the caller.
	177	tmp_mates_file_name = get_tmp_file_name( suffix='mates.txt' )
	178	mates_file = file( tmp_mates_file_name, "w+b" )
	179	# Read the linker intervals
	180	combined_linker_file = file( combined_linker_file_name, "rb" )
	181	read_to_linker_dict = {}
	182	i = 0
	183	for i, line in enumerate( combined_linker_file ):
	184	line = line.strip()
	185	if line.startswith( "#" ):
	186	continue
	187	if line.find( '#' ) >= 0:
	188	line = line.split( "#", 1 )[0].rstrip()
	189	fields = line.split()
	190	if len( fields ) != 4:
	191	skip_line( 'num_fields', i+1, line )
	192	continue
	193	name, start, length, size = fields
	194	start = int( start )
	195	length = int( length )
	196	size = int( size )
	197	end = start + length
	198	if end > size:
	199	skip_line[ 'bad_interval' ] += 1
	200	continue
	201	if name not in read_to_linker_dict:
	202	read_to_linker_dict[ name ] = ( start, end, size )
	203	continue
	204	if read_to_linker_dict[ name ] == None:
	205	# Read previously marked as non-overlapping intervals, so skip this sequence - see below
	206	continue
	207	( s, e, sz ) = read_to_linker_dict[ name ]
	208	if sz != size:
	209	skip_line( 'inconsistent_sizes', i+1, name )
	210	continue
	211	if s > end or e < start:
	212	# Non-overlapping intervals, so skip this sequence
	213	read_to_linker_dict[ name ] = None
	214	continue
	215	read_to_linker_dict[ name ] = ( min( s, start ), max( e, end ), size )
	216	combined_linker_file.close()
	217	# We need to pass the name of this file back to the caller.
	218	tmp_mates_mapping_file_name = get_tmp_file_name( suffix='mates.mapping' )
	219	mates_mapping_file = file( tmp_mates_mapping_file_name, 'w+b' )
	220	# Process the sequences
	221	seqs = 0
	222	fasta_reader = FastaReader( file( input2, 'rb' ) )
	223	while True:
	224	seq = fasta_reader.next()
	225	if not seq:
	226	break
	227	seqs += 1
	228	if seq.name not in read_to_linker_dict:
	229	if seq.length > seq_len_lower_threshold:
	230	mates_file.write( "%-3d %s %s\n" % ( seq.length, seq.name, seq.text ) )
	231	read_to_linker_dict[ seq.name ] = ""
	232	continue
	233	if read_to_linker_dict[ seq.name ] == "":
	234	skip_line( 'multiple_seqs', seqs, seq.name )
	235	continue
	236	if read_to_linker_dict[ seq.name ] == None:
	237	# Read previously marked as non-overlapping intervals, so skip this sequence - see above
	238	continue
	239	( start, end, size ) = read_to_linker_dict[ seq.name ]
	240	if seq.length != size:
	241	skip_line( 'wrong_seq_len', seqs, seq.name )
	242	continue
	243	left = seq.text[ :start ]
	244	right = seq.text[ end: ]
	245	left_is_small = len( left ) <= seq_len_lower_threshold
	246	right_is_small = len( right ) <= seq_len_lower_threshold
	247	if left_is_small and right_is_small:
	248	continue
	249	if not left_is_small:
	250	mates_file.write( "%-3d %s %s\n" % ( len( left ), seq.name + "_L", left ) )
	251	mates_mapping_file.write( "%s %s %s %s\n" % ( seq.name + "_L", seq.name, 0, size - start ) )
	252	if not right_is_small:
	253	mates_file.write( "%-3d %s %s\n" % ( len( right ), seq.name + "_R", right ) )
	254	mates_mapping_file.write( "%s %s %s %s\n" % ( seq.name + "_R", seq.name, end, 0 ) )
	255	read_to_linker_dict[ seq.name ] = ""
	256	combined_linker_file.close()
	257	mates_file.close()
	258	mates_mapping_file.close()
	259	# Create temporary files for short and long mates
	260	tmp_mates_short_file_name = get_tmp_file_name( suffix='mates.short' )
	261	tmp_mates_long_file_name = get_tmp_file_name( suffix='mates.long' )
	262	tmp_mates_short = open( tmp_mates_short_file_name, 'w+b' )
	263	tmp_mates_long = open( tmp_mates_long_file_name, 'w+b' )
	264	i = 0
	265	for i, line in enumerate( file( tmp_mates_file_name, 'rb' ) ):
	266	fields = line.split()
	267	seq_len = int( fields[0] )
	268	seq_name = fields[1]
	269	seq_text = fields[2]
	270	if seq_len <= short_mate_cutoff:
	271	tmp_mates_short.write( ">%s\n%s\n" % ( seq_name, seq_text ) )
	272	else:
	273	tmp_mates_long.write( ">%s\n%s\n" % ( seq_name, seq_text ) )
	274	tmp_mates_short.close()
	275	tmp_mates_long.close()
	276	return tmp_mates_mapping_file_name, tmp_mates_file_name, tmp_mates_short_file_name, tmp_mates_long_file_name
	277
	278	def align_mates( input1, ref_source, ref_name, ref_sequences, tmp_mates_short_file_name, tmp_mates_long_file_name ):
	279	tmp_align_file_names = []
	280	if ref_source == 'history':
	281	# Reference is a fasta dataset from the history
	282	# Create temporary files to contain the output from lastz executions
	283	tmp_short_file_name = get_tmp_file_name( suffix='short_out' )
	284	tmp_align_file_names.append( tmp_short_file_name )
	285	tmp_long_file_name = get_tmp_file_name( suffix='long_out' )
	286	tmp_align_file_names.append( tmp_long_file_name )
	287	seqs = 0
	288	fasta_reader = FastaReader( open( input1 ) )
	289	while True:
	290	# Read the next sequence from the reference dataset. Note that if the reference contains
	291	# a small number of chromosomes this loop is ok, but in many cases the genome has a bunch
	292	# of small straggler scaffolds and contigs and it is a computational waste to do each one
	293	# of these in its own run. There is an I/O down side to running by subsets (even if they are
	294	# one sequence per subset), compared to splitting the reference into sizes of 250 mb. With
	295	# the subset action, lastz still has to read and parse the entire file for every run (this
	296	# is true for fasta, but for .2bit files it can access each sequence directly within the file,
	297	# so the overhead is minimal).
	298	"""
	299	:> output_file (this creates the output file, empty)
	300	while there are more sequences to align
	301	find the next sequences that add up to 250M, put their names in farf.names
	302	lastz ${refFile}[subset=farf.names][multi][unmask] ${matesPath}/${matesFile} ...
	303	>> output_file
	304	"""
	305	seq = fasta_reader.next()
	306	if not seq:
	307	break
	308	seqs += 1
	309	# Create a temporary file to contain the current sequence as input to lastz.
	310	# We're doing this a bit differently here since we could be generating a huge
	311	# number of temporary files.
	312	tmp_in_fd, tmp_in_file_name = tempfile.mkstemp( suffix='seq_%d_in' % seqs )
	313	tmp_in_file = os.fdopen( tmp_in_fd, 'w+b' )
	314	tmp_in_file.write( '>%s\n%s\n' % ( seq.name, seq.text ) )
	315	tmp_in_file.close()
	316	# Align short mates
	317	command = 'lastz %s[unmask]%s %s ' % ( tmp_in_file_name, ref_name, tmp_mates_short_file_name )
	318	command += 'Z=1 --seed=1111111011111 --notrans --maxwordcount=90% --match=1,3 O=1 E=3 X=15 K=10 Y=12 L=18 --ambiguousn --noytrim --identity=95 --coverage=80 --continuity=95 --format=softsam- '
	319	command += '>> %s' % tmp_short_file_name
	320	run_command( command )
	321	# Align long mates
	322	command = 'lastz %s[unmask]%s %s ' % ( tmp_in_file_name, ref_name, tmp_mates_long_file_name )
	323	command += 'Z=15 W=13 --notrans --exact=18 --maxwordcount=90% --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --noytrim --identity=95 --coverage=90 --continuity=95 --format=softsam- '
	324	command += '>> %s' % tmp_long_file_name
	325	run_command( command )
	326	# Remove the temporary file that contains the current sequence
	327	os.remove( tmp_in_file_name )
	328	else:
	329	# Reference is a locally cached 2bit file, split lastz calls across number of chroms in 2bit file
	330	tbf = TwoBitFile( open( input1, 'rb' ) )
	331	for chrom in tbf.keys():
	332	# Align short mates
	333	tmp_short_file_name = get_tmp_file_name( suffix='short_vs_%s' % chrom )
	334	tmp_align_file_names.append( tmp_short_file_name )
	335	command = 'lastz %s/%s[unmask]%s %s ' % ( input1, chrom, ref_name, tmp_mates_short_file_name )
	336	command += 'Z=1 --seed=1111111011111 --notrans --maxwordcount=90% --match=1,3 O=1 E=3 X=15 K=10 Y=12 L=18 --ambiguousn --noytrim --identity=95 --coverage=80 --continuity=95 --format=softsam- '
	337	command += '> %s' % tmp_short_file_name
	338	run_command( command )
	339	# Align long mates
	340	tmp_long_file_name = get_tmp_file_name( suffix='long_vs_%s' % chrom )
	341	tmp_align_file_names.append( tmp_long_file_name )
	342	command = 'lastz %s/%s[unmask]%s %s ' % ( input1, chrom, ref_name, tmp_mates_long_file_name )
	343	command += 'Z=15 W=13 --notrans --exact=18 --maxwordcount=90% --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --noytrim --identity=95 --coverage=90 --continuity=95 --format=softsam- '
	344	command += '> %s' % tmp_long_file_name
	345	run_command( command )
	346	return tmp_align_file_names
	347
	348	def paired_mate_unmapper( input2, input4, tmp_mates_mapping_file_name, tmp_align_file_name_list, output ):
	349	"""
	350	Given a SAM file corresponding to alignments of subsegments of paired 'reads' to a reference sequence,
	351	convert the positions on the subsegments to positions on the reads. Also (optionally) add quality values.
	352
	353	The input file is in SAM format, as shown below. Each line represents the alignment of a part of a read
	354	to a reference sequence. Read pairs are indicated by suffixes in their names. Normally, the suffixes _L
	355	and _R indicate the left and right mates of reads (this can be overridden with the --left and --right
	356	options). Reads that were not mates have no suffix.
	357
	358	(SAM header lines omitted)
	359	F2YP0BU02G7LK5_R 16 chr21 15557360 255 40M * 0 0 ATTTTATTCTCTTTGAAGCAATTGTGAATGGGAGTTTACT *
	360	F2YP0BU02HXV58_L 16 chr21 15952091 255 40M6S * 0 0 GCAAATTGTGCTGCTTTAAACATGCGTGTGCAAGTATCTTtttcat *
	361	F2YP0BU02HREML_R 0 chr21 16386077 255 33M5S * 0 0 CCAAAGTTCTGGGATTACAGGCGTGAGCCATCGcgccc *
	362	F2YP0BU02IOF1F_L 0 chr21 17567321 255 7S28M * 0 0 taaagagAAGAATTCTCAACCCAGAATTTCATATC *
	363	F2YP0BU02IKX84_R 16 chr21 18491628 255 22M1D18M9S * 0 0 GTCTCTACCAAAAAATACAAAAATTAGCCGGGCGTGGTGGcatgtctgt *
	364	F2YP0BU02GW5VA_L 16 chr21 20255344 255 6S32M * 0 0 caagaaCAAACACATTCAAAAGCTAGTAGAAGGCAAGA *
	365	F2YP0BU02JIMJ4_R 0 chr21 22383051 255 19M * 0 0 CCCTTTATCATTTTTTATT *
	366	F2YP0BU02IXZGF_L 16 chr21 23094798 255 13M1I18M * 0 0 GCAAGCTCCACTTCCCGGGTTCACGCCATTCT *
	367	F2YP0BU02IODR5_L 0 chr21 30935325 255 37M * 0 0 GAAATAAAGGGTATTCAATTAGGAAAAGAGGAAGTCA *
	368	F2YP0BU02IMZBL_L 16 chr21 31603486 255 28M1D1M * 0 0 ATACAAAAATTAGCCGGGCACAGTGGCAG *
	369	F2YP0BU02JA9PR_L 16 chr21 31677159 255 23M * 0 0 CACACCTGTAACCCCAGCACTTT *
	370	F2YP0BU02HKC61_R 0 chr21 31678718 255 40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT *
	371	F2YP0BU02HKC61_R 0 chr21 31678718 255 40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT *
	372	F2YP0BU02HVA88 16 chr21 31703558 255 1M1D35M8S * 0 0 TGGGATTACAGGCGTGAGCTACCACACCCAGCCAGAgttcaaat *
	373	F2YP0BU02JDCF1_L 0 chr21 31816600 255 38M * 0 0 AGGAGAATCGCTTGAACCCAGGAGGCAGAGGTTGCGGT *
	374	F2YP0BU02GZ1GO_R 0 chr21 33360122 255 6S38M * 0 0 cctagaCTTCACACACACACACACACACACACACACACACACAC *
	375	F2YP0BU02FX387_L 16 chr22 14786201 255 26M * 0 0 TGGATGAAGCTGGAAACCATCATTCT *
	376	F2YP0BU02IF2NE_R 0 chr22 16960842 255 40M10S * 0 0 TGGCATGCACCTGTAGTCTCAGCTACTTGGGAGGCTGAGGtgggaggatc *
	377	F2YP0BU02F4TVA 0 chr22 19200522 255 49M * 0 0 CCTGGGAGGCGGAGGTTGCAGTGAGCCGAGATCACGCCATTGCACTCCA *
	378	F2YP0BU02HKC61_R 16 chr22 29516998 255 8S32M * 0 0 agacagagTCTTGCTTTGTCACCCAGGCTGGAGTGCAGTG *
	379	F2YP0BU02FS4EM_R 0 chr22 30159364 255 29M * 0 0 CTCCTGCCTCAGCCTCCCGAGTAGTTGGG *
	380	F2YP0BU02G197P_L 0 chr22 32044496 255 40M10S * 0 0 TTGTTGGACATTTGGGTTGGTTCCAAGTCTTTGCTATTGTgaataatgcc *
	381	F2YP0BU02FIING 16 chr22 45959944 255 3M1I11M1I26M * 0 0 AGCTATGGTACTGGCTATGAAAGCAGACACATAGACCAATGG *
	382	F2YP0BU02GUB9L_L 16 chr22 49198404 255 16M1I20M * 0 0 CACCACGCTCGGCTAATTTTTGTATTTTTAGTAGAGA *
	383
	384	The user must provide a mapping file (which might better be called an unmapping file). This file is usually
	385	created by split_paired_reads, and tells us how to map the subsegments back to original coordinates in a single
	386	read (this means the left and right mates were part of a single read). The mapping file contains four columns.
	387	The first two give the mates's name (including the suffix) and the read name. The last two columns describe how
	388	much of the full original sequence is missing from the mate. For example, in the read below, the left mate is
	389	missing 63 on the right (42 for the linker and 21 for the right half). The right mate is missing 339 on the left.
	390
	391	left half: TTTCAACATATGCAAATCAATAAATGTAATCCAGCATATAAACAGAACCA
	392	AAGACAAAAACCACATGATTATCTCAATAGATGCAGAAAAGGCCTTCGGC
	393	AAAATTCAACAAAACTCCATGCTAAAACTCTCAATAAGGTATTGATGGGA
	394	CATGCCGCATAATAATAAGACATATCTATGACAAACCCACAGCCAATATC
	395	ATGCTGAATGCACAAAAATTGGAAGCATTCCCTTTGAAAACTGGCACAAG
	396	ACTGGGATGCCCTCTCTCACAACTCCTATTCAACATAGTGTTGGAAG
	397	linker: CGTAATAACTTCGTATAGCATACATTATACGAAGTCATACGA
	398	right half: CTCCTGCCTCAGCCTCCCGAG
	399
	400	mate_name read_name offset_to_start offset_from_end
	401	F2YP0BU02FS4EM_L F2YP0BU02FS4EM 0 71
	402	F2YP0BU02FS4EM_R F2YP0BU02FS4EM 339 0
	403
	404	The user can also specify a quality scores file, which should look something like this. Quality values are presumed
	405	to be PHRED scores, written in space-delimited decimal.
	406
	407	>F2YP0BU02FS4EM
	408	38 38 38 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 38 21 21 21 40
	409	40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 33
	410	32 32 40 40 40 21 21 18 18 21 34 34 31 40 40 40 40 40 40 40 40 40 40 40 40
	411	40 40 40 40 40 40 40 40 40 40 40 32 32 32 32 40 40 40 40 40 40 40 34 34 35
	412	31 31 28 28 33 33 33 36 36 36 17 17 17 19 26 36 36 36 40 40 40 40 40 33 34
	413	34 34 39 39 39 40 40 40 40 40 33 33 34 34 40 40 40 40 40 40 40 39 39 39 40
	414	40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
	415	40 40 40 40 40 40 40 39 39 39 39 39 39 40 40 40 39 39 39 40 40 40 40 40 40
	416	40 40 40 40 40 40 40 40 40 40 40 40 40 26 26 26 26 26 40 40 38 38 37 35 33
	417	36 40 19 17 17 17 17 19 19 23 30 20 20 20 23 35 40 36 36 36 36 36 36 36 36
	418	39 40 34 20 27 27 35 39 40 37 40 40 40 40 40 40 40 40 40 40 34 34 35 39 40
	419	40 40 40 40 40 40 39 39 39 40 40 40 40 36 36 32 32 28 28 29 30 36 40 30 26
	420	26 26 34 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 39 39 39
	421	40 39 35 34 34 40 40 40 40 30 30 30 35 40 40 40 40 40 39 39 36 40 40 40 40
	422	39 39 39 39 30 30 28 35 35 39 40 40 40 40 40 35 35 35
	423	>F2YP0BU02G197P
	424	40 40 40 40 40 40 40 40 40 40 39 39 39 39 39 39 40 40 40 40 40 40 40 40 40
	425	40 40 40 40 26 26 26 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
	426	40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
	427	40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
	428	40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 34 34 34 40 40
	429	40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40
	430	40 40 40 40 40 40 40 34 34 34 34 40 40 40 40 34 34 34 34 40 40 40 40 40 40
	431	40 40 40 40 40 39 39 39 34 34 34 34 40 40 40 40 39 39 25 25 26 39 40 40 40
	432	40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
	433	33 33 33 33 40 35 21 21 21 30 38 40 40 40 40 40 40 40 40 35 35 30 30 30 40
	434	40 40 39 39 39 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40 40
	435	40 40 40 40 40 40 40 40 40 40 40 40 39 39 39 40 40 40 40 40 40 40 40 40 40
	436	40 40 40 39 39 39 40 40
	437	>F2YP0BU02FIING
	438	32 32 32 25 25 25 25 24 25 30 31 30 27 27 27 28 28 21 19 19 13 13 13 14 19
	439	19 17 19 16 16 25 28 22 21 17 17 18 25 24 25 25 25
	440
	441	The output file is also SAM:
	442
	443	(SAM header lines omitted)
	444	F2YP0BU02G7LK5 81 chr21 15557360 255 40M303H * 0 0 ATTTTATTCTCTTTGAAGCAATTGTGAATGGGAGTTTACT D>>>>IIIIIIHHG???IIIIIIIIIHHHFFEIH999HII
	445	F2YP0BU02HXV58 145 chr21 15952091 255 226H40M6S * 0 0 GCAAATTGTGCTGCTTTAAACATGCGTGTGCAAGTATCTTtttcat AA===DDDDAAAAD???:::ABBBBBAAA:888ECF;F>>>?8??@
	446	F2YP0BU02HREML 65 chr21 16386077 255 320H33M5S * 0 0 CCAAAGTTCTGGGATTACAGGCGTGAGCCATCGcgccc HH???HHIIIHFHIIIIIIICDDHHIIIIIIHHHHHHH
	447	F2YP0BU02IOF1F 129 chr21 17567321 255 7S28M409H * 0 0 taaagagAAGAATTCTCAACCCAGAATTTCATATC 4100<<A>4113:<EFGGGFFFHHHHHHDFFFFED
	448	F2YP0BU02IKX84 81 chr21 18491628 255 22M1D18M9S341H * 0 0 GTCTCTACCAAAAAATACAAAAATTAGCCGGGCGTGGTGGcatgtctgt ;;;=7@.55------?2?11112GGB=CCCCDIIIIIIIIIHHHHHHII
	449	F2YP0BU02GW5VA 145 chr21 20255344 255 286H6S32M * 0 0 caagaaCAAACACATTCAAAAGCTAGTAGAAGGCAAGA IIIIIIIHHHIIIIIIICCCCIIIIIIIIIIIIIIIII
	450	F2YP0BU02JIMJ4 65 chr21 22383051 255 208H19M * 0 0 CCCTTTATCATTTTTTATT 555544E?GE113344I22
	451	F2YP0BU02IXZGF 145 chr21 23094798 255 291H13M1I18M * 0 0 GCAAGCTCCACTTCCCGGGTTCACGCCATTCT IIIIIIIIIIIGG;;;GGHIIIIIGGGIIIII
	452	F2YP0BU02IODR5 129 chr21 30935325 255 37M154H * 0 0 GAAATAAAGGGTATTCAATTAGGAAAAGAGGAAGTCA 6...7/--..,30;9<<>@BFFFAAAAHIIIIIH@@@
	453	F2YP0BU02IMZBL 145 chr21 31603486 255 342H28M1D1M * 0 0 ATACAAAAATTAGCCGGGCACAGTGGCAG BB1552222<<>9==8;;?AA=??A???A
	454	F2YP0BU02JA9PR 145 chr21 31677159 255 229H23M * 0 0 CACACCTGTAACCCCAGCACTTT IIIIIIIIIIICCCCIIIIIHHH
	455	F2YP0BU02HKC61 65 chr21 31678718 255 300H40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT AA@BD:::==AAA@A?8888:<90004<>>?><<<<4442
	456	F2YP0BU02HKC61 65 chr21 31678718 255 300H40M * 0 0 CACTGCACTCCAGCCTGGGTGACAAAGCAAGACTCTGTCT AA@BD:::==AAA@A?8888:<90004<>>?><<<<4442
	457	F2YP0BU02HVA88 16 chr21 31703558 255 1M1D35M8S * 0 0 TGGGATTACAGGCGTGAGCTACCACACCCAGCCAGAgttcaaat >8888DFFHHGFHHHH@@?@?DDC96666HIIIFFFFFFFFFFF
	458	F2YP0BU02JDCF1 129 chr21 31816600 255 38M103H * 0 0 AGGAGAATCGCTTGAACCCAGGAGGCAGAGGTTGCGGT IIIIIIIIIIIHHHIIHHHIIIIIIIIIIIIIIIIIII
	459	F2YP0BU02GZ1GO 65 chr21 33360122 255 76H6S38M * 0 0 cctagaCTTCACACACACACACACACACACACACACACACACAC BBBBD?:688CFFFFFFFFFFFFFFFFFFFFFFFFFFDDBBB51
	460	F2YP0BU02FX387 145 chr22 14786201 255 201H26M * 0 0 TGGATGAAGCTGGAAACCATCATTCT IIHHHHHHHHHHHHHFFFFFFFFFFF
	461	F2YP0BU02IF2NE 65 chr22 16960842 255 209H40M10S * 0 0 TGGCATGCACCTGTAGTCTCAGCTACTTGGGAGGCTGAGGtgggaggatc BAAADDDDFDDDDDDBBA889<A?4444000@<>AA?9444;;8>77<7-
	462	F2YP0BU02F4TVA 0 chr22 19200522 255 49M * 0 0 CCTGGGAGGCGGAGGTTGCAGTGAGCCGAGATCACGCCATTGCACTCCA FFF???FFFFFIIIIIIIIIIIIIIIIIIIIIIIHHIIFHFFFGDDB=5
	463	F2YP0BU02HKC61 81 chr22 29516998 255 8S32M300H * 0 0 agacagagTCTTGCTTTGTCACCCAGGCTGGAGTGCAGTG 2444<<<<>?>><40009<:8888?A@AAA==:::DB@AA
	464	F2YP0BU02FS4EM 65 chr22 30159364 255 339H29M * 0 0 CTCCTGCCTCAGCCTCCCGAGTAGTTGGG IIIIHHEIIIIHHHH??=DDHIIIIIDDD
	465	F2YP0BU02G197P 129 chr22 32044496 255 40M10S258H * 0 0 TTGTTGGACATTTGGGTTGGTTCCAAGTCTTTGCTATTGTgaataatgcc IIIIIIIIIIHHHHHHIIIIIIIIIIIII;;;IIIIIIIIIIIIIIIIII
	466	F2YP0BU02FIING 16 chr22 45959944 255 3M1I11M1I26M * 0 0 AGCTATGGTACTGGCTATGAAAGCAGACACATAGACCAATGG :::9:32267=:114244/...446==<<<?@?:9::::AAA
	467	F2YP0BU02GUB9L 145 chr22 49198404 255 176H16M1I20M * 0 0 CACCACGCTCGGCTAATTTTTGTATTTTTAGTAGAGA IIIIIIIIIHAAC;<</////@4F5778;IIIIIIII
	468
	469	"""
	470	left_suffix = "_L"
	471	right_suffix = "_R"
	472	# Read the mapping
	473	mate_to_read_dict = {}
	474	i = 0
	475	for i, line in enumerate( file( tmp_mates_mapping_file_name, 'rb' ) ):
	476	line = line.strip()
	477	if not line.startswith( "#" ):
	478	fields = line.split()
	479	if len( fields ) != 4:
	480	skip_line( "num_fields", i+1, line )
	481	continue
	482	mate_name, read_name, s_offset, e_offset = fields
	483	if mate_name in mate_to_read_dict:
	484	skip_line( 'two_mate_names', i+1, mate_name )
	485	continue
	486	mate_to_read_dict[ mate_name ] = ( read_name, int( s_offset ), int( e_offset ) )
	487	# Read sequence data
	488	read_to_nucs_dict = {}
	489	seqs = 0
	490	fasta_reader = FastaReader( file( input2, 'rb' ) )
	491	while True:
	492	seq = fasta_reader.next()
	493	if not seq:
	494	break
	495	seqs += 1
	496	seq_text_upper = seq.text.upper()
	497	if seq.name in read_to_nucs_dict:
	498	if seq_text_upper != read_to_nucs_dict[ seq.name ]:
	499	skip_line( 'inconsistent_reads', seqs, seq.name )
	500	continue
	501	read_to_nucs_dict[ seq.name ] = seq_text_upper
	502	# Read quality data
	503	def quality_sequences( f ):
	504	seq_name = None
	505	seq_quals = None
	506	line_number = 0
	507	for line in f:
	508	line_number += 1
	509	line = line.strip()
	510	if line.startswith( ">" ):
	511	if seq_name != None:
	512	yield ( seq_name, seq_quals, seq_line )
	513	seq_name = sequence_name( line )
	514	seq_line = line_number
	515	seq_quals = []
	516	elif seq_name is None:
	517	skip_line( 'no_header', line_number, line )
	518	continue
	519	else:
	520	seq_quals += [ int( q ) for q in line.split() ]
	521	if seq_name is not None:
	522	yield ( seq_name, seq_quals, seq_line )
	523	def sequence_name( s ):
	524	s = s[ 1: ].strip()
	525	if not s:
	526	return ""
	527	else:
	528	return s.split()[ 0 ]
	529	read_to_quals_dict = {}
	530	# TODO: should we use Dan's fastaNamedReader here?
	531	for seq_name, quals, line_number in quality_sequences( file( input4 ) ):
	532	quals = samify_phred_scores( quals )
	533	if seq_name in read_to_quals_dict:
	534	if quals != read_to_quals_dict[ seq_name ]:
	535	skip_line( 'inconsistent_reads', line_number, seq_name )
	536	continue
	537	if len( quals ) != len( read_to_nucs_dict[ seq_name ] ):
	538	skip_line( 'inconsistent_read_lengths', line_number, seq_name )
	539	continue
	540	read_to_quals_dict[ seq_name ] = quals
	541	# process the SAM file
	542	tmp_align_file_names = ' '.join( tmp_align_file_name_list )
	543	combined_chrom_file_name = get_tmp_file_name( suffix='combined_chrom' )
	544	command = 'cat %s \| grep -v "^@" \| sort -k 1 > %s' % ( tmp_align_file_names, combined_chrom_file_name )
	545	run_command( command )
	546	fout = file( output, 'w+b' )
	547	has_non_header = False
	548	i = 0
	549	for i, line in enumerate( file( combined_chrom_file_name, 'rb' ) ):
	550	line = line.strip()
	551	if line.startswith( "@" ):
	552	if has_non_header:
	553	skip_line( 'sam_headers', i+1, line )
	554	continue
	555	fout.write( "%s\n" % line )
	556	continue
	557	has_non_header = True
	558	fields = line.split()
	559	num_fields = len( fields )
	560	if num_fields < SAM_MIN_COLUMNS:
	561	skip_line( 'sam_min_columns', i+1, line )
	562	continue
	563	# Set flags for mates
	564	try:
	565	flag = int( fields[ SAM_FLAG_COLUMN ] )
	566	except ValueError:
	567	skip_line( 'sam_flag', i+1, line )
	568	continue
	569	if not( flag & ( BAM_FPAIRED + BAM_FREAD1 + BAM_FREAD2 ) == 0 ):
	570	skip_line( 'reads_paired', i+1, line )
	571	continue
	572	mate_name = fields[ SAM_QNAME_COLUMN ]
	573	unmap_it = False
	574	half = None
	575	if mate_name.endswith( left_suffix ):
	576	flag += BAM_FPAIRED + BAM_FREAD2
	577	fields[ SAM_FLAG_COLUMN ] = "%d" % flag
	578	unmap_it = True
	579	half = "L"
	580	elif mate_name.endswith( right_suffix ):
	581	flag += BAM_FPAIRED + BAM_FREAD1
	582	fields[ SAM_FLAG_COLUMN ] = "%d" % flag
	583	unmap_it = True
	584	half = "R"
	585	on_plus_strand = ( flag & BAM_FREVERSE == 0 )
	586	# Convert position from mate to read by adding clipping to cigar
	587	if not unmap_it:
	588	read_name = mate_name
	589	else:
	590	try:
	591	read_name, s_offset, e_offset = mate_to_read_dict[ mate_name ]
	592	except KeyError:
	593	skip_line( 'missing_mate', i+1, mate_name )
	594	continue
	595	cigar = fields[ SAM_CIGAR_COLUMN ]
	596	cigar_prefix = None
	597	cigar_suffix = None
	598	if half == "L":
	599	if on_plus_strand:
	600	if s_offset > 0:
	601	cigar_prefix = ( s_offset, "S" )
	602	if e_offset > 0:
	603	cigar_suffix = ( e_offset, "H" )
	604	else:
	605	if e_offset > 0:
	606	cigar_prefix = ( e_offset, "H" )
	607	if s_offset > 0:
	608	cigar_suffix = ( s_offset, "S" )
	609	elif half == "R":
	610	if on_plus_strand:
	611	if s_offset > 0:
	612	cigar_prefix = ( s_offset, "H" )
	613	if e_offset > 0:
	614	cigar_suffix = ( e_offset, "S" )
	615	else:
	616	if e_offset > 0:
	617	cigar_prefix = ( e_offset, "S" )
	618	if s_offset > 0:
	619	cigar_suffix = ( s_offset, "H" )
	620	else:
	621	if on_plus_strand:
	622	if s_offset > 0:
	623	cigar_prefix = ( s_offset, "S" )
	624	if e_offset > 0:
	625	cigar_suffix = ( e_offset, "S" )
	626	else:
	627	if e_offset > 0:
	628	cigar_prefix = ( e_offset, "S" )
	629	if s_offset > 0:
	630	cigar_suffix = ( s_offset, "S" )
	631	if cigar_prefix != None:
	632	count, op = cigar_prefix
	633	cigar = prefix_cigar( "%d%s" % ( count, op ), cigar )
	634	if op == "S":
	635	refPos = int( fields[ SAM_POS_COLUMN ] ) - count
	636	fields[ SAM_POS_COLUMN ] = "%d" % refPos
	637	if cigar_suffix != None:
	638	count, op = cigar_suffix
	639	cigar = suffix_cigar( cigar,"%d%s" % ( count, op) )
	640	fields[ SAM_QNAME_COLUMN ] = read_name
	641	fields[ SAM_CIGAR_COLUMN ] = cigar
	642	# Fetch sequence and quality values, and flip/clip them
	643	if read_name not in read_to_nucs_dict:
	644	skip_line( 'missing_seq', i+1, read_name )
	645	continue
	646	nucs = read_to_nucs_dict[ read_name ]
	647	if not on_plus_strand:
	648	nucs = reverse_complement( nucs )
	649	quals = None
	650	if read_to_quals_dict != None:
	651	if read_name not in read_to_quals_dict:
	652	skip_line( 'missing_quals', i+1, read_name )
	653	continue
	654	quals = read_to_quals_dict[ read_name ]
	655	if not on_plus_strand:
	656	quals = reverse_string( quals )
	657	cigar = split_cigar( fields[ SAM_CIGAR_COLUMN ] )
	658	nucs, quals = clip_for_cigar( cigar, nucs, quals )
	659	fields[ SAM_SEQ_COLUMN ] = nucs
	660	if quals != None:
	661	fields[ SAM_QUAL_COLUMN ] = quals
	662	# Output the line
	663	fout.write( "%s\n" % "\t".join( fields ) )
	664	fout.close()
	665
	666	def prefix_cigar( prefix, cigar ):
	667	ix = 0
	668	while cigar[ ix ].isdigit():
	669	ix += 1
	670	if cigar[ ix ] != prefix[ -1 ]:
	671	return prefix + cigar
	672	count = int( prefix[ :-1 ] ) + int( cigar[ :ix ] )
	673	return "%d%s%s" % ( count, prefix[ -1 ], cigar[ ix+1: ] )
	674
	675	def suffix_cigar( cigar, suffix ):
	676	if cigar[ -1 ] != suffix[ -1 ]:
	677	return cigar + suffix
	678	ix = len( cigar ) - 2
	679	while cigar[ix].isdigit():
	680	ix -= 1
	681	ix += 1
	682	count = int( cigar[ ix:-1 ] ) + int( suffix[ :-1 ] )
	683	return "%s%d%s" % ( cigar[ :ix ], count, suffix[ -1 ] )
	684
	685	def split_cigar( text ):
	686	fields = []
	687	field = []
	688	for ch in text:
	689	if ch not in "MIDHS":
	690	field += ch
	691	continue
	692	if field == []:
	693	raise ValueError
	694	fields += [ ( int( "".join( field ) ), ch ) ]
	695	field = []
	696	if field != []:
	697	raise ValueError
	698	return fields
	699
	700	def clip_for_cigar( cigar, nucs, quals ):
	701	# Hard clip prefix
	702	count, op = cigar[0]
	703	if op == "H":
	704	nucs = nucs[ count: ]
	705	if quals != None:
	706	quals = quals[ count: ]
	707	count, op = cigar[ 1 ]
	708	# Soft clip prefix
	709	if op == "S":
	710	nucs = nucs[ :count ].lower() + nucs[ count: ]
	711	# Hard clip suffix
	712	count,op = cigar[ -1 ]
	713	if op == "H":
	714	nucs = nucs[ :-count ]
	715	if quals != None:
	716	quals = quals[ :-count ]
	717	count, op = cigar[ -2 ]
	718	# Soft clip suffix
	719	if op == "S":
	720	nucs = nucs[ :-count ] + nucs[ -count: ].lower()
	721	return nucs, quals
	722
	723	def samify_phred_scores( quals ):
	724	"""
	725	Convert a decimal list of phred base-quality scores to a sam quality string.
	726	Note that if a quality is outside the dynamic range of sam's ability to
	727	represent it, we clip the value to the max allowed. SAM quality scores
	728	range from chr(33) to chr(126).
	729	"""
	730	if min( quals ) >= 0 and max( quals ) <= 126-33:
	731	return "".join( [ chr( 33 + q ) for q in quals ] )
	732	else:
	733	return "".join( [ chr( max( 33, min( 126, 33+q ) ) ) for q in quals ] )
	734
	735	def reverse_complement( nucs ):
	736	complementMap = maketrans( "ACGTacgt", "TGCAtgca" )
	737	return nucs[ ::-1 ].translate( complementMap )
	738
	739	def reverse_string( s ):
	740	return s[ ::-1 ]
	741
	742	def __main__():
	743	# Parse command line
	744	# input1: a reference genome ( 2bit or fasta )
	745	# input2: a collection of 454 paired end reads ( a fasta file )
	746	# input3: a linker sequence ( a very small fasta file )
	747	# input4: a base quality score 454 file ( qual454 )
	748	parser = optparse.OptionParser()
	749	parser.add_option( '', '--ref_name', dest='ref_name', help='The reference name to change all output matches to' )
	750	parser.add_option( '', '--ref_source', dest='ref_source', help='The reference is cached or from the history' )
	751	parser.add_option( '', '--ref_sequences', dest='ref_sequences', help='Number of sequences in the reference dataset' )
	752	parser.add_option( '', '--source_select', dest='source_select', help='Use pre-set or cached reference file' )
	753	parser.add_option( '', '--input1', dest='input1', help='The name of the reference file if using history or reference base name if using cached' )
	754	parser.add_option( '', '--input2', dest='input2', help='The 454 reads file to align' )
	755	parser.add_option( '', '--input3', dest='input3', help='The sequencing linker file' )
	756	parser.add_option( '', '--input4', dest='input4', help='The base quality score 454 file' )
	757	parser.add_option( '', '--output', dest='output', help='The output file' )
	758	parser.add_option( '', '--lastz_seqs_file_dir', dest='lastz_seqs_file_dir', help='Directory of local lastz_seqs.loc file' )
	759
	760	( options, args ) = parser.parse_args()
	761	if options.ref_name != 'None':
	762	ref_name = '[nickname=%s]' % options.ref_name
	763	else:
	764	ref_name = ''
	765	if options.ref_source == 'history':
	766	# Reference is a fasta dataset from the history
	767	try:
	768	# Ensure there is at least 1 sequence in the dataset ( this may not be necessary ).
	769	error_msg = "The reference dataset is missing metadata, click the pencil icon in the history item and 'auto-detect' the metadata attributes."
	770	ref_sequences = int( options.ref_sequences )
	771	if ref_sequences < 1:
	772	stop_err( error_msg )
	773	except:
	774	stop_err( error_msg )
	775	else:
	776	ref_sequences = 0
	777	tmp_w12_name = get_tmp_file_name( suffix='vs_linker.W12' )
	778	tmp_T1_name = get_tmp_file_name( suffix='vs_linker.T1' )
	779	# Run lastz twice ( with different options ) on the linker sequence and paired end reads,
	780	# looking for the linker ( each run finds some the other doesn't )
	781	command = 'lastz %s %s W=12 --notrans --exact=18 --match=1,3 O=1 E=3 Y=10 L=18 --ambiguousn --coverage=85 --format=general-:name2,zstart2+,length2,size2 > %s' % \
	782	( options.input3, options.input2, tmp_w12_name )
	783	run_command( command )
	784	command = 'lastz %s %s T=1 --match=1,2 O=1 E=2 X=15 K=10 Y=15 L=18 --ambiguousn --coverage=85 --format=general-:name2,zstart2+,length2,size2 > %s' % \
	785	( options.input3, options.input2, tmp_T1_name )
	786	run_command( command )
	787	# Combine the alignment output from the two lastz runs
	788	tmp_combined_linker_file_name = get_tmp_file_name( suffix='vs_linker' )
	789	command = 'cat %s %s \| sort -u > %s' % ( tmp_w12_name, tmp_T1_name, tmp_combined_linker_file_name )
	790	run_command( command )
	791	# Use the alignment info to split reads into left and right mates
	792	tmp_mates_mapping_file_name, tmp_mates_file_name, tmp_mates_short_file_name, tmp_mates_long_file_name = split_paired_reads( options.input2, tmp_combined_linker_file_name )
	793	# Align mates to the reference - tmp_align_file_names is a list of file names created by align_mates()
	794	tmp_align_file_name_list = align_mates( options.input1, options.ref_source, ref_name, ref_sequences, tmp_mates_short_file_name, tmp_mates_long_file_name )
	795	# Combine and convert mate coordinates back to read coordinates
	796	paired_mate_unmapper( options.input2, options.input4, tmp_mates_mapping_file_name, tmp_align_file_name_list, options.output )
	797	# Delete all temporary files
	798	for file_name in tmp_file_names:
	799	os.remove( file_name )
	800	# Handle any invalid lines in the input data
	801	if total_skipped_lines:
	802	msgs = dict( bad_interval="Bad interval in line",
	803	inconsistent_read_lengths="Inconsistent read/quality lengths for seq #",
	804	inconsistent_reads="Inconsistent reads for seq #",
	805	inconsistent_sizes="Inconsistent sizes for seq #",
	806	missing_mate="Mapping file does not include mate on line",
	807	missing_quals="Missing quality values for name on line",
	808	missing_seq="Missing sequence for name on line",
	809	multiple_seqs="Multiple names for seq #",
	810	no_header="First quality sequence has no header",
	811	num_fields="Must have 4 fields in line",
	812	reads_paired="SAM flag indicates reads already paired on line",
	813	sam_flag="Bad SAM flag on line",
	814	sam_headers="SAM headers on line",
	815	sam_min_columns="Need 11 columns on line",
	816	two_mate_names="Mate name already seen, line",
	817	wrong_seq_len="Size differs from length of seq #" )
	818	print "Skipped %d invalid lines: "
	819	msg = ""
	820	for k, v in skipped_lines.items():
	821	if v[0]:
	822	# v[0] is the number of times the error occurred
	823	# v[1] is the position of the line or sequence in the file
	824	# v[2] is the name of the sequence or the text of the line
	825	msg += "(%d)%s %d:%s. " % ( v[0], msgs[k], v[1], v[2] )
	826	print msg
	827
	828	if __name__=="__main__": __main__()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/sr_mapping/lastz_paired_reads_wrapper.py @ 2

異なるフォーマットでダウンロード: