Context Navigation

short_reads_trim_seq.py @ 2

リビジョン 2, 9.1 KB (コミッタ: hatakeyama, 15 年前)
import galaxy-central

Rev	行番号
[2]	1	#!/usr/bin/env python
	2	"""
	3	trim reads based on the quality scores
	4	input: read file and quality score file
	5	output: trimmed read file
	6	"""
	7
	8	import os, sys, math, tempfile, re
	9
	10	assert sys.version_info[:2] >= ( 2, 4 )
	11
	12	def stop_err( msg ):
	13	sys.stderr.write( "%s\n" % msg )
	14	sys.exit()
	15
	16	def append_to_outfile( outfile_name, seq_title, segments ):
	17	segments = segments.split( ',' )
	18	if len( segments ) > 1:
	19	outfile = open( outfile_name, 'a' )
	20	for i in range( len( segments ) ):
	21	outfile.write( "%s_%d\n%s\n" % ( seq_title, i, segments[i] ) )
	22	outfile.close()
	23	elif segments[0]:
	24	outfile = open( outfile_name, 'a' )
	25	outfile.write( "%s\n%s\n" % ( seq_title, segments[0] ) )
	26	outfile.close()
	27
	28	def trim_seq( seq, score, arg, trim_score, threshold ):
	29	seq_method = '454'
	30	trim_pos = 0
	31	# trim after a certain position
	32	if arg.isdigit():
	33	keep_homopolymers = False
	34	trim_pos = int( arg )
	35	if trim_pos > 0 and trim_pos < len( seq ):
	36	seq = seq[0:trim_pos]
	37	else:
	38	keep_homopolymers = arg=='yes'
	39
	40	new_trim_seq = ''
	41	max_segment = 0
	42
	43	for i in range( len( seq ) ):
	44	if i >= len( score ):
	45	score.append(-1)
	46	if int( score[i] ) >= trim_score:
	47	pass_nuc = seq[ i:( i + 1 ) ]
	48	else:
	49	if keep_homopolymers and ( (i == 0 ) or ( seq[ i:( i + 1 ) ].lower() == seq[ ( i - 1 ):i ].lower() ) ):
	50	pass_nuc = seq[ i:( i + 1 ) ]
	51	else:
	52	pass_nuc = ' '
	53	new_trim_seq = '%s%s' % ( new_trim_seq, pass_nuc )
	54	# find the max substrings
	55	segments = new_trim_seq.split()
	56	max_segment = ''
	57	len_max_segment = 0
	58	if threshold == 0:
	59	for seg in segments:
	60	if len_max_segment < len( seg ):
	61	max_segment = '%s,' % seg
	62	len_max_segment = len( seg )
	63	elif len_max_segment == len( seg ):
	64	max_segment = '%s%s,' % ( max_segment, seg )
	65	else:
	66	for seg in segments:
	67	if len( seg ) >= threshold:
	68	max_segment = '%s%s,' % ( max_segment, seg )
	69	return max_segment[ 0:-1 ]
	70
	71	def __main__():
	72
	73	try:
	74	threshold_trim = int( sys.argv[1].strip() )
	75	except:
	76	stop_err( "Minimal quality score must be numeric." )
	77	try:
	78	threshold_report = int( sys.argv[2].strip() )
	79	except:
	80	stop_err( "Minimal length of trimmed reads must be numeric." )
	81	outfile_seq_name = sys.argv[3].strip()
	82	infile_seq_name = sys.argv[4].strip()
	83	infile_score_name = sys.argv[5].strip()
	84	arg = sys.argv[6].strip()
	85
	86	seq_infile_name = infile_seq_name
	87	score_infile_name = infile_score_name
	88
	89
	90	# Determine quailty score format: tabular or fasta format within the first 100 lines
	91	seq_method = None
	92	data_type = None
	93	for i, line in enumerate( file( score_infile_name ) ):
	94	line = line.rstrip( '\r\n' )
	95	if not line or line.startswith( '#' ):
	96	continue
	97	if data_type == None:
	98	if line.startswith( '>' ):
	99	data_type = 'fasta'
	100	continue
	101	elif len( line.split( '\t' ) ) > 0:
	102	fields = line.split()
	103	for score in fields:
	104	try:
	105	int( score )
	106	data_type = 'tabular'
	107	seq_method = 'solexa'
	108	break
	109	except:
	110	break
	111	elif data_type == 'fasta':
	112	fields = line.split()
	113	for score in fields:
	114	try:
	115	int( score )
	116	seq_method = '454'
	117	break
	118	except:
	119	break
	120	if i == 100:
	121	break
	122
	123	if data_type is None:
	124	stop_err( 'This tool can only use fasta data or tabular data.' )
	125	if seq_method is None:
	126	stop_err( 'Invalid data for fasta format.')
	127
	128	if os.path.exists( seq_infile_name ) and os.path.exists( score_infile_name ):
	129	seq = None
	130	score = None
	131	score_found = False
	132
	133	score_file = open( score_infile_name, 'r' )
	134
	135	for i, line in enumerate( open( seq_infile_name ) ):
	136	line = line.rstrip( '\r\n' )
	137	if not line or line.startswith( '#' ):
	138	continue
	139	if line.startswith( '>' ):
	140	if seq:
	141	scores = []
	142	if data_type == 'fasta':
	143	score = None
	144	score_found = False
	145	score_line = 'start'
	146	while not score_found and score_line:
	147	score_line = score_file.readline().rstrip( '\r\n' )
	148	if not score_line or score_line.startswith( '#' ):
	149	continue
	150	if score_line.startswith( '>' ):
	151	if score:
	152	scores = score.split()
	153	score_found = True
	154	score = None
	155	else:
	156	for val in score_line.split():
	157	try:
	158	int( val )
	159	except:
	160	score_file.close()
	161	stop_err( "Non-numerical value '%s' in score file." % val )
	162	if not score:
	163	score = score_line
	164	else:
	165	score = '%s %s' % ( score, score_line )
	166	elif data_type == 'tabular':
	167	score = score_file.readline().rstrip('\r\n')
	168	loc = score.split( '\t' )
	169	for base in loc:
	170	nuc_error = base.split()
	171	try:
	172	nuc_error[0] = int( nuc_error[0] )
	173	nuc_error[1] = int( nuc_error[1] )
	174	nuc_error[2] = int( nuc_error[2] )
	175	nuc_error[3] = int( nuc_error[3] )
	176	big = max( nuc_error )
	177	except:
	178	score_file.close()
	179	stop_err( "Invalid characters in line %d: '%s'" % ( i, line ) )
	180	scores.append( big )
	181	if scores:
	182	new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report )
	183	append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments )
	184
	185	seq_title = line
	186	seq = None
	187	else:
	188	if not seq:
	189	seq = line
	190	else:
	191	seq = "%s%s" % ( seq, line )
	192	if seq:
	193	scores = []
	194	if data_type == 'fasta':
	195	score = None
	196	while score_line:
	197	score_line = score_file.readline().rstrip( '\r\n' )
	198	if not score_line or score_line.startswith( '#' ) or score_line.startswith( '>' ):
	199	continue
	200	for val in score_line.split():
	201	try:
	202	int( val )
	203	except:
	204	score_file.close()
	205	stop_err( "Non-numerical value '%s' in score file." % val )
	206	if not score:
	207	score = score_line
	208	else:
	209	score = "%s %s" % ( score, score_line )
	210	if score:
	211	scores = score.split()
	212	elif data_type == 'tabular':
	213	score = score_file.readline().rstrip('\r\n')
	214	loc = score.split( '\t' )
	215	for base in loc:
	216	nuc_error = base.split()
	217	try:
	218	nuc_error[0] = int( nuc_error[0] )
	219	nuc_error[1] = int( nuc_error[1] )
	220	nuc_error[2] = int( nuc_error[2] )
	221	nuc_error[3] = int( nuc_error[3] )
	222	big = max( nuc_error )
	223	except:
	224	score_file.close()
	225	stop_err( "Invalid characters in line %d: '%s'" % ( i, line ) )
	226	scores.append( big )
	227	if scores:
	228	new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report )
	229	append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments )
	230	score_file.close()
	231	else:
	232	stop_err( "Cannot locate sequence file '%s'or score file '%s'." % ( seq_infile_name, score_infile_name ) )
	233
	234	if __name__ == "__main__": __main__()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/metag_tools/short_reads_trim_seq.py @ 2

異なるフォーマットでダウンロード: