Context Navigation

short_reads_trim_seq.py @ 2

リビジョン 2, 9.1 KB (コミッタ: hatakeyama, 15 年前)
import galaxy-central

行番号
1	#!/usr/bin/env python
2	"""
3	trim reads based on the quality scores
4	input: read file and quality score file
5	output: trimmed read file
6	"""
7
8	import os, sys, math, tempfile, re
9
# Refuse to run on interpreters older than Python 2.4.
assert sys.version_info[:2] >= ( 2, 4 )
11
def stop_err( msg ):
    """Report msg on standard error and terminate the script.

    The message is written to sys.stderr followed by a newline, then
    sys.exit() is called without an argument, which raises SystemExit.
    """
    report = "%s\n" % msg
    sys.stderr.write( report )
    sys.exit()
15
def append_to_outfile( outfile_name, seq_title, segments ):
    """Append the trimmed segments of one read to the output file.

    outfile_name -- path of the file to append to
    seq_title    -- title line of the read
    segments     -- comma-separated string of trimmed segments, as returned
                    by trim_seq(); may be empty

    When there are several segments, each is written under its own title
    "<seq_title>_<index>" (index starting at 0).  A single non-empty segment
    is written under the unchanged title.  An empty segments string writes
    nothing and does not open the output file.
    """
    pieces = segments.split( ',' )
    if len( pieces ) > 1:
        records = [ "%s_%d\n%s\n" % ( seq_title, index, piece )
                    for index, piece in enumerate( pieces ) ]
    elif pieces[0]:
        records = [ "%s\n%s\n" % ( seq_title, pieces[0] ) ]
    else:
        # Nothing survived trimming: leave the output file untouched.
        return

    outfile = open( outfile_name, 'a' )
    # try/finally (rather than "with") keeps this valid on the Python 2.4
    # interpreters the script declares support for, while still closing the
    # handle when a write fails.
    try:
        outfile.writelines( records )
    finally:
        outfile.close()
27
def trim_seq( seq, score, arg, trim_score, threshold ):
    """Trim a read by per-base quality and return the surviving segments.

    seq        -- the read, as a string of bases
    score      -- sequence of per-base scores (ints or numeric strings);
                  bases without a score are treated as having score -1.
                  The sequence is not modified.
    arg        -- either a string of digits N, meaning "cut the read to its
                  first N bases before quality trimming" (N == 0 or
                  N >= len(seq) leaves the read whole), or 'yes' / anything
                  else to switch homopolymer keeping on / off
    trim_score -- minimal score a base needs to be kept
    threshold  -- 0 to report only the longest segment(s), ties included;
                  otherwise the minimal length a segment needs to be reported

    Returns the reported segments joined by ',' ('' when there are none).
    """
    if arg.isdigit():
        keep_homopolymers = False
        trim_pos = int( arg )
        if 0 < trim_pos < len( seq ):
            seq = seq[:trim_pos]
    else:
        keep_homopolymers = arg == 'yes'

    # Replace every rejected base with a blank so that str.split() below
    # yields the runs of accepted bases.
    masked = []
    num_scores = len( score )
    for i in range( len( seq ) ):
        nuc = seq[i]
        if i < num_scores:
            base_score = int( score[i] )
        else:
            # Missing score: same value the original padding used, without
            # appending to the caller's list.
            base_score = -1
        if base_score >= trim_score:
            masked.append( nuc )
        elif keep_homopolymers and ( i == 0 or nuc.lower() == seq[i - 1].lower() ):
            # A low-scoring base is kept when it is the first base or repeats
            # the base before it.
            masked.append( nuc )
        else:
            masked.append( ' ' )
    segments = ''.join( masked ).split()

    if threshold == 0:
        longest = 0
        for seg in segments:
            if len( seg ) > longest:
                longest = len( seg )
        kept = [ seg for seg in segments if len( seg ) == longest ]
    else:
        kept = [ seg for seg in segments if len( seg ) >= threshold ]
    return ','.join( kept )
70
def _is_int( text ):
    """Return True when int() accepts the string text."""
    try:
        int( text )
    except ValueError:
        return False
    return True

def _detect_score_format( score_infile_name ):
    """Work out the layout of the score file; return ( data_type, seq_method ).

    data_type is 'fasta' once a line starting with '>' is seen before any
    data line, 'tabular' when the first field of a data line is an integer,
    and None otherwise.  seq_method is 'solexa' for tabular data, '454' once
    a fasta file shows an integer as first field of a data line, else None.
    Blank lines and lines starting with '#' are ignored.  Scanning stops at
    the first conclusive line, or after a data line at index 100.
    """
    data_type = None
    seq_method = None
    # open() instead of the Python-2-only file() builtin; closed explicitly.
    score_file = open( score_infile_name, 'r' )
    try:
        for i, line in enumerate( score_file ):
            line = line.rstrip( '\r\n' )
            if not line or line.startswith( '#' ):
                continue
            if data_type is None and line.startswith( '>' ):
                data_type = 'fasta'
                continue
            fields = line.split()
            if fields and _is_int( fields[0] ):
                if data_type is None:
                    data_type = 'tabular'
                    seq_method = 'solexa'
                else:
                    seq_method = '454'
                # Nothing later in the file can change the verdict.
                break
            if i == 100:
                break
    finally:
        score_file.close()
    return data_type, seq_method

def _read_fasta_scores( score_file, last_record ):
    """Read the scores of the next read from a fasta-style score file.

    Blank lines and '#' lines are skipped.  End of file is recognised from
    the raw readline() result, so a blank line no longer ends the read early.

    last_record False: collect score lines until a '>' line follows them and
    return the collected values; return [] when the file ends first.
    last_record True: collect every remaining score line up to end of file,
    ignoring '>' lines.

    Calls stop_err() on a non-integer score value.
    """
    values = []
    have_data = False
    while True:
        raw_line = score_file.readline()
        if not raw_line:
            # Real end of file (a blank line would still hold its newline).
            if last_record:
                return values
            return []
        score_line = raw_line.rstrip( '\r\n' )
        if not score_line or score_line.startswith( '#' ):
            continue
        if score_line.startswith( '>' ):
            if have_data and not last_record:
                return values
            continue
        fields = score_line.split()
        for val in fields:
            if not _is_int( val ):
                stop_err( "Non-numerical value '%s' in score file." % val )
        values.extend( fields )
        have_data = True

def _read_tabular_scores( score_file, seq_line_index, seq_line ):
    """Read one line of a tabular score file and return one score per base.

    The line is split on tabs into bases; each base must hold at least four
    whitespace-separated integers, and the largest value of the base is used
    as its score.  seq_line_index and seq_line (position in the sequence
    file) are only used in the error message passed to stop_err().
    """
    scores = []
    score = score_file.readline().rstrip( '\r\n' )
    for base in score.split( '\t' ):
        nuc_error = base.split()
        try:
            for k in range( 4 ):
                nuc_error[k] = int( nuc_error[k] )
            big = max( nuc_error )
        except ( IndexError, ValueError, TypeError ):
            stop_err( "Invalid characters in line %d: '%s'" % ( seq_line_index, seq_line ) )
        scores.append( big )
    return scores

def _read_scores( score_file, data_type, last_record, seq_line_index, seq_line ):
    """Fetch the scores of the next read in the layout given by data_type."""
    if data_type == 'fasta':
        return _read_fasta_scores( score_file, last_record )
    return _read_tabular_scores( score_file, seq_line_index, seq_line )

def __main__():
    """Trim every read of a sequence file using its quality scores.

    Command line arguments:
      1 -- minimal quality score for a base to be kept (integer)
      2 -- minimal length of reported segments; 0 reports the longest only
      3 -- output file, appended to
      4 -- sequence file: '>' title lines followed by sequence lines
      5 -- score file, fasta-style or tabular
      6 -- number of leading bases to keep, or 'yes'/'no' for homopolymers

    Reads and scores are paired by order of appearance.  Any problem is
    reported through stop_err().
    """
    try:
        threshold_trim = int( sys.argv[1].strip() )
    except ( IndexError, ValueError ):
        stop_err( "Minimal quality score must be numeric." )
    try:
        threshold_report = int( sys.argv[2].strip() )
    except ( IndexError, ValueError ):
        stop_err( "Minimal length of trimmed reads must be numeric." )
    outfile_seq_name = sys.argv[3].strip()
    seq_infile_name = sys.argv[4].strip()
    score_infile_name = sys.argv[5].strip()
    arg = sys.argv[6].strip()

    # Check for the inputs before anything opens them, so that a missing
    # file yields this message rather than an IOError traceback.
    if not ( os.path.exists( seq_infile_name ) and os.path.exists( score_infile_name ) ):
        stop_err( "Cannot locate sequence file '%s'or score file '%s'." % ( seq_infile_name, score_infile_name ) )

    # Determine quality score format: tabular or fasta
    data_type, seq_method = _detect_score_format( score_infile_name )
    if data_type is None:
        stop_err( 'This tool can only use fasta data or tabular data.' )
    if seq_method is None:
        stop_err( 'Invalid data for fasta format.')

    seq_title = None
    seq_parts = []
    score_file = open( score_infile_name, 'r' )
    seq_file = open( seq_infile_name, 'r' )
    try:
        for i, line in enumerate( seq_file ):
            line = line.rstrip( '\r\n' )
            if not line or line.startswith( '#' ):
                continue
            if line.startswith( '>' ):
                # A new title closes the read collected so far.
                if seq_parts:
                    scores = _read_scores( score_file, data_type, False, i, line )
                    if scores:
                        new_trim_seq_segments = trim_seq( ''.join( seq_parts ), scores, arg, threshold_trim, threshold_report )
                        append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments )
                seq_title = line
                seq_parts = []
            elif seq_title is None:
                # Previously this case ended in a NameError on seq_title.
                stop_err( "Sequence data found before the first title line (starting with '>') in '%s'." % seq_infile_name )
            else:
                seq_parts.append( line )
        # The last read has no following title line to trigger its output.
        if seq_parts:
            scores = _read_scores( score_file, data_type, True, i, line )
            if scores:
                new_trim_seq_segments = trim_seq( ''.join( seq_parts ), scores, arg, threshold_trim, threshold_report )
                append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments )
    finally:
        # Runs on the SystemExit raised by stop_err() as well.
        seq_file.close()
        score_file.close()
233
# Run the tool only when this file is executed as a script, not when it is imported.
if __name__ == "__main__": __main__()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/metag_tools/short_reads_trim_seq.py @ 2

異なるフォーマットでダウンロード: