Context Navigation

short_reads_figure_score.py @ 2

リビジョン 2, 8.4 KB (コミッタ: hatakeyama, 15 年前)
import galaxy-central

行番号
1	#!/usr/bin/env python
2	"""
3	boxplot:
4	- box: first quartile and third quartile
5	- line inside the box: median
6	- outlier: 1.5 IQR higher than the third quartile or 1.5 IQR lower than the first quartile
7	IQR = third quartile - first quartile
8	- The smallest/largest value that is not an outlier is connected to the box by with a horizontal line.
9	"""
10
11	import os, sys, math, tempfile, re
12	from rpy import *
13
14	assert sys.version_info[:2] >= ( 2, 4 )
15
16	def stop_err( msg ):
17	sys.stderr.write( "%s\n" % msg )
18	sys.exit()
19
20	def merge_to_20_datapoints( score ):
21	number_of_points = 20
22	read_length = len( score )
23	step = int( math.floor( ( read_length - 1 ) * 1.0 / number_of_points ) )
24	scores = []
25	point = 1
26	point_sum = 0
27	step_average = 0
28	score_points = 0
29
30	for i in xrange( 1, read_length ):
31	if i < ( point * step ):
32	point_sum += int( score[i] )
33	step_average += 1
34	else:
35	point_avg = point_sum * 1.0 / step_average
36	scores.append( point_avg )
37	point += 1
38	point_sum = 0
39	step_average = 0
40	if step_average > 0:
41	point_avg = point_sum * 1.0 / step_average
42	scores.append( point_avg )
43	if len( scores ) > number_of_points:
44	last_avg = 0
45	for j in xrange( number_of_points - 1, len( scores ) ):
46	last_avg += scores[j]
47	last_avg = last_avg / ( len(scores) - number_of_points + 1 )
48	else:
49	last_avg = scores[-1]
50	score_points = []
51	for k in range( number_of_points - 1 ):
52	score_points.append( scores[k] )
53	score_points.append( last_avg )
54	return score_points
55
56	def __main__():
57
58	invalid_lines = 0
59
60	infile_score_name = sys.argv[1].strip()
61	outfile_R_name = sys.argv[2].strip()
62
63	infile_name = infile_score_name
64
65	# Determine tabular or fasta format within the first 100 lines
66	seq_method = None
67	data_type = None
68	for i, line in enumerate( file( infile_name ) ):
69	line = line.rstrip( '\r\n' )
70	if not line or line.startswith( '#' ):
71	continue
72	if data_type == None:
73	if line.startswith( '>' ):
74	data_type = 'fasta'
75	continue
76	elif len( line.split( '\t' ) ) > 0:
77	fields = line.split()
78	for score in fields:
79	try:
80	int( score )
81	data_type = 'tabular'
82	seq_method = 'solexa'
83	break
84	except:
85	break
86	elif data_type == 'fasta':
87	fields = line.split()
88	for score in fields:
89	try:
90	int( score )
91	seq_method = '454'
92	break
93	except:
94	break
95	if i == 100:
96	break
97
98	if data_type is None:
99	stop_err( 'This tool can only use fasta data or tabular data.' )
100	if seq_method is None:
101	stop_err( 'Invalid data for fasta format.')
102
103	# Determine fixed length or variable length within the first 100 lines
104	read_length = 0
105	variable_length = False
106	if seq_method == 'solexa':
107	for i, line in enumerate( file( infile_name ) ):
108	line = line.rstrip( '\r\n' )
109	if not line or line.startswith( '#' ):
110	continue
111	scores = line.split('\t')
112	if read_length == 0:
113	read_length = len( scores )
114	if read_length != len( scores ):
115	variable_length = True
116	break
117	if i == 100:
118	break
119	elif seq_method == '454':
120	score = ''
121	for i, line in enumerate( file( infile_name ) ):
122	line = line.rstrip( '\r\n' )
123	if not line or line.startswith( '#' ):
124	continue
125	if line.startswith( '>' ):
126	if len( score ) > 0:
127	score = score.split()
128	if read_length == 0:
129	read_length = len( score )
130	if read_length != len( score ):
131	variable_length = True
132	break
133	score = ''
134	else:
135	score = score + ' ' + line
136	if i == 100:
137	break
138
139	if variable_length:
140	number_of_points = 20
141	else:
142	number_of_points = read_length
143	read_length_threshold = 100 # minimal read length for 454 file
144	score_points = []
145	score_matrix = []
146	invalid_scores = 0
147
148	if seq_method == 'solexa':
149	for i, line in enumerate( open( infile_name ) ):
150	line = line.rstrip( '\r\n' )
151	if not line or line.startswith( '#' ):
152	continue
153	tmp_array = []
154	scores = line.split( '\t' )
155	for bases in scores:
156	nuc_errors = bases.split()
157	try:
158	nuc_errors[0] = int( nuc_errors[0] )
159	nuc_errors[1] = int( nuc_errors[1] )
160	nuc_errors[2] = int( nuc_errors[2] )
161	nuc_errors[3] = int( nuc_errors[3] )
162	big = max( nuc_errors )
163	except:
164	#print 'Invalid numbers in the file. Skipped.'
165	invalid_scores += 1
166	big = 0
167	tmp_array.append( big )
168	score_points.append( tmp_array )
169	elif seq_method == '454':
170	# skip the last fasta sequence
171	score = ''
172	for i, line in enumerate( open( infile_name ) ):
173	line = line.rstrip( '\r\n' )
174	if not line or line.startswith( '#' ):
175	continue
176	if line.startswith( '>' ):
177	if len( score ) > 0:
178	score = ['0'] + score.split()
179	read_length = len( score )
180	tmp_array = []
181	if not variable_length:
182	score.pop(0)
183	score_points.append( score )
184	tmp_array = score
185	elif read_length > read_length_threshold:
186	score_points_tmp = merge_to_20_datapoints( score )
187	score_points.append( score_points_tmp )
188	tmp_array = score_points_tmp
189	score = ''
190	else:
191	score = "%s %s" % ( score, line )
192	if len( score ) > 0:
193	score = ['0'] + score.split()
194	read_length = len( score )
195	if not variable_length:
196	score.pop(0)
197	score_points.append( score )
198	elif read_length > read_length_threshold:
199	score_points_tmp = merge_to_20_datapoints( score )
200	score_points.append( score_points_tmp )
201	tmp_array = score_points_tmp
202
203	# reverse the matrix, for R
204	for i in range( number_of_points - 1 ):
205	tmp_array = []
206	for j in range( len( score_points ) ):
207	try:
208	tmp_array.append( int( score_points[j][i] ) )
209	except:
210	invalid_lines += 1
211	score_matrix.append( tmp_array )
212
213	# generate pdf figures
214	#outfile_R_pdf = outfile_R_name
215	#r.pdf( outfile_R_pdf )
216	outfile_R_png = outfile_R_name
217	r.bitmap( outfile_R_png )
218
219	title = "boxplot of quality scores"
220	empty_score_matrix_columns = 0
221	for i, subset in enumerate( score_matrix ):
222	if not subset:
223	empty_score_matrix_columns += 1
224	score_matrix[i] = [0]
225
226	if not variable_length:
227	r.boxplot( score_matrix, xlab="location in read length", main=title )
228	else:
229	r.boxplot( score_matrix, xlab="position within read (% of total length)", xaxt="n", main=title )
230	x_old_range = []
231	x_new_range = []
232	step = read_length_threshold / number_of_points
233	for i in xrange( 0, read_length_threshold, step ):
234	x_old_range.append( ( i / step ) )
235	x_new_range.append( i )
236	r.axis( 1, x_old_range, x_new_range )
237	r.dev_off()
238
239	if invalid_scores > 0:
240	print 'Skipped %d invalid scores. ' % invalid_scores
241	if invalid_lines > 0:
242	print 'Skipped %d invalid lines. ' % invalid_lines
243	if empty_score_matrix_columns > 0:
244	print '%d missing scores in score_matrix. ' % empty_score_matrix_columns
245
246	r.quit(save = "no")
247
248	if __name__=="__main__":__main__()

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/metag_tools/short_reads_figure_score.py @ 2

異なるフォーマットでダウンロード: