1 | #!/usr/bin/env python |
---|
2 | """ |
---|
3 | boxplot: |
---|
4 | - box: first quartile and third quartile |
---|
5 | - line inside the box: median |
---|
6 | - outlier: 1.5 IQR higher than the third quartile or 1.5 IQR lower than the first quartile |
---|
7 | IQR = third quartile - first quartile |
---|
8 | - The smallest/largest value that is not an outlier is connected to the box by with a horizontal line. |
---|
9 | """ |
---|
10 | |
---|
11 | import os, sys, math, tempfile, re |
---|
12 | from rpy import * |
---|
13 | |
---|
14 | assert sys.version_info[:2] >= ( 2, 4 ) |
---|
15 | |
---|
16 | def stop_err( msg ): |
---|
17 | sys.stderr.write( "%s\n" % msg ) |
---|
18 | sys.exit() |
---|
19 | |
---|
20 | def merge_to_20_datapoints( score ): |
---|
21 | number_of_points = 20 |
---|
22 | read_length = len( score ) |
---|
23 | step = int( math.floor( ( read_length - 1 ) * 1.0 / number_of_points ) ) |
---|
24 | scores = [] |
---|
25 | point = 1 |
---|
26 | point_sum = 0 |
---|
27 | step_average = 0 |
---|
28 | score_points = 0 |
---|
29 | |
---|
30 | for i in xrange( 1, read_length ): |
---|
31 | if i < ( point * step ): |
---|
32 | point_sum += int( score[i] ) |
---|
33 | step_average += 1 |
---|
34 | else: |
---|
35 | point_avg = point_sum * 1.0 / step_average |
---|
36 | scores.append( point_avg ) |
---|
37 | point += 1 |
---|
38 | point_sum = 0 |
---|
39 | step_average = 0 |
---|
40 | if step_average > 0: |
---|
41 | point_avg = point_sum * 1.0 / step_average |
---|
42 | scores.append( point_avg ) |
---|
43 | if len( scores ) > number_of_points: |
---|
44 | last_avg = 0 |
---|
45 | for j in xrange( number_of_points - 1, len( scores ) ): |
---|
46 | last_avg += scores[j] |
---|
47 | last_avg = last_avg / ( len(scores) - number_of_points + 1 ) |
---|
48 | else: |
---|
49 | last_avg = scores[-1] |
---|
50 | score_points = [] |
---|
51 | for k in range( number_of_points - 1 ): |
---|
52 | score_points.append( scores[k] ) |
---|
53 | score_points.append( last_avg ) |
---|
54 | return score_points |
---|
55 | |
---|
56 | def __main__(): |
---|
57 | |
---|
58 | invalid_lines = 0 |
---|
59 | |
---|
60 | infile_score_name = sys.argv[1].strip() |
---|
61 | outfile_R_name = sys.argv[2].strip() |
---|
62 | |
---|
63 | infile_name = infile_score_name |
---|
64 | |
---|
65 | # Determine tabular or fasta format within the first 100 lines |
---|
66 | seq_method = None |
---|
67 | data_type = None |
---|
68 | for i, line in enumerate( file( infile_name ) ): |
---|
69 | line = line.rstrip( '\r\n' ) |
---|
70 | if not line or line.startswith( '#' ): |
---|
71 | continue |
---|
72 | if data_type == None: |
---|
73 | if line.startswith( '>' ): |
---|
74 | data_type = 'fasta' |
---|
75 | continue |
---|
76 | elif len( line.split( '\t' ) ) > 0: |
---|
77 | fields = line.split() |
---|
78 | for score in fields: |
---|
79 | try: |
---|
80 | int( score ) |
---|
81 | data_type = 'tabular' |
---|
82 | seq_method = 'solexa' |
---|
83 | break |
---|
84 | except: |
---|
85 | break |
---|
86 | elif data_type == 'fasta': |
---|
87 | fields = line.split() |
---|
88 | for score in fields: |
---|
89 | try: |
---|
90 | int( score ) |
---|
91 | seq_method = '454' |
---|
92 | break |
---|
93 | except: |
---|
94 | break |
---|
95 | if i == 100: |
---|
96 | break |
---|
97 | |
---|
98 | if data_type is None: |
---|
99 | stop_err( 'This tool can only use fasta data or tabular data.' ) |
---|
100 | if seq_method is None: |
---|
101 | stop_err( 'Invalid data for fasta format.') |
---|
102 | |
---|
103 | # Determine fixed length or variable length within the first 100 lines |
---|
104 | read_length = 0 |
---|
105 | variable_length = False |
---|
106 | if seq_method == 'solexa': |
---|
107 | for i, line in enumerate( file( infile_name ) ): |
---|
108 | line = line.rstrip( '\r\n' ) |
---|
109 | if not line or line.startswith( '#' ): |
---|
110 | continue |
---|
111 | scores = line.split('\t') |
---|
112 | if read_length == 0: |
---|
113 | read_length = len( scores ) |
---|
114 | if read_length != len( scores ): |
---|
115 | variable_length = True |
---|
116 | break |
---|
117 | if i == 100: |
---|
118 | break |
---|
119 | elif seq_method == '454': |
---|
120 | score = '' |
---|
121 | for i, line in enumerate( file( infile_name ) ): |
---|
122 | line = line.rstrip( '\r\n' ) |
---|
123 | if not line or line.startswith( '#' ): |
---|
124 | continue |
---|
125 | if line.startswith( '>' ): |
---|
126 | if len( score ) > 0: |
---|
127 | score = score.split() |
---|
128 | if read_length == 0: |
---|
129 | read_length = len( score ) |
---|
130 | if read_length != len( score ): |
---|
131 | variable_length = True |
---|
132 | break |
---|
133 | score = '' |
---|
134 | else: |
---|
135 | score = score + ' ' + line |
---|
136 | if i == 100: |
---|
137 | break |
---|
138 | |
---|
139 | if variable_length: |
---|
140 | number_of_points = 20 |
---|
141 | else: |
---|
142 | number_of_points = read_length |
---|
143 | read_length_threshold = 100 # minimal read length for 454 file |
---|
144 | score_points = [] |
---|
145 | score_matrix = [] |
---|
146 | invalid_scores = 0 |
---|
147 | |
---|
148 | if seq_method == 'solexa': |
---|
149 | for i, line in enumerate( open( infile_name ) ): |
---|
150 | line = line.rstrip( '\r\n' ) |
---|
151 | if not line or line.startswith( '#' ): |
---|
152 | continue |
---|
153 | tmp_array = [] |
---|
154 | scores = line.split( '\t' ) |
---|
155 | for bases in scores: |
---|
156 | nuc_errors = bases.split() |
---|
157 | try: |
---|
158 | nuc_errors[0] = int( nuc_errors[0] ) |
---|
159 | nuc_errors[1] = int( nuc_errors[1] ) |
---|
160 | nuc_errors[2] = int( nuc_errors[2] ) |
---|
161 | nuc_errors[3] = int( nuc_errors[3] ) |
---|
162 | big = max( nuc_errors ) |
---|
163 | except: |
---|
164 | #print 'Invalid numbers in the file. Skipped.' |
---|
165 | invalid_scores += 1 |
---|
166 | big = 0 |
---|
167 | tmp_array.append( big ) |
---|
168 | score_points.append( tmp_array ) |
---|
169 | elif seq_method == '454': |
---|
170 | # skip the last fasta sequence |
---|
171 | score = '' |
---|
172 | for i, line in enumerate( open( infile_name ) ): |
---|
173 | line = line.rstrip( '\r\n' ) |
---|
174 | if not line or line.startswith( '#' ): |
---|
175 | continue |
---|
176 | if line.startswith( '>' ): |
---|
177 | if len( score ) > 0: |
---|
178 | score = ['0'] + score.split() |
---|
179 | read_length = len( score ) |
---|
180 | tmp_array = [] |
---|
181 | if not variable_length: |
---|
182 | score.pop(0) |
---|
183 | score_points.append( score ) |
---|
184 | tmp_array = score |
---|
185 | elif read_length > read_length_threshold: |
---|
186 | score_points_tmp = merge_to_20_datapoints( score ) |
---|
187 | score_points.append( score_points_tmp ) |
---|
188 | tmp_array = score_points_tmp |
---|
189 | score = '' |
---|
190 | else: |
---|
191 | score = "%s %s" % ( score, line ) |
---|
192 | if len( score ) > 0: |
---|
193 | score = ['0'] + score.split() |
---|
194 | read_length = len( score ) |
---|
195 | if not variable_length: |
---|
196 | score.pop(0) |
---|
197 | score_points.append( score ) |
---|
198 | elif read_length > read_length_threshold: |
---|
199 | score_points_tmp = merge_to_20_datapoints( score ) |
---|
200 | score_points.append( score_points_tmp ) |
---|
201 | tmp_array = score_points_tmp |
---|
202 | |
---|
203 | # reverse the matrix, for R |
---|
204 | for i in range( number_of_points - 1 ): |
---|
205 | tmp_array = [] |
---|
206 | for j in range( len( score_points ) ): |
---|
207 | try: |
---|
208 | tmp_array.append( int( score_points[j][i] ) ) |
---|
209 | except: |
---|
210 | invalid_lines += 1 |
---|
211 | score_matrix.append( tmp_array ) |
---|
212 | |
---|
213 | # generate pdf figures |
---|
214 | #outfile_R_pdf = outfile_R_name |
---|
215 | #r.pdf( outfile_R_pdf ) |
---|
216 | outfile_R_png = outfile_R_name |
---|
217 | r.bitmap( outfile_R_png ) |
---|
218 | |
---|
219 | title = "boxplot of quality scores" |
---|
220 | empty_score_matrix_columns = 0 |
---|
221 | for i, subset in enumerate( score_matrix ): |
---|
222 | if not subset: |
---|
223 | empty_score_matrix_columns += 1 |
---|
224 | score_matrix[i] = [0] |
---|
225 | |
---|
226 | if not variable_length: |
---|
227 | r.boxplot( score_matrix, xlab="location in read length", main=title ) |
---|
228 | else: |
---|
229 | r.boxplot( score_matrix, xlab="position within read (% of total length)", xaxt="n", main=title ) |
---|
230 | x_old_range = [] |
---|
231 | x_new_range = [] |
---|
232 | step = read_length_threshold / number_of_points |
---|
233 | for i in xrange( 0, read_length_threshold, step ): |
---|
234 | x_old_range.append( ( i / step ) ) |
---|
235 | x_new_range.append( i ) |
---|
236 | r.axis( 1, x_old_range, x_new_range ) |
---|
237 | r.dev_off() |
---|
238 | |
---|
239 | if invalid_scores > 0: |
---|
240 | print 'Skipped %d invalid scores. ' % invalid_scores |
---|
241 | if invalid_lines > 0: |
---|
242 | print 'Skipped %d invalid lines. ' % invalid_lines |
---|
243 | if empty_score_matrix_columns > 0: |
---|
244 | print '%d missing scores in score_matrix. ' % empty_score_matrix_columns |
---|
245 | |
---|
246 | r.quit(save = "no") |
---|
247 | |
---|
248 | if __name__=="__main__":__main__() |
---|