[2] | 1 | #!/usr/bin/env python |
---|
| 2 | """ |
---|
| 3 | boxplot: |
---|
| 4 | - box: first quartile and third quartile |
---|
| 5 | - line inside the box: median |
---|
| 6 | - outlier: 1.5 IQR higher than the third quartile or 1.5 IQR lower than the first quartile |
---|
| 7 | IQR = third quartile - first quartile |
---|
| 8 | - The smallest/largest value that is not an outlier is connected to the box by with a horizontal line. |
---|
| 9 | """ |
---|
| 10 | |
---|
| 11 | import os, sys, math, tempfile, re |
---|
| 12 | from rpy import * |
---|
| 13 | |
---|
| 14 | assert sys.version_info[:2] >= ( 2, 4 ) |
---|
| 15 | |
---|
| 16 | def stop_err( msg ): |
---|
| 17 | sys.stderr.write( "%s\n" % msg ) |
---|
| 18 | sys.exit() |
---|
| 19 | |
---|
| 20 | def merge_to_20_datapoints( score ): |
---|
| 21 | number_of_points = 20 |
---|
| 22 | read_length = len( score ) |
---|
| 23 | step = int( math.floor( ( read_length - 1 ) * 1.0 / number_of_points ) ) |
---|
| 24 | scores = [] |
---|
| 25 | point = 1 |
---|
| 26 | point_sum = 0 |
---|
| 27 | step_average = 0 |
---|
| 28 | score_points = 0 |
---|
| 29 | |
---|
| 30 | for i in xrange( 1, read_length ): |
---|
| 31 | if i < ( point * step ): |
---|
| 32 | point_sum += int( score[i] ) |
---|
| 33 | step_average += 1 |
---|
| 34 | else: |
---|
| 35 | point_avg = point_sum * 1.0 / step_average |
---|
| 36 | scores.append( point_avg ) |
---|
| 37 | point += 1 |
---|
| 38 | point_sum = 0 |
---|
| 39 | step_average = 0 |
---|
| 40 | if step_average > 0: |
---|
| 41 | point_avg = point_sum * 1.0 / step_average |
---|
| 42 | scores.append( point_avg ) |
---|
| 43 | if len( scores ) > number_of_points: |
---|
| 44 | last_avg = 0 |
---|
| 45 | for j in xrange( number_of_points - 1, len( scores ) ): |
---|
| 46 | last_avg += scores[j] |
---|
| 47 | last_avg = last_avg / ( len(scores) - number_of_points + 1 ) |
---|
| 48 | else: |
---|
| 49 | last_avg = scores[-1] |
---|
| 50 | score_points = [] |
---|
| 51 | for k in range( number_of_points - 1 ): |
---|
| 52 | score_points.append( scores[k] ) |
---|
| 53 | score_points.append( last_avg ) |
---|
| 54 | return score_points |
---|
| 55 | |
---|
| 56 | def __main__(): |
---|
| 57 | |
---|
| 58 | invalid_lines = 0 |
---|
| 59 | |
---|
| 60 | infile_score_name = sys.argv[1].strip() |
---|
| 61 | outfile_R_name = sys.argv[2].strip() |
---|
| 62 | |
---|
| 63 | infile_name = infile_score_name |
---|
| 64 | |
---|
| 65 | # Determine tabular or fasta format within the first 100 lines |
---|
| 66 | seq_method = None |
---|
| 67 | data_type = None |
---|
| 68 | for i, line in enumerate( file( infile_name ) ): |
---|
| 69 | line = line.rstrip( '\r\n' ) |
---|
| 70 | if not line or line.startswith( '#' ): |
---|
| 71 | continue |
---|
| 72 | if data_type == None: |
---|
| 73 | if line.startswith( '>' ): |
---|
| 74 | data_type = 'fasta' |
---|
| 75 | continue |
---|
| 76 | elif len( line.split( '\t' ) ) > 0: |
---|
| 77 | fields = line.split() |
---|
| 78 | for score in fields: |
---|
| 79 | try: |
---|
| 80 | int( score ) |
---|
| 81 | data_type = 'tabular' |
---|
| 82 | seq_method = 'solexa' |
---|
| 83 | break |
---|
| 84 | except: |
---|
| 85 | break |
---|
| 86 | elif data_type == 'fasta': |
---|
| 87 | fields = line.split() |
---|
| 88 | for score in fields: |
---|
| 89 | try: |
---|
| 90 | int( score ) |
---|
| 91 | seq_method = '454' |
---|
| 92 | break |
---|
| 93 | except: |
---|
| 94 | break |
---|
| 95 | if i == 100: |
---|
| 96 | break |
---|
| 97 | |
---|
| 98 | if data_type is None: |
---|
| 99 | stop_err( 'This tool can only use fasta data or tabular data.' ) |
---|
| 100 | if seq_method is None: |
---|
| 101 | stop_err( 'Invalid data for fasta format.') |
---|
| 102 | |
---|
| 103 | # Determine fixed length or variable length within the first 100 lines |
---|
| 104 | read_length = 0 |
---|
| 105 | variable_length = False |
---|
| 106 | if seq_method == 'solexa': |
---|
| 107 | for i, line in enumerate( file( infile_name ) ): |
---|
| 108 | line = line.rstrip( '\r\n' ) |
---|
| 109 | if not line or line.startswith( '#' ): |
---|
| 110 | continue |
---|
| 111 | scores = line.split('\t') |
---|
| 112 | if read_length == 0: |
---|
| 113 | read_length = len( scores ) |
---|
| 114 | if read_length != len( scores ): |
---|
| 115 | variable_length = True |
---|
| 116 | break |
---|
| 117 | if i == 100: |
---|
| 118 | break |
---|
| 119 | elif seq_method == '454': |
---|
| 120 | score = '' |
---|
| 121 | for i, line in enumerate( file( infile_name ) ): |
---|
| 122 | line = line.rstrip( '\r\n' ) |
---|
| 123 | if not line or line.startswith( '#' ): |
---|
| 124 | continue |
---|
| 125 | if line.startswith( '>' ): |
---|
| 126 | if len( score ) > 0: |
---|
| 127 | score = score.split() |
---|
| 128 | if read_length == 0: |
---|
| 129 | read_length = len( score ) |
---|
| 130 | if read_length != len( score ): |
---|
| 131 | variable_length = True |
---|
| 132 | break |
---|
| 133 | score = '' |
---|
| 134 | else: |
---|
| 135 | score = score + ' ' + line |
---|
| 136 | if i == 100: |
---|
| 137 | break |
---|
| 138 | |
---|
| 139 | if variable_length: |
---|
| 140 | number_of_points = 20 |
---|
| 141 | else: |
---|
| 142 | number_of_points = read_length |
---|
| 143 | read_length_threshold = 100 # minimal read length for 454 file |
---|
| 144 | score_points = [] |
---|
| 145 | score_matrix = [] |
---|
| 146 | invalid_scores = 0 |
---|
| 147 | |
---|
| 148 | if seq_method == 'solexa': |
---|
| 149 | for i, line in enumerate( open( infile_name ) ): |
---|
| 150 | line = line.rstrip( '\r\n' ) |
---|
| 151 | if not line or line.startswith( '#' ): |
---|
| 152 | continue |
---|
| 153 | tmp_array = [] |
---|
| 154 | scores = line.split( '\t' ) |
---|
| 155 | for bases in scores: |
---|
| 156 | nuc_errors = bases.split() |
---|
| 157 | try: |
---|
| 158 | nuc_errors[0] = int( nuc_errors[0] ) |
---|
| 159 | nuc_errors[1] = int( nuc_errors[1] ) |
---|
| 160 | nuc_errors[2] = int( nuc_errors[2] ) |
---|
| 161 | nuc_errors[3] = int( nuc_errors[3] ) |
---|
| 162 | big = max( nuc_errors ) |
---|
| 163 | except: |
---|
| 164 | #print 'Invalid numbers in the file. Skipped.' |
---|
| 165 | invalid_scores += 1 |
---|
| 166 | big = 0 |
---|
| 167 | tmp_array.append( big ) |
---|
| 168 | score_points.append( tmp_array ) |
---|
| 169 | elif seq_method == '454': |
---|
| 170 | # skip the last fasta sequence |
---|
| 171 | score = '' |
---|
| 172 | for i, line in enumerate( open( infile_name ) ): |
---|
| 173 | line = line.rstrip( '\r\n' ) |
---|
| 174 | if not line or line.startswith( '#' ): |
---|
| 175 | continue |
---|
| 176 | if line.startswith( '>' ): |
---|
| 177 | if len( score ) > 0: |
---|
| 178 | score = ['0'] + score.split() |
---|
| 179 | read_length = len( score ) |
---|
| 180 | tmp_array = [] |
---|
| 181 | if not variable_length: |
---|
| 182 | score.pop(0) |
---|
| 183 | score_points.append( score ) |
---|
| 184 | tmp_array = score |
---|
| 185 | elif read_length > read_length_threshold: |
---|
| 186 | score_points_tmp = merge_to_20_datapoints( score ) |
---|
| 187 | score_points.append( score_points_tmp ) |
---|
| 188 | tmp_array = score_points_tmp |
---|
| 189 | score = '' |
---|
| 190 | else: |
---|
| 191 | score = "%s %s" % ( score, line ) |
---|
| 192 | if len( score ) > 0: |
---|
| 193 | score = ['0'] + score.split() |
---|
| 194 | read_length = len( score ) |
---|
| 195 | if not variable_length: |
---|
| 196 | score.pop(0) |
---|
| 197 | score_points.append( score ) |
---|
| 198 | elif read_length > read_length_threshold: |
---|
| 199 | score_points_tmp = merge_to_20_datapoints( score ) |
---|
| 200 | score_points.append( score_points_tmp ) |
---|
| 201 | tmp_array = score_points_tmp |
---|
| 202 | |
---|
| 203 | # reverse the matrix, for R |
---|
| 204 | for i in range( number_of_points - 1 ): |
---|
| 205 | tmp_array = [] |
---|
| 206 | for j in range( len( score_points ) ): |
---|
| 207 | try: |
---|
| 208 | tmp_array.append( int( score_points[j][i] ) ) |
---|
| 209 | except: |
---|
| 210 | invalid_lines += 1 |
---|
| 211 | score_matrix.append( tmp_array ) |
---|
| 212 | |
---|
| 213 | # generate pdf figures |
---|
| 214 | #outfile_R_pdf = outfile_R_name |
---|
| 215 | #r.pdf( outfile_R_pdf ) |
---|
| 216 | outfile_R_png = outfile_R_name |
---|
| 217 | r.bitmap( outfile_R_png ) |
---|
| 218 | |
---|
| 219 | title = "boxplot of quality scores" |
---|
| 220 | empty_score_matrix_columns = 0 |
---|
| 221 | for i, subset in enumerate( score_matrix ): |
---|
| 222 | if not subset: |
---|
| 223 | empty_score_matrix_columns += 1 |
---|
| 224 | score_matrix[i] = [0] |
---|
| 225 | |
---|
| 226 | if not variable_length: |
---|
| 227 | r.boxplot( score_matrix, xlab="location in read length", main=title ) |
---|
| 228 | else: |
---|
| 229 | r.boxplot( score_matrix, xlab="position within read (% of total length)", xaxt="n", main=title ) |
---|
| 230 | x_old_range = [] |
---|
| 231 | x_new_range = [] |
---|
| 232 | step = read_length_threshold / number_of_points |
---|
| 233 | for i in xrange( 0, read_length_threshold, step ): |
---|
| 234 | x_old_range.append( ( i / step ) ) |
---|
| 235 | x_new_range.append( i ) |
---|
| 236 | r.axis( 1, x_old_range, x_new_range ) |
---|
| 237 | r.dev_off() |
---|
| 238 | |
---|
| 239 | if invalid_scores > 0: |
---|
| 240 | print 'Skipped %d invalid scores. ' % invalid_scores |
---|
| 241 | if invalid_lines > 0: |
---|
| 242 | print 'Skipped %d invalid lines. ' % invalid_lines |
---|
| 243 | if empty_score_matrix_columns > 0: |
---|
| 244 | print '%d missing scores in score_matrix. ' % empty_score_matrix_columns |
---|
| 245 | |
---|
| 246 | r.quit(save = "no") |
---|
| 247 | |
---|
| 248 | if __name__=="__main__":__main__() |
---|