[2] | 1 | #!/usr/bin/env python |
---|
| 2 | """ |
---|
| 3 | trim reads based on the quality scores |
---|
| 4 | input: read file and quality score file |
---|
| 5 | output: trimmed read file |
---|
| 6 | """ |
---|
| 7 | |
---|
| 8 | import os, sys, math, tempfile, re |
---|
| 9 | |
---|
| 10 | assert sys.version_info[:2] >= ( 2, 4 ) |
---|
| 11 | |
---|
| 12 | def stop_err( msg ): |
---|
| 13 | sys.stderr.write( "%s\n" % msg ) |
---|
| 14 | sys.exit() |
---|
| 15 | |
---|
| 16 | def append_to_outfile( outfile_name, seq_title, segments ): |
---|
| 17 | segments = segments.split( ',' ) |
---|
| 18 | if len( segments ) > 1: |
---|
| 19 | outfile = open( outfile_name, 'a' ) |
---|
| 20 | for i in range( len( segments ) ): |
---|
| 21 | outfile.write( "%s_%d\n%s\n" % ( seq_title, i, segments[i] ) ) |
---|
| 22 | outfile.close() |
---|
| 23 | elif segments[0]: |
---|
| 24 | outfile = open( outfile_name, 'a' ) |
---|
| 25 | outfile.write( "%s\n%s\n" % ( seq_title, segments[0] ) ) |
---|
| 26 | outfile.close() |
---|
| 27 | |
---|
| 28 | def trim_seq( seq, score, arg, trim_score, threshold ): |
---|
| 29 | seq_method = '454' |
---|
| 30 | trim_pos = 0 |
---|
| 31 | # trim after a certain position |
---|
| 32 | if arg.isdigit(): |
---|
| 33 | keep_homopolymers = False |
---|
| 34 | trim_pos = int( arg ) |
---|
| 35 | if trim_pos > 0 and trim_pos < len( seq ): |
---|
| 36 | seq = seq[0:trim_pos] |
---|
| 37 | else: |
---|
| 38 | keep_homopolymers = arg=='yes' |
---|
| 39 | |
---|
| 40 | new_trim_seq = '' |
---|
| 41 | max_segment = 0 |
---|
| 42 | |
---|
| 43 | for i in range( len( seq ) ): |
---|
| 44 | if i >= len( score ): |
---|
| 45 | score.append(-1) |
---|
| 46 | if int( score[i] ) >= trim_score: |
---|
| 47 | pass_nuc = seq[ i:( i + 1 ) ] |
---|
| 48 | else: |
---|
| 49 | if keep_homopolymers and ( (i == 0 ) or ( seq[ i:( i + 1 ) ].lower() == seq[ ( i - 1 ):i ].lower() ) ): |
---|
| 50 | pass_nuc = seq[ i:( i + 1 ) ] |
---|
| 51 | else: |
---|
| 52 | pass_nuc = ' ' |
---|
| 53 | new_trim_seq = '%s%s' % ( new_trim_seq, pass_nuc ) |
---|
| 54 | # find the max substrings |
---|
| 55 | segments = new_trim_seq.split() |
---|
| 56 | max_segment = '' |
---|
| 57 | len_max_segment = 0 |
---|
| 58 | if threshold == 0: |
---|
| 59 | for seg in segments: |
---|
| 60 | if len_max_segment < len( seg ): |
---|
| 61 | max_segment = '%s,' % seg |
---|
| 62 | len_max_segment = len( seg ) |
---|
| 63 | elif len_max_segment == len( seg ): |
---|
| 64 | max_segment = '%s%s,' % ( max_segment, seg ) |
---|
| 65 | else: |
---|
| 66 | for seg in segments: |
---|
| 67 | if len( seg ) >= threshold: |
---|
| 68 | max_segment = '%s%s,' % ( max_segment, seg ) |
---|
| 69 | return max_segment[ 0:-1 ] |
---|
| 70 | |
---|
| 71 | def __main__(): |
---|
| 72 | |
---|
| 73 | try: |
---|
| 74 | threshold_trim = int( sys.argv[1].strip() ) |
---|
| 75 | except: |
---|
| 76 | stop_err( "Minimal quality score must be numeric." ) |
---|
| 77 | try: |
---|
| 78 | threshold_report = int( sys.argv[2].strip() ) |
---|
| 79 | except: |
---|
| 80 | stop_err( "Minimal length of trimmed reads must be numeric." ) |
---|
| 81 | outfile_seq_name = sys.argv[3].strip() |
---|
| 82 | infile_seq_name = sys.argv[4].strip() |
---|
| 83 | infile_score_name = sys.argv[5].strip() |
---|
| 84 | arg = sys.argv[6].strip() |
---|
| 85 | |
---|
| 86 | seq_infile_name = infile_seq_name |
---|
| 87 | score_infile_name = infile_score_name |
---|
| 88 | |
---|
| 89 | |
---|
| 90 | # Determine quailty score format: tabular or fasta format within the first 100 lines |
---|
| 91 | seq_method = None |
---|
| 92 | data_type = None |
---|
| 93 | for i, line in enumerate( file( score_infile_name ) ): |
---|
| 94 | line = line.rstrip( '\r\n' ) |
---|
| 95 | if not line or line.startswith( '#' ): |
---|
| 96 | continue |
---|
| 97 | if data_type == None: |
---|
| 98 | if line.startswith( '>' ): |
---|
| 99 | data_type = 'fasta' |
---|
| 100 | continue |
---|
| 101 | elif len( line.split( '\t' ) ) > 0: |
---|
| 102 | fields = line.split() |
---|
| 103 | for score in fields: |
---|
| 104 | try: |
---|
| 105 | int( score ) |
---|
| 106 | data_type = 'tabular' |
---|
| 107 | seq_method = 'solexa' |
---|
| 108 | break |
---|
| 109 | except: |
---|
| 110 | break |
---|
| 111 | elif data_type == 'fasta': |
---|
| 112 | fields = line.split() |
---|
| 113 | for score in fields: |
---|
| 114 | try: |
---|
| 115 | int( score ) |
---|
| 116 | seq_method = '454' |
---|
| 117 | break |
---|
| 118 | except: |
---|
| 119 | break |
---|
| 120 | if i == 100: |
---|
| 121 | break |
---|
| 122 | |
---|
| 123 | if data_type is None: |
---|
| 124 | stop_err( 'This tool can only use fasta data or tabular data.' ) |
---|
| 125 | if seq_method is None: |
---|
| 126 | stop_err( 'Invalid data for fasta format.') |
---|
| 127 | |
---|
| 128 | if os.path.exists( seq_infile_name ) and os.path.exists( score_infile_name ): |
---|
| 129 | seq = None |
---|
| 130 | score = None |
---|
| 131 | score_found = False |
---|
| 132 | |
---|
| 133 | score_file = open( score_infile_name, 'r' ) |
---|
| 134 | |
---|
| 135 | for i, line in enumerate( open( seq_infile_name ) ): |
---|
| 136 | line = line.rstrip( '\r\n' ) |
---|
| 137 | if not line or line.startswith( '#' ): |
---|
| 138 | continue |
---|
| 139 | if line.startswith( '>' ): |
---|
| 140 | if seq: |
---|
| 141 | scores = [] |
---|
| 142 | if data_type == 'fasta': |
---|
| 143 | score = None |
---|
| 144 | score_found = False |
---|
| 145 | score_line = 'start' |
---|
| 146 | while not score_found and score_line: |
---|
| 147 | score_line = score_file.readline().rstrip( '\r\n' ) |
---|
| 148 | if not score_line or score_line.startswith( '#' ): |
---|
| 149 | continue |
---|
| 150 | if score_line.startswith( '>' ): |
---|
| 151 | if score: |
---|
| 152 | scores = score.split() |
---|
| 153 | score_found = True |
---|
| 154 | score = None |
---|
| 155 | else: |
---|
| 156 | for val in score_line.split(): |
---|
| 157 | try: |
---|
| 158 | int( val ) |
---|
| 159 | except: |
---|
| 160 | score_file.close() |
---|
| 161 | stop_err( "Non-numerical value '%s' in score file." % val ) |
---|
| 162 | if not score: |
---|
| 163 | score = score_line |
---|
| 164 | else: |
---|
| 165 | score = '%s %s' % ( score, score_line ) |
---|
| 166 | elif data_type == 'tabular': |
---|
| 167 | score = score_file.readline().rstrip('\r\n') |
---|
| 168 | loc = score.split( '\t' ) |
---|
| 169 | for base in loc: |
---|
| 170 | nuc_error = base.split() |
---|
| 171 | try: |
---|
| 172 | nuc_error[0] = int( nuc_error[0] ) |
---|
| 173 | nuc_error[1] = int( nuc_error[1] ) |
---|
| 174 | nuc_error[2] = int( nuc_error[2] ) |
---|
| 175 | nuc_error[3] = int( nuc_error[3] ) |
---|
| 176 | big = max( nuc_error ) |
---|
| 177 | except: |
---|
| 178 | score_file.close() |
---|
| 179 | stop_err( "Invalid characters in line %d: '%s'" % ( i, line ) ) |
---|
| 180 | scores.append( big ) |
---|
| 181 | if scores: |
---|
| 182 | new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report ) |
---|
| 183 | append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments ) |
---|
| 184 | |
---|
| 185 | seq_title = line |
---|
| 186 | seq = None |
---|
| 187 | else: |
---|
| 188 | if not seq: |
---|
| 189 | seq = line |
---|
| 190 | else: |
---|
| 191 | seq = "%s%s" % ( seq, line ) |
---|
| 192 | if seq: |
---|
| 193 | scores = [] |
---|
| 194 | if data_type == 'fasta': |
---|
| 195 | score = None |
---|
| 196 | while score_line: |
---|
| 197 | score_line = score_file.readline().rstrip( '\r\n' ) |
---|
| 198 | if not score_line or score_line.startswith( '#' ) or score_line.startswith( '>' ): |
---|
| 199 | continue |
---|
| 200 | for val in score_line.split(): |
---|
| 201 | try: |
---|
| 202 | int( val ) |
---|
| 203 | except: |
---|
| 204 | score_file.close() |
---|
| 205 | stop_err( "Non-numerical value '%s' in score file." % val ) |
---|
| 206 | if not score: |
---|
| 207 | score = score_line |
---|
| 208 | else: |
---|
| 209 | score = "%s %s" % ( score, score_line ) |
---|
| 210 | if score: |
---|
| 211 | scores = score.split() |
---|
| 212 | elif data_type == 'tabular': |
---|
| 213 | score = score_file.readline().rstrip('\r\n') |
---|
| 214 | loc = score.split( '\t' ) |
---|
| 215 | for base in loc: |
---|
| 216 | nuc_error = base.split() |
---|
| 217 | try: |
---|
| 218 | nuc_error[0] = int( nuc_error[0] ) |
---|
| 219 | nuc_error[1] = int( nuc_error[1] ) |
---|
| 220 | nuc_error[2] = int( nuc_error[2] ) |
---|
| 221 | nuc_error[3] = int( nuc_error[3] ) |
---|
| 222 | big = max( nuc_error ) |
---|
| 223 | except: |
---|
| 224 | score_file.close() |
---|
| 225 | stop_err( "Invalid characters in line %d: '%s'" % ( i, line ) ) |
---|
| 226 | scores.append( big ) |
---|
| 227 | if scores: |
---|
| 228 | new_trim_seq_segments = trim_seq( seq, scores, arg, threshold_trim, threshold_report ) |
---|
| 229 | append_to_outfile( outfile_seq_name, seq_title, new_trim_seq_segments ) |
---|
| 230 | score_file.close() |
---|
| 231 | else: |
---|
| 232 | stop_err( "Cannot locate sequence file '%s'or score file '%s'." % ( seq_infile_name, score_infile_name ) ) |
---|
| 233 | |
---|
| 234 | if __name__ == "__main__": __main__() |
---|