[2] | 1 | #Dan Blankenberg |
---|
| 2 | import math |
---|
| 3 | import string |
---|
| 4 | import transform |
---|
| 5 | from sequence import SequencingRead |
---|
| 6 | from fasta import fastaSequence |
---|
| 7 | |
---|
| 8 | class fastqSequencingRead( SequencingRead ): |
---|
| 9 | format = 'sanger' #sanger is default |
---|
| 10 | ascii_min = 33 |
---|
| 11 | ascii_max = 126 |
---|
| 12 | quality_min = 0 |
---|
| 13 | quality_max = 93 |
---|
| 14 | score_system = 'phred' #phred or solexa |
---|
| 15 | sequence_space = 'base' #base or color |
---|
| 16 | @classmethod |
---|
| 17 | def get_class_by_format( cls, format ): |
---|
| 18 | assert format in FASTQ_FORMATS, 'Unknown format type specified: %s' % format |
---|
| 19 | return FASTQ_FORMATS[ format ] |
---|
| 20 | @classmethod |
---|
| 21 | def convert_score_phred_to_solexa( cls, decimal_score_list ): |
---|
| 22 | def phred_to_solexa( score ): |
---|
| 23 | if score <= 0: #can't take log10( 1 - 1 ); make <= 0 into -5 |
---|
| 24 | return -5 |
---|
| 25 | return int( round( 10.0 * math.log10( math.pow( 10.0, ( float( score ) / 10.0 ) ) - 1.0 ) ) ) |
---|
| 26 | return map( phred_to_solexa, decimal_score_list ) |
---|
| 27 | @classmethod |
---|
| 28 | def convert_score_solexa_to_phred( cls, decimal_score_list ): |
---|
| 29 | def solexa_to_phred( score ): |
---|
| 30 | return int( round( 10.0 * math.log10( math.pow( 10.0, ( float( score ) / 10.0 ) ) + 1.0 ) ) ) |
---|
| 31 | return map( solexa_to_phred, decimal_score_list ) |
---|
| 32 | @classmethod |
---|
| 33 | def restrict_scores_to_valid_range( cls, decimal_score_list ): |
---|
| 34 | def restrict_score( score ): |
---|
| 35 | return max( min( score, cls.quality_max ), cls.quality_min ) |
---|
| 36 | return map( restrict_score, decimal_score_list ) |
---|
| 37 | @classmethod |
---|
| 38 | def convert_base_to_color_space( cls, sequence ): |
---|
| 39 | return cls.color_space_converter.to_color_space( sequence ) |
---|
| 40 | @classmethod |
---|
| 41 | def convert_color_to_base_space( cls, sequence ): |
---|
| 42 | return cls.color_space_converter.to_base_space( sequence ) |
---|
| 43 | def is_ascii_encoded( self ): |
---|
| 44 | #as per fastq definition only decimal quality strings can have spaces (and TABs for our purposes) in them (and must have a trailing space) |
---|
| 45 | if ' ' in self.quality: |
---|
| 46 | return False |
---|
| 47 | if '\t' in self.quality: |
---|
| 48 | return False |
---|
| 49 | return True |
---|
| 50 | def get_ascii_quality_scores( self ): |
---|
| 51 | if self.is_ascii_encoded(): |
---|
| 52 | return list( self.quality ) |
---|
| 53 | else: |
---|
| 54 | quality = self.quality.rstrip() #decimal scores should have a trailing space |
---|
| 55 | if quality: |
---|
| 56 | try: |
---|
| 57 | return [ chr( int( val ) + self.ascii_min - self.quality_min ) for val in quality.split() ] |
---|
| 58 | except ValueError, e: |
---|
| 59 | raise ValueError( 'Error Parsing quality String. ASCII quality strings cannot contain spaces (%s): %s' % ( self.quality, e ) ) |
---|
| 60 | else: |
---|
| 61 | return [] |
---|
| 62 | def get_decimal_quality_scores( self ): |
---|
| 63 | if self.is_ascii_encoded(): |
---|
| 64 | return [ ord( val ) - self.ascii_min + self.quality_min for val in self.quality ] |
---|
| 65 | else: |
---|
| 66 | quality = self.quality.rstrip() #decimal scores should have a trailing space |
---|
| 67 | if quality: |
---|
| 68 | return [ int( val ) for val in quality.split() if val.strip() ] |
---|
| 69 | else: |
---|
| 70 | return [] |
---|
| 71 | def convert_read_to_format( self, format, force_quality_encoding = None ): |
---|
| 72 | assert format in FASTQ_FORMATS, 'Unknown format type specified: %s' % format |
---|
| 73 | assert force_quality_encoding in [ None, 'ascii', 'decimal' ], 'Invalid force_quality_encoding: %s' % force_quality_encoding |
---|
| 74 | new_class = FASTQ_FORMATS[ format ] |
---|
| 75 | new_read = new_class() |
---|
| 76 | new_read.identifier = self.identifier |
---|
| 77 | if self.sequence_space == new_class.sequence_space: |
---|
| 78 | new_read.sequence = self.sequence |
---|
| 79 | else: |
---|
| 80 | if self.sequence_space == 'base': |
---|
| 81 | new_read.sequence = self.convert_base_to_color_space( self.sequence ) |
---|
| 82 | else: |
---|
| 83 | new_read.sequence = self.convert_color_to_base_space( self.sequence ) |
---|
| 84 | new_read.description = self.description |
---|
| 85 | if self.score_system != new_read.score_system: |
---|
| 86 | if self.score_system == 'phred': |
---|
| 87 | score_list = self.convert_score_phred_to_solexa( self.get_decimal_quality_scores() ) |
---|
| 88 | else: |
---|
| 89 | score_list = self.convert_score_solexa_to_phred( self.get_decimal_quality_scores() ) |
---|
| 90 | else: |
---|
| 91 | score_list = self.get_decimal_quality_scores() |
---|
| 92 | new_read.quality = "%s " % " ".join( map( str, new_class.restrict_scores_to_valid_range( score_list ) ) ) #need trailing space to be valid decimal fastq |
---|
| 93 | if force_quality_encoding is None: |
---|
| 94 | if self.is_ascii_encoded(): |
---|
| 95 | new_encoding = 'ascii' |
---|
| 96 | else: |
---|
| 97 | new_encoding = 'decimal' |
---|
| 98 | else: |
---|
| 99 | new_encoding = force_quality_encoding |
---|
| 100 | if new_encoding == 'ascii': |
---|
| 101 | new_read.quality = "".join( new_read.get_ascii_quality_scores() ) |
---|
| 102 | return new_read |
---|
| 103 | def get_sequence( self ): |
---|
| 104 | return self.sequence |
---|
| 105 | def slice( self, left_column_offset, right_column_offset ): |
---|
| 106 | new_read = fastqSequencingRead.get_class_by_format( self.format )() |
---|
| 107 | new_read.identifier = self.identifier |
---|
| 108 | new_read.sequence = self.get_sequence()[left_column_offset:right_column_offset] |
---|
| 109 | new_read.description = self.description |
---|
| 110 | if self.is_ascii_encoded(): |
---|
| 111 | new_read.quality = self.quality[left_column_offset:right_column_offset] |
---|
| 112 | else: |
---|
| 113 | quality = map( str, self.get_decimal_quality_scores()[left_column_offset:right_column_offset] ) |
---|
| 114 | if quality: |
---|
| 115 | new_read.quality = "%s " % " ".join( quality ) |
---|
| 116 | else: |
---|
| 117 | new_read.quality = '' |
---|
| 118 | return new_read |
---|
| 119 | def is_valid_format( self ): |
---|
| 120 | if self.is_ascii_encoded(): |
---|
| 121 | for val in self.get_ascii_quality_scores(): |
---|
| 122 | val = ord( val ) |
---|
| 123 | if val < self.ascii_min or val > self.ascii_max: |
---|
| 124 | return False |
---|
| 125 | else: |
---|
| 126 | for val in self.get_decimal_quality_scores(): |
---|
| 127 | if val < self.quality_min or val > self.quality_max: |
---|
| 128 | return False |
---|
| 129 | if not self.is_valid_sequence(): |
---|
| 130 | return False |
---|
| 131 | return True |
---|
| 132 | def is_valid_sequence( self ): |
---|
| 133 | for base in self.get_sequence(): |
---|
| 134 | if base not in self.valid_sequence_list: |
---|
| 135 | return False |
---|
| 136 | return True |
---|
| 137 | def insufficient_quality_length( self ): |
---|
| 138 | return len( self.get_ascii_quality_scores() ) < len( self.sequence ) |
---|
| 139 | def assert_sequence_quality_lengths( self ): |
---|
| 140 | qual_len = len( self.get_ascii_quality_scores() ) |
---|
| 141 | seq_len = len( self.sequence ) |
---|
| 142 | assert qual_len == seq_len, "Invalid FASTQ file: quality score length (%i) does not match sequence length (%i)" % ( qual_len, seq_len ) |
---|
| 143 | def reverse( self, clone = True ): |
---|
| 144 | #need to override how decimal quality scores are reversed |
---|
| 145 | if clone: |
---|
| 146 | rval = self.clone() |
---|
| 147 | else: |
---|
| 148 | rval = self |
---|
| 149 | rval.sequence = transform.reverse( self.sequence ) |
---|
| 150 | if rval.is_ascii_encoded(): |
---|
| 151 | rval.quality = rval.quality[::-1] |
---|
| 152 | else: |
---|
| 153 | rval.quality = reversed( rval.get_decimal_quality_scores() ) |
---|
| 154 | rval.quality = "%s " % " ".join( map( str, rval.quality ) ) |
---|
| 155 | return rval |
---|
| 156 | |
---|
| 157 | class fastqSangerRead( fastqSequencingRead ): |
---|
| 158 | format = 'sanger' |
---|
| 159 | ascii_min = 33 |
---|
| 160 | ascii_max = 126 |
---|
| 161 | quality_min = 0 |
---|
| 162 | quality_max = 93 |
---|
| 163 | score_system = 'phred' |
---|
| 164 | sequence_space = 'base' |
---|
| 165 | |
---|
| 166 | class fastqIlluminaRead( fastqSequencingRead ): |
---|
| 167 | format = 'illumina' |
---|
| 168 | ascii_min = 64 |
---|
| 169 | ascii_max = 126 |
---|
| 170 | quality_min = 0 |
---|
| 171 | quality_max = 62 |
---|
| 172 | score_system = 'phred' |
---|
| 173 | sequence_space = 'base' |
---|
| 174 | |
---|
| 175 | class fastqSolexaRead( fastqSequencingRead ): |
---|
| 176 | format = 'solexa' |
---|
| 177 | ascii_min = 59 |
---|
| 178 | ascii_max = 126 |
---|
| 179 | quality_min = -5 |
---|
| 180 | quality_max = 62 |
---|
| 181 | score_system = 'solexa' |
---|
| 182 | sequence_space = 'base' |
---|
| 183 | |
---|
| 184 | class fastqCSSangerRead( fastqSequencingRead ): |
---|
| 185 | format = 'cssanger' #color space |
---|
| 186 | ascii_min = 33 |
---|
| 187 | ascii_max = 126 |
---|
| 188 | quality_min = 0 |
---|
| 189 | quality_max = 93 |
---|
| 190 | score_system = 'phred' |
---|
| 191 | sequence_space = 'color' |
---|
| 192 | valid_sequence_list = map( str, range( 7 ) ) + [ '.' ] |
---|
| 193 | def __len__( self ): |
---|
| 194 | if self.has_adapter_base(): #Adapter base is not counted in length of read |
---|
| 195 | return len( self.sequence ) - 1 |
---|
| 196 | return fastqSequencingRead.__len__( self ) |
---|
| 197 | def has_adapter_base( self ): |
---|
| 198 | if self.sequence and self.sequence[0] in string.letters: #adapter base must be a letter |
---|
| 199 | return True |
---|
| 200 | return False |
---|
| 201 | def insufficient_quality_length( self ): |
---|
| 202 | if self.has_adapter_base(): |
---|
| 203 | return len( self.get_ascii_quality_scores() ) + 1 < len( self.sequence ) |
---|
| 204 | return fastqSequencingRead.insufficient_quality_length( self ) |
---|
| 205 | def assert_sequence_quality_lengths( self ): |
---|
| 206 | if self.has_adapter_base(): |
---|
| 207 | qual_len = len( self.get_ascii_quality_scores() ) |
---|
| 208 | seq_len = len( self.sequence ) |
---|
| 209 | assert qual_len + 1 == seq_len, "Invalid FASTQ file: quality score length (%i) does not match sequence length (%i with adapter base)" % ( qual_len, seq_len ) |
---|
| 210 | else: |
---|
| 211 | return fastqSequencingRead.assert_sequence_quality_lengths( self ) |
---|
| 212 | def get_sequence( self ): |
---|
| 213 | if self.has_adapter_base(): |
---|
| 214 | return self.sequence[1:] |
---|
| 215 | return self.sequence |
---|
| 216 | def reverse( self, clone = True ): |
---|
| 217 | #need to override how color space is reversed |
---|
| 218 | if clone: |
---|
| 219 | rval = self.clone() |
---|
| 220 | else: |
---|
| 221 | rval = self |
---|
| 222 | if rval.has_adapter_base(): |
---|
| 223 | adapter = rval.sequence[0] |
---|
| 224 | #sequence = rval.sequence[1:] |
---|
| 225 | rval.sequence = self.color_space_converter.to_color_space( transform.reverse( self.color_space_converter.to_base_space( rval.sequence ) ), adapter_base = adapter ) |
---|
| 226 | else: |
---|
| 227 | rval.sequence = transform.reverse( rval.sequence ) |
---|
| 228 | |
---|
| 229 | if rval.is_ascii_encoded(): |
---|
| 230 | rval.quality = rval.quality[::-1] |
---|
| 231 | else: |
---|
| 232 | rval.quality = reversed( rval.get_decimal_quality_scores() ) |
---|
| 233 | rval.quality = "%s " % " ".join( map( str, rval.quality ) ) |
---|
| 234 | return rval |
---|
| 235 | def complement( self, clone = True ): |
---|
| 236 | #need to override how color space is complemented |
---|
| 237 | if clone: |
---|
| 238 | rval = self.clone() |
---|
| 239 | else: |
---|
| 240 | rval = self |
---|
| 241 | if rval.has_adapter_base(): #No adapter, color space stays the same |
---|
| 242 | adapter = rval.sequence[0] |
---|
| 243 | sequence = rval.sequence[1:] |
---|
| 244 | if adapter.lower() != 'u': |
---|
| 245 | adapter = transform.DNA_complement( adapter ) |
---|
| 246 | else: |
---|
| 247 | adapter = transform.RNA_complement( adapter ) |
---|
| 248 | rval.sequence = "%s%s" % ( adapter, sequence ) |
---|
| 249 | return rval |
---|
| 250 | def change_adapter( self, new_adapter, clone = True ): |
---|
| 251 | #if new_adapter is empty, remove adapter, otherwise replace with new_adapter |
---|
| 252 | if clone: |
---|
| 253 | rval = self.clone() |
---|
| 254 | else: |
---|
| 255 | rval = self |
---|
| 256 | if rval.has_adapter_base(): |
---|
| 257 | if new_adapter: |
---|
| 258 | if new_adapter != rval.sequence[0]: |
---|
| 259 | rval.sequence = rval.color_space_converter.to_color_space( rval.color_space_converter.to_base_space( rval.sequence ), adapter_base = new_adapter ) |
---|
| 260 | else: |
---|
| 261 | rval.sequence = rval.sequence[1:] |
---|
| 262 | elif new_adapter: |
---|
| 263 | rval.sequence = "%s%s" % ( new_adapter, rval.sequence ) |
---|
| 264 | return rval |
---|
| 265 | |
---|
| 266 | |
---|
| 267 | FASTQ_FORMATS = {} |
---|
| 268 | for format in [ fastqIlluminaRead, fastqSolexaRead, fastqSangerRead, fastqCSSangerRead ]: |
---|
| 269 | FASTQ_FORMATS[ format.format ] = format |
---|
| 270 | |
---|
| 271 | |
---|
| 272 | class fastqAggregator( object ): |
---|
| 273 | VALID_FORMATS = FASTQ_FORMATS.keys() |
---|
| 274 | def __init__( self, ): |
---|
| 275 | self.ascii_values_used = [] #quick lookup of all ascii chars used |
---|
| 276 | self.seq_lens = {} #counts of seqs by read len |
---|
| 277 | self.nuc_index_quality = [] #counts of scores by read column |
---|
| 278 | self.nuc_index_base = [] #counts of bases by read column |
---|
| 279 | def consume_read( self, fastq_read ): |
---|
| 280 | #ascii values used |
---|
| 281 | for val in fastq_read.get_ascii_quality_scores(): |
---|
| 282 | if val not in self.ascii_values_used: |
---|
| 283 | self.ascii_values_used.append( val ) |
---|
| 284 | #lengths |
---|
| 285 | seq_len = len( fastq_read ) |
---|
| 286 | self.seq_lens[ seq_len ] = self.seq_lens.get( seq_len, 0 ) + 1 |
---|
| 287 | #decimal qualities by column |
---|
| 288 | for i, val in enumerate( fastq_read.get_decimal_quality_scores() ): |
---|
| 289 | if i == len( self.nuc_index_quality ): |
---|
| 290 | self.nuc_index_quality.append( {} ) |
---|
| 291 | self.nuc_index_quality[ i ][ val ] = self.nuc_index_quality[ i ].get( val, 0 ) + 1 |
---|
| 292 | #bases by column |
---|
| 293 | for i, nuc in enumerate( fastq_read.get_sequence() ): |
---|
| 294 | if i == len( self.nuc_index_base ): |
---|
| 295 | self.nuc_index_base.append( {} ) |
---|
| 296 | nuc = nuc.upper() |
---|
| 297 | self.nuc_index_base[ i ][ nuc ] = self.nuc_index_base[ i ].get( nuc, 0 ) + 1 |
---|
| 298 | def get_valid_formats( self, check_list = None ): |
---|
| 299 | if not check_list: |
---|
| 300 | check_list = self.VALID_FORMATS |
---|
| 301 | rval = [] |
---|
| 302 | sequence = [] |
---|
| 303 | for nuc_dict in self.nuc_index_base: |
---|
| 304 | for nuc in nuc_dict.keys(): |
---|
| 305 | if nuc not in sequence: |
---|
| 306 | sequence.append( nuc ) |
---|
| 307 | sequence = "".join( sequence ) |
---|
| 308 | quality = "".join( self.ascii_values_used ) |
---|
| 309 | for fastq_format in check_list: |
---|
| 310 | fastq_read = fastqSequencingRead.get_class_by_format( fastq_format )() |
---|
| 311 | fastq_read.quality = quality |
---|
| 312 | fastq_read.sequence = sequence |
---|
| 313 | if fastq_read.is_valid_format(): |
---|
| 314 | rval.append( fastq_format ) |
---|
| 315 | return rval |
---|
| 316 | def get_ascii_range( self ): |
---|
| 317 | return ( min( self.ascii_values_used ), max( self.ascii_values_used ) ) |
---|
| 318 | def get_decimal_range( self ): |
---|
| 319 | decimal_values_used = [] |
---|
| 320 | for scores in self.nuc_index_quality: |
---|
| 321 | decimal_values_used.extend( scores.keys() ) |
---|
| 322 | return ( min( decimal_values_used ), max( decimal_values_used ) ) |
---|
| 323 | def get_length_counts( self ): |
---|
| 324 | return self.seq_lens |
---|
| 325 | def get_max_read_length( self ): |
---|
| 326 | return len( self.nuc_index_quality ) |
---|
| 327 | def get_read_count_for_column( self, column ): |
---|
| 328 | if column >= len( self.nuc_index_quality ): |
---|
| 329 | return 0 |
---|
| 330 | return sum( self.nuc_index_quality[ column ].values() ) |
---|
| 331 | def get_read_count( self ): |
---|
| 332 | return self.get_read_count_for_column( 0 ) |
---|
| 333 | def get_base_counts_for_column( self, column ): |
---|
| 334 | return self.nuc_index_base[ column ] |
---|
| 335 | def get_score_list_for_column( self, column ): |
---|
| 336 | return self.nuc_index_quality[ column ].keys() |
---|
| 337 | def get_score_min_for_column( self, column ): |
---|
| 338 | return min( self.nuc_index_quality[ column ].keys() ) |
---|
| 339 | def get_score_max_for_column( self, column ): |
---|
| 340 | return max( self.nuc_index_quality[ column ].keys() ) |
---|
| 341 | def get_score_sum_for_column( self, column ): |
---|
| 342 | return sum( score * count for score, count in self.nuc_index_quality[ column ].iteritems() ) |
---|
| 343 | def get_score_at_position_for_column( self, column, position ): |
---|
| 344 | score_value_dict = self.nuc_index_quality[ column ] |
---|
| 345 | scores = sorted( score_value_dict.keys() ) |
---|
| 346 | for score in scores: |
---|
| 347 | if score_value_dict[ score ] <= position: |
---|
| 348 | position -= score_value_dict[ score ] |
---|
| 349 | else: |
---|
| 350 | return score |
---|
| 351 | def get_summary_statistics_for_column( self, i ): |
---|
| 352 | def _get_med_pos( size ): |
---|
| 353 | halfed = int( size / 2 ) |
---|
| 354 | if size % 2 == 1: |
---|
| 355 | return [ halfed ] |
---|
| 356 | return[ halfed - 1, halfed ] |
---|
| 357 | read_count = self.get_read_count_for_column( i ) |
---|
| 358 | |
---|
| 359 | min_score = self.get_score_min_for_column( i ) |
---|
| 360 | max_score = self.get_score_max_for_column( i ) |
---|
| 361 | sum_score = self.get_score_sum_for_column( i ) |
---|
| 362 | mean_score = float( sum_score ) / float( read_count ) |
---|
| 363 | #get positions |
---|
| 364 | med_pos = _get_med_pos( read_count ) |
---|
| 365 | if 0 in med_pos: |
---|
| 366 | q1_pos = [ 0 ] |
---|
| 367 | q3_pos = [ read_count - 1 ] |
---|
| 368 | else: |
---|
| 369 | q1_pos = _get_med_pos( min( med_pos ) ) |
---|
| 370 | q3_pos = [] |
---|
| 371 | for pos in q1_pos: |
---|
| 372 | q3_pos.append( max( med_pos ) + 1 + pos ) |
---|
| 373 | #get scores at position |
---|
| 374 | med_score = float( sum( [ self.get_score_at_position_for_column( i, pos ) for pos in med_pos ] ) ) / float( len( med_pos ) ) |
---|
| 375 | q1 = float( sum( [ self.get_score_at_position_for_column( i, pos ) for pos in q1_pos ] ) ) / float( len( q1_pos ) ) |
---|
| 376 | q3 = float( sum( [ self.get_score_at_position_for_column( i, pos ) for pos in q3_pos ] ) ) / float( len( q3_pos ) ) |
---|
| 377 | #determine iqr and step |
---|
| 378 | iqr = q3 - q1 |
---|
| 379 | step = 1.5 * iqr |
---|
| 380 | |
---|
| 381 | #Determine whiskers and outliers |
---|
| 382 | outliers = [] |
---|
| 383 | score_list = sorted( self.get_score_list_for_column( i ) ) |
---|
| 384 | left_whisker = q1 - step |
---|
| 385 | for score in score_list: |
---|
| 386 | if left_whisker <= score: |
---|
| 387 | left_whisker = score |
---|
| 388 | break |
---|
| 389 | else: |
---|
| 390 | outliers.append( score ) |
---|
| 391 | |
---|
| 392 | right_whisker = q3 + step |
---|
| 393 | score_list.reverse() |
---|
| 394 | for score in score_list: |
---|
| 395 | if right_whisker >= score: |
---|
| 396 | right_whisker = score |
---|
| 397 | break |
---|
| 398 | else: |
---|
| 399 | outliers.append( score ) |
---|
| 400 | |
---|
| 401 | column_stats = { 'read_count': read_count, |
---|
| 402 | 'min_score': min_score, |
---|
| 403 | 'max_score': max_score, |
---|
| 404 | 'sum_score': sum_score, |
---|
| 405 | 'mean_score': mean_score, |
---|
| 406 | 'q1': q1, |
---|
| 407 | 'med_score': med_score, |
---|
| 408 | 'q3': q3, |
---|
| 409 | 'iqr': iqr, |
---|
| 410 | 'left_whisker': left_whisker, |
---|
| 411 | 'right_whisker': right_whisker, |
---|
| 412 | 'outliers': outliers } |
---|
| 413 | return column_stats |
---|
| 414 | |
---|
| 415 | class fastqReader( object ): |
---|
| 416 | def __init__( self, fh, format = 'sanger' ): |
---|
| 417 | self.file = fh |
---|
| 418 | self.format = format |
---|
| 419 | def close( self ): |
---|
| 420 | return self.file.close() |
---|
| 421 | def next(self): |
---|
| 422 | while True: |
---|
| 423 | fastq_header = self.file.readline() |
---|
| 424 | if not fastq_header: |
---|
| 425 | raise StopIteration |
---|
| 426 | fastq_header = fastq_header.rstrip( '\n\r' ) |
---|
| 427 | #remove empty lines, apparently extra new lines at end of file is common? |
---|
| 428 | if fastq_header: |
---|
| 429 | break |
---|
| 430 | |
---|
| 431 | assert fastq_header.startswith( '@' ), 'Invalid fastq header: %s' % fastq_header |
---|
| 432 | rval = fastqSequencingRead.get_class_by_format( self.format )() |
---|
| 433 | rval.identifier = fastq_header |
---|
| 434 | while True: |
---|
| 435 | line = self.file.readline() |
---|
| 436 | if not line: |
---|
| 437 | raise Exception( 'Invalid FASTQ file: could not parse second instance of sequence identifier.' ) |
---|
| 438 | line = line.rstrip( '\n\r' ) |
---|
| 439 | if line.startswith( '+' ) and ( len( line ) == 1 or line[1:].startswith( fastq_header[1:] ) ): |
---|
| 440 | rval.description = line |
---|
| 441 | break |
---|
| 442 | rval.append_sequence( line ) |
---|
| 443 | while rval.insufficient_quality_length(): |
---|
| 444 | line = self.file.readline() |
---|
| 445 | if not line: |
---|
| 446 | break |
---|
| 447 | rval.append_quality( line ) |
---|
| 448 | rval.assert_sequence_quality_lengths() |
---|
| 449 | return rval |
---|
| 450 | def __iter__( self ): |
---|
| 451 | while True: |
---|
| 452 | yield self.next() |
---|
| 453 | |
---|
| 454 | class fastqNamedReader( object ): |
---|
| 455 | def __init__( self, fh, format = 'sanger' ): |
---|
| 456 | self.file = fh |
---|
| 457 | self.format = format |
---|
| 458 | self.reader = fastqReader( self.file, self.format ) |
---|
| 459 | #self.last_offset = self.file.tell() |
---|
| 460 | self.offset_dict = {} |
---|
| 461 | self.eof = False |
---|
| 462 | def close( self ): |
---|
| 463 | return self.file.close() |
---|
| 464 | def get( self, sequence_id ): |
---|
| 465 | if not isinstance( sequence_id, basestring ): |
---|
| 466 | sequence_id = sequence_id.identifier |
---|
| 467 | rval = None |
---|
| 468 | if sequence_id in self.offset_dict: |
---|
| 469 | initial_offset = self.file.tell() |
---|
| 470 | seq_offset = self.offset_dict[ sequence_id ].pop( 0 ) |
---|
| 471 | if not self.offset_dict[ sequence_id ]: |
---|
| 472 | del self.offset_dict[ sequence_id ] |
---|
| 473 | self.file.seek( seq_offset ) |
---|
| 474 | rval = self.reader.next() |
---|
| 475 | #assert rval.identifier == sequence_id, 'seq id mismatch' #should be able to remove this |
---|
| 476 | self.file.seek( initial_offset ) |
---|
| 477 | else: |
---|
| 478 | while True: |
---|
| 479 | offset = self.file.tell() |
---|
| 480 | try: |
---|
| 481 | fastq_read = self.reader.next() |
---|
| 482 | except StopIteration: |
---|
| 483 | self.eof = True |
---|
| 484 | break #eof, id not found, will return None |
---|
| 485 | if fastq_read.identifier == sequence_id: |
---|
| 486 | rval = fastq_read |
---|
| 487 | break |
---|
| 488 | else: |
---|
| 489 | if fastq_read.identifier not in self.offset_dict: |
---|
| 490 | self.offset_dict[ fastq_read.identifier ] = [] |
---|
| 491 | self.offset_dict[ fastq_read.identifier ].append( offset ) |
---|
| 492 | return rval |
---|
| 493 | def has_data( self ): |
---|
| 494 | #returns a string representation of remaining data, or empty string (False) if no data remaining |
---|
| 495 | eof = self.eof |
---|
| 496 | count = 0 |
---|
| 497 | rval = '' |
---|
| 498 | if self.offset_dict: |
---|
| 499 | count = sum( map( len, self.offset_dict.values() ) ) |
---|
| 500 | if not eof: |
---|
| 501 | offset = self.file.tell() |
---|
| 502 | try: |
---|
| 503 | fastq_read = self.reader.next() |
---|
| 504 | except StopIteration: |
---|
| 505 | eof = True |
---|
| 506 | self.file.seek( offset ) |
---|
| 507 | if count: |
---|
| 508 | rval = "There were %i known sequence reads not utilized. " |
---|
| 509 | if not eof: |
---|
| 510 | rval = "%s%s" % ( rval, "An additional unknown number of reads exist in the input that were not utilized." ) |
---|
| 511 | return rval |
---|
| 512 | |
---|
| 513 | class fastqWriter( object ): |
---|
| 514 | def __init__( self, fh, format = None, force_quality_encoding = None ): |
---|
| 515 | self.file = fh |
---|
| 516 | self.format = format |
---|
| 517 | self.force_quality_encoding = force_quality_encoding |
---|
| 518 | def write( self, fastq_read ): |
---|
| 519 | if self.format: |
---|
| 520 | fastq_read = fastq_read.convert_read_to_format( self.format, force_quality_encoding = self.force_quality_encoding ) |
---|
| 521 | self.file.write( str( fastq_read ) ) |
---|
| 522 | def close( self ): |
---|
| 523 | return self.file.close() |
---|
| 524 | |
---|
| 525 | class fastqJoiner( object ): |
---|
| 526 | def __init__( self, format, force_quality_encoding = None ): |
---|
| 527 | self.format = format |
---|
| 528 | self.force_quality_encoding = force_quality_encoding |
---|
| 529 | def join( self, read1, read2 ): |
---|
| 530 | if read1.identifier.endswith( '/2' ) and read2.identifier.endswith( '/1' ): |
---|
| 531 | #swap 1 and 2 |
---|
| 532 | tmp = read1 |
---|
| 533 | read1 = read2 |
---|
| 534 | read2 = tmp |
---|
| 535 | del tmp |
---|
| 536 | if read1.identifier.endswith( '/1' ) and read2.identifier.endswith( '/2' ): |
---|
| 537 | identifier = read1.identifier[:-2] |
---|
| 538 | else: |
---|
| 539 | identifier = read1.identifier |
---|
| 540 | |
---|
| 541 | #use force quality encoding, if not present force to encoding of first read |
---|
| 542 | force_quality_encoding = self.force_quality_encoding |
---|
| 543 | if not force_quality_encoding: |
---|
| 544 | if read1.is_ascii_encoded(): |
---|
| 545 | force_quality_encoding = 'ascii' |
---|
| 546 | else: |
---|
| 547 | force_quality_encoding = 'decimal' |
---|
| 548 | |
---|
| 549 | new_read1 = read1.convert_read_to_format( self.format, force_quality_encoding = force_quality_encoding ) |
---|
| 550 | new_read2 = read2.convert_read_to_format( self.format, force_quality_encoding = force_quality_encoding ) |
---|
| 551 | rval = FASTQ_FORMATS[ self.format ]() |
---|
| 552 | rval.identifier = identifier |
---|
| 553 | if len( read1.description ) > 1: |
---|
| 554 | rval.description = "+%s" % ( identifier[1:] ) |
---|
| 555 | else: |
---|
| 556 | rval.description = '+' |
---|
| 557 | if rval.sequence_space == 'color': |
---|
| 558 | #need to handle color space joining differently |
---|
| 559 | #convert to nuc space, join, then convert back |
---|
| 560 | rval.sequence = rval.convert_base_to_color_space( new_read1.convert_color_to_base_space( new_read1.sequence ) + new_read2.convert_color_to_base_space( new_read2.sequence ) ) |
---|
| 561 | else: |
---|
| 562 | rval.sequence = new_read1.sequence + new_read2.sequence |
---|
| 563 | if force_quality_encoding == 'ascii': |
---|
| 564 | rval.quality = new_read1.quality + new_read2.quality |
---|
| 565 | else: |
---|
| 566 | rval.quality = "%s %s" % ( new_read1.quality.strip(), new_read2.quality.strip() ) |
---|
| 567 | return rval |
---|
| 568 | def get_paired_identifier( self, fastq_read ): |
---|
| 569 | identifier = fastq_read.identifier |
---|
| 570 | if identifier[-2] == '/': |
---|
| 571 | if identifier[-1] == "1": |
---|
| 572 | identifier = "%s2" % identifier[:-1] |
---|
| 573 | elif identifier[-1] == "2": |
---|
| 574 | identifier = "%s1" % identifier[:-1] |
---|
| 575 | return identifier |
---|
| 576 | |
---|
| 577 | class fastqSplitter( object ): |
---|
| 578 | def split( self, fastq_read ): |
---|
| 579 | length = len( fastq_read ) |
---|
| 580 | #Only reads of even lengths can be split |
---|
| 581 | if length % 2 != 0: |
---|
| 582 | return None, None |
---|
| 583 | half = int( length / 2 ) |
---|
| 584 | read1 = fastq_read.slice( 0, half ) |
---|
| 585 | read1.identifier += "/1" |
---|
| 586 | if len( read1.description ) > 1: |
---|
| 587 | read1.description += "/1" |
---|
| 588 | read2 = fastq_read.slice( half, None ) |
---|
| 589 | read2.identifier += "/2" |
---|
| 590 | if len( read2.description ) > 1: |
---|
| 591 | read2.description += "/2" |
---|
| 592 | return read1, read2 |
---|
| 593 | |
---|
| 594 | class fastqCombiner( object ): |
---|
| 595 | def __init__( self, format ): |
---|
| 596 | self.format = format |
---|
| 597 | def combine(self, fasta_seq, quality_seq ): |
---|
| 598 | fastq_read = fastqSequencingRead.get_class_by_format( self.format )() |
---|
| 599 | fastq_read.identifier = "@%s" % fasta_seq.identifier[1:] |
---|
| 600 | fastq_read.description = '+' |
---|
| 601 | fastq_read.sequence = fasta_seq.sequence |
---|
| 602 | fastq_read.quality = quality_seq.sequence |
---|
| 603 | return fastq_read |
---|
| 604 | |
---|
| 605 | class fastqFakeFastaScoreReader( object ): |
---|
| 606 | def __init__( self, format = 'sanger', quality_encoding = None ): |
---|
| 607 | self.fastq_read = fastqSequencingRead.get_class_by_format( format )() |
---|
| 608 | if quality_encoding != 'decimal': |
---|
| 609 | quality_encoding = 'ascii' |
---|
| 610 | self.quality_encoding = quality_encoding |
---|
| 611 | def close( self ): |
---|
| 612 | return #nothing to close |
---|
| 613 | def get( self, sequence ): |
---|
| 614 | assert isinstance( sequence, fastaSequence ), 'fastqFakeFastaScoreReader requires a fastaSequence object as the parameter' |
---|
| 615 | #add sequence to fastq_read, then get_sequence(), color space adapters do not have quality score values |
---|
| 616 | self.fastq_read.sequence = sequence.sequence |
---|
| 617 | new_sequence = fastaSequence() |
---|
| 618 | new_sequence.identifier = sequence.identifier |
---|
| 619 | if self.quality_encoding == 'ascii': |
---|
| 620 | new_sequence.sequence = chr( self.fastq_read.ascii_max ) * len( self.fastq_read.get_sequence() ) |
---|
| 621 | else: |
---|
| 622 | new_sequence.sequence = ( "%i " % self.fastq_read.quality_max ) * len( self.fastq_read.get_sequence() ) |
---|
| 623 | return new_sequence |
---|
| 624 | def has_data( self ): |
---|
| 625 | return '' #No actual data exist, none can be remaining |
---|