reads on various attributesfastq_manipulation.py $input_file $fastq_manipulation_file $output_file $output_file.files_path '${input_file.extension[len( 'fastq' ):]}'int( float( value ) ) == float( value )int( float( value ) ) == float( value )##create an importable module
#import binascii
import re
import binascii
from string import maketrans
##Decide whether a read satisfies every configured match directive.
##  fastq_read: read object; its identifier, sequence and quality attributes are searched
##  returns: False as soon as one directive's regular expression is not found in its
##           target attribute, True otherwise (including when no directives are configured)
##Each pattern is hex-encoded at template time and decoded at run time, so the user's
##text never appears verbatim in the generated source.
def match_read( fastq_read ):
#for $match_block in $match_blocks:
    #if $match_block['match_type']['match_type_selector'] == 'identifier':
    target_text = fastq_read.identifier[1:] ##skip the leading @
    #elif $match_block['match_type']['match_type_selector'] == 'sequence':
    target_text = fastq_read.sequence
    #elif $match_block['match_type']['match_type_selector'] == 'quality':
    target_text = fastq_read.quality
    #else:
        ##unknown selector: emit nothing for this directive
        #continue
    #end if
    if re.search( binascii.unhexlify( "${ binascii.hexlify( str( match_block['match_type']['match']['match_by'] ) ) }" ), target_text ) is None:
        return False
#end for
    ##no directive rejected the read
    return True
##modify matched reads
def manipulate_read( fastq_read ):
new_read = fastq_read.clone()
#for $manipulate_block in $manipulate_blocks:
#if $manipulate_block['manipulation_type']['manipulation_type_selector'] == 'identifier':
#if $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'translate':
new_read.identifier = "@%s" % new_read.identifier[1:].translate( maketrans( binascii.unhexlify( "${ binascii.hexlify( str( manipulate_block['manipulation_type']['manipulation']['from'] ) ) }" ), binascii.unhexlify( "${ binascii.hexlify( str( manipulate_block['manipulation_type']['manipulation']['to'] ) ) }" ) ) )
#end if
#elif $manipulate_block['manipulation_type']['manipulation_type_selector'] == 'sequence':
#if $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'translate':
new_read.sequence = new_read.sequence.translate( maketrans( binascii.unhexlify( "${ binascii.hexlify( str( manipulate_block['manipulation_type']['manipulation']['from'] ) ) }" ), binascii.unhexlify( "${ binascii.hexlify( str( manipulate_block['manipulation_type']['manipulation']['to'] ) ) }" ) ) )
#elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'rev_comp':
new_read = new_read.reverse_complement()
#elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'rev_no_comp':
new_read = new_read.reverse()
#elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'no_rev_comp':
new_read = new_read.complement()
#elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'trim':
#if $manipulate_block['manipulation_type']['manipulation']['offset_type']['base_offset_type'] == 'offsets_percent':
left_column_offset = int( round( float( ${ manipulate_block['manipulation_type']['manipulation']['offset_type']['left_column_offset'] } ) / 100.0 * float( len( new_read ) ) ) )
right_column_offset = int( round( float( ${ manipulate_block['manipulation_type']['manipulation']['offset_type']['right_column_offset'] } ) / 100.0 * float( len( new_read ) ) ) )
#else
left_column_offset = ${ manipulate_block['manipulation_type']['manipulation']['offset_type']['left_column_offset'] }
right_column_offset = ${ manipulate_block['manipulation_type']['manipulation']['offset_type']['right_column_offset'] }
#end if
if right_column_offset > 0:
right_column_offset = -right_column_offset
else:
right_column_offset = None
new_read = new_read.slice( left_column_offset, right_column_offset )
if not ( ${str( manipulate_block['manipulation_type']['manipulation']['keep_zero_length'] ) == 'keep_zero_length'} or len( new_read ) ):
return None
#elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'dna_to_rna':
new_read = new_read.sequence_as_DNA()
#elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'rna_to_dna':
new_read = new_read.sequence_as_RNA()
#elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'change_adapter':
if new_read.sequence_space == 'color':
new_read = new_read.change_adapter( binascii.unhexlify( "${ binascii.hexlify( str( manipulate_block['manipulation_type']['manipulation']['new_adapter'] ) ) }" ) )
#end if
#elif $manipulate_block['manipulation_type']['manipulation_type_selector'] == 'quality':
#if $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'translate':
new_read.quality = new_read.quality.translate( maketrans( binascii.unhexlify( "${ binascii.hexlify( str( manipulate_block['manipulation_type']['manipulation']['from'] ) ) }" ), binascii.unhexlify( "${ binascii.hexlify( str( manipulate_block['manipulation_type']['manipulation']['to'] ) ) }" ) ) )
#elif $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'map_score':
def score_method( score ):
raise Exception, "Unimplemented" ##This option is not yet available, need to abstract out e.g. column adding tool action: preventing users from using 'harmful' actions
new_read.quality_map( score_method )
#end if
#elif $manipulate_block['manipulation_type']['manipulation_type_selector'] == 'miscellaneous':
#if $manipulate_block['manipulation_type']['manipulation']['manipulation_selector'] == 'remove':
return None
#end if
#else:
#continue
#end if
#end for
if new_read.description != "+":
new_read.description = "+%s" % new_read.identifier[1:] ##ensure description is still valid
return new_read
##Manipulate a read only when it satisfies the match directives.
##  fastq_read: read to test with match_read
##  returns: the result of manipulate_read (which may be None) for a matching read;
##           a read that does not match is handed back as received
def match_and_manipulate_read( fastq_read ):
    if not match_read( fastq_read ):
        return fastq_read
    return manipulate_read( fastq_read )
This tool allows you to build complex manipulations to be applied to each matching read in a FASTQ file. A read must match all matching directives in order for it to be manipulated; if a read does not match, it is output unmodified. All matching reads will have each of the specified manipulations performed upon them, in the order specified.
Regular Expression Matches are made using re.search, see http://docs.python.org/library/re.html for more information.
All matching is performed on a single-line string, regardless of whether, e.g., the sequence or quality score spans multiple lines in the original file.
String translations are performed using string.translate, see http://docs.python.org/library/string.html#string.translate and http://docs.python.org/library/string.html#string.maketrans for more information.
.. class:: warningmark
Only color space reads can have adapter bases substituted.
-----
**Example**
Suppose you have a color space sanger formatted sequence (fastqcssanger) and you want to double-encode the color space into pseudo-nucleotide space (this is different from converting) to allow these reads to be used in tools which do not natively support it (using specially designed indexes). This tool can handle this manipulation; however, this is generally not recommended, as results tend to be poorer than those produced by tools which are specially designed to handle color space data.
Steps:
1. Click **Add new Match Reads** and leave the matching options set to the default (Matching by sequence name/identifier using the regular expression ".\*"; thereby matching all reads).
2. Click **Add new Manipulate Reads**, change **Manipulate Reads on** to "Sequence Content", set **Sequence Manipulation Type** to "Change Adapter Base" and set **New Adapter** to "" (an empty text field).
3. Click **Add new Manipulate Reads**, change **Manipulate Reads on** to "Sequence Content", set **Sequence Manipulation Type** to "String Translate" and set **From** to "0123." and **To** to "ACGTN".
4. Click Execute. The new history item will contain double-encoded pseudo-nucleotide space reads.