1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | import sys |
---|
4 | import string |
---|
5 | import optparse |
---|
6 | import tempfile |
---|
7 | import sqlite3 |
---|
8 | |
---|
9 | def stop_err( msg ): |
---|
10 | sys.stderr.write( msg ) |
---|
11 | sys.exit() |
---|
12 | |
---|
13 | def solid2sanger( quality_string, min_qual = 0 ): |
---|
14 | sanger = "" |
---|
15 | quality_string = quality_string.rstrip( " " ) |
---|
16 | for qv in quality_string.split(" "): |
---|
17 | try: |
---|
18 | if int( qv ) < 0: |
---|
19 | qv = '0' |
---|
20 | if int( qv ) < min_qual: |
---|
21 | return False |
---|
22 | break |
---|
23 | sanger += chr( int( qv ) + 33 ) |
---|
24 | except: |
---|
25 | pass |
---|
26 | return sanger |
---|
27 | |
---|
28 | def Translator(frm='', to='', delete='', keep=None): |
---|
29 | allchars = string.maketrans('','') |
---|
30 | if len(to) == 1: |
---|
31 | to = to * len(frm) |
---|
32 | trans = string.maketrans(frm, to) |
---|
33 | if keep is not None: |
---|
34 | delete = allchars.translate(allchars, keep.translate(allchars, delete)) |
---|
35 | def callable(s): |
---|
36 | return s.translate(trans, delete) |
---|
37 | return callable |
---|
38 | |
---|
39 | def merge_reads_qual( f_reads, f_qual, f_out, trim_name=False, out='fastq', double_encode = False, trim_first_base = False, pair_end_flag = '', min_qual = 0, table_name=None ): |
---|
40 | |
---|
41 | # Reads from two files f_csfasta (reads) and f_qual (quality values) and produces output in three formats depending on out parameter, |
---|
42 | # which can have three values: fastq, txt, and db |
---|
43 | # fastq = fastq format |
---|
44 | # txt = space delimited format with defline, reads, and qvs |
---|
45 | # dp = dump data into sqlite3 db. |
---|
46 | # IMPORTNAT! If out = db two optins must be provided: |
---|
47 | # 1. f_out must be a db connection object initialized with sqlite3.connect() |
---|
48 | # 2. table_name must be provided |
---|
49 | |
---|
50 | if out == 'db': |
---|
51 | cursor = f_out.cursor() |
---|
52 | sql = "create table %s (name varchar(50) not null, read blob, qv blob)" % table_name |
---|
53 | cursor.execute(sql) |
---|
54 | |
---|
55 | lines = [] |
---|
56 | line = " " |
---|
57 | while line: |
---|
58 | for f in [ f_reads, f_qual ]: |
---|
59 | line = f.readline().rstrip( '\n\r' ) |
---|
60 | while line.startswith( '#' ): |
---|
61 | line = f.readline().rstrip( '\n\r' ) |
---|
62 | lines.append( line ) |
---|
63 | |
---|
64 | if lines[0].startswith( '>' ): |
---|
65 | defline = lines[0][1:] |
---|
66 | if trim_name and ( defline[ len( defline )-3: ] == "_F3" or defline[ len( defline )-3: ] == "_R3" ): |
---|
67 | defline = defline[ : len( defline )-3 ] |
---|
68 | |
---|
69 | else: |
---|
70 | |
---|
71 | if trim_first_base: |
---|
72 | lines[0] = lines[0][1:] |
---|
73 | if double_encode: |
---|
74 | de = Translator(frm="0123.", to="ACGTN") |
---|
75 | lines[0] = de(lines[0]) |
---|
76 | qual = solid2sanger( lines[1], int( min_qual ) ) |
---|
77 | if qual: |
---|
78 | if out == 'fastq': |
---|
79 | f_out.write( "@%s%s\n%s\n+\n%s\n" % ( defline, pair_end_flag, lines[0], qual ) ) |
---|
80 | if out == 'txt': |
---|
81 | f_out.write( '%s %s %s\n' % (defline, lines[0], qual ) ) |
---|
82 | if out == 'db': |
---|
83 | cursor.execute('insert into %s values("%s","%s","%s")' % (table_name, defline, lines[0], qual ) ) |
---|
84 | lines = [] |
---|
85 | |
---|
86 | def main(): |
---|
87 | |
---|
88 | usage = "%prog --fr F3.csfasta --fq R3.csfasta --fout fastq_output_file [option]" |
---|
89 | parser = optparse.OptionParser(usage=usage) |
---|
90 | |
---|
91 | |
---|
92 | parser.add_option( |
---|
93 | '--fr','--f_reads', |
---|
94 | metavar="F3_CSFASTA_FILE", |
---|
95 | dest='fr', |
---|
96 | help='Name of F3 file with color space reads') |
---|
97 | |
---|
98 | parser.add_option( |
---|
99 | '--fq','--f_qual', |
---|
100 | metavar="F3_QUAL_FILE", |
---|
101 | dest='fq', |
---|
102 | help='Name of F3 file with color quality values') |
---|
103 | |
---|
104 | parser.add_option( |
---|
105 | '--fout','--f3_fastq_output', |
---|
106 | metavar="F3_OUTPUT", |
---|
107 | dest='fout', |
---|
108 | help='Name for F3 output file') |
---|
109 | |
---|
110 | parser.add_option( |
---|
111 | '--rr','--r_reads', |
---|
112 | metavar="R3_CSFASTA_FILE", |
---|
113 | dest='rr', |
---|
114 | default = False, |
---|
115 | help='Name of R3 file with color space reads') |
---|
116 | |
---|
117 | parser.add_option( |
---|
118 | '--rq','--r_qual', |
---|
119 | metavar="R3_QUAL_FILE", |
---|
120 | dest='rq', |
---|
121 | default = False, |
---|
122 | help='Name of R3 file with color quality values') |
---|
123 | |
---|
124 | parser.add_option( |
---|
125 | '--rout', |
---|
126 | metavar="R3_OUTPUT", |
---|
127 | dest='rout', |
---|
128 | help='Name for F3 output file') |
---|
129 | |
---|
130 | parser.add_option( |
---|
131 | '-q','--min_qual', |
---|
132 | dest='min_qual', |
---|
133 | default = '-1000', |
---|
134 | help='Minimum quality threshold for printing reads. If a read contains a single call with QV lower than this value, it will not be reported. Default is -1000') |
---|
135 | |
---|
136 | parser.add_option( |
---|
137 | '-t','--trim_name', |
---|
138 | dest='trim_name', |
---|
139 | action='store_true', |
---|
140 | default = False, |
---|
141 | help='Trim _R3 and _F3 off read names. Default is False') |
---|
142 | |
---|
143 | parser.add_option( |
---|
144 | '-f','--trim_first_base', |
---|
145 | dest='trim_first_base', |
---|
146 | action='store_true', |
---|
147 | default = False, |
---|
148 | help='Remove the first base of reads in color-space. Default is False') |
---|
149 | |
---|
150 | parser.add_option( |
---|
151 | '-d','--double_encode', |
---|
152 | dest='de', |
---|
153 | action='store_true', |
---|
154 | default = False, |
---|
155 | help='Double encode color calls as nucleotides: 0123. becomes ACGTN. Default is False') |
---|
156 | |
---|
157 | options, args = parser.parse_args() |
---|
158 | |
---|
159 | if not ( options.fout and options.fr and options.fq ): |
---|
160 | parser.error(""" |
---|
161 | One or more of the three required paremetrs is missing: |
---|
162 | (1) --fr F3.csfasta file |
---|
163 | (2) --fq F3.qual file |
---|
164 | (3) --fout name of output file |
---|
165 | Use --help for more info |
---|
166 | """) |
---|
167 | |
---|
168 | fr = open ( options.fr , 'r' ) |
---|
169 | fq = open ( options.fq , 'r' ) |
---|
170 | f_out = open ( options.fout , 'w' ) |
---|
171 | |
---|
172 | if options.rr and options.rq: |
---|
173 | rr = open ( options.rr , 'r' ) |
---|
174 | rq = open ( options.rq , 'r' ) |
---|
175 | if not options.rout: |
---|
176 | parser.error("Provide the name for f3 output using --rout option. Use --help for more info") |
---|
177 | r_out = open ( options.rout, 'w' ) |
---|
178 | |
---|
179 | db = tempfile.NamedTemporaryFile() |
---|
180 | |
---|
181 | try: |
---|
182 | con = sqlite3.connect(db.name) |
---|
183 | cur = con.cursor() |
---|
184 | except: |
---|
185 | stop_err('Cannot connect to %s\n') % db.name |
---|
186 | |
---|
187 | |
---|
188 | merge_reads_qual( fr, fq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="f3" ) |
---|
189 | merge_reads_qual( rr, rq, con, trim_name=options.trim_name, out='db', double_encode=options.de, trim_first_base=options.trim_first_base, min_qual=options.min_qual, table_name="r3" ) |
---|
190 | cur.execute('create index f3_name on f3( name )') |
---|
191 | cur.execute('create index r3_name on r3( name )') |
---|
192 | |
---|
193 | cur.execute('select * from r3,f3 where f3.name = r3.name') |
---|
194 | for item in cur: |
---|
195 | f_out.write( "@%s%s\n%s\n+\n%s\n" % (item[0], "/1", item[1], item[2]) ) |
---|
196 | r_out.write( "@%s%s\n%s\n+\n%s\n" % (item[3], "/2", item[4], item[5]) ) |
---|
197 | |
---|
198 | |
---|
199 | else: |
---|
200 | merge_reads_qual( fr, fq, f_out, trim_name=options.trim_name, out='fastq', double_encode = options.de, trim_first_base = options.trim_first_base, min_qual=options.min_qual ) |
---|
201 | |
---|
202 | |
---|
203 | |
---|
204 | f_out.close() |
---|
205 | |
---|
206 | if __name__ == "__main__": |
---|
207 | main() |
---|
208 | |
---|
209 | |
---|