#!/usr/bin/python
'''This software extracts the seq, qual and ancillary information from an sff
file, like the ones used by the 454 sequencer.

Optionally, it can also split paired-end reads if given the linker sequence.
The splitting is done with maximum match, i.e., every occurrence of the linker
sequence will be removed, even if occurring multiple times.'''
---|
| 8 | |
---|
| 9 | #copyright Jose Blanca and Bastien Chevreux |
---|
| 10 | #COMAV institute, Universidad Politecnica de Valencia (UPV) |
---|
| 11 | #Valencia, Spain |
---|
| 12 | |
---|
| 13 | # additions to handle paired end reads by Bastien Chevreux |
---|
| 14 | # bugfixes for linker specific lengths: Lionel Guy |
---|
| 15 | |
---|
| 16 | #This program is free software: you can redistribute it and/or modify |
---|
| 17 | #it under the terms of the GNU General Public License as published by |
---|
| 18 | #the Free Software Foundation, either version 3 of the License, or |
---|
| 19 | #(at your option) any later version. |
---|
| 20 | #This program is distributed in the hope that it will be useful, |
---|
| 21 | #but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
| 22 | #MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
| 23 | #GNU General Public License for more details. |
---|
| 24 | #You should have received a copy of the GNU General Public License |
---|
| 25 | #along with this program. If not, see <http://www.gnu.org/licenses/>. |
---|
| 26 | |
---|
| 27 | __author__ = 'Jose Blanca and Bastien Chevreux' |
---|
| 28 | __copyright__ = 'Copyright 2008, Jose Blanca, COMAV, and Bastien Chevreux' |
---|
| 29 | __license__ = 'GPLv3 or later' |
---|
| 30 | __version__ = '0.2.8' |
---|
| 31 | __email__ = 'jblanca@btc.upv.es' |
---|
| 32 | __status__ = 'beta' |
---|
| 33 | |
---|
| 34 | import struct |
---|
| 35 | import sys |
---|
| 36 | import os |
---|
| 37 | import subprocess |
---|
| 38 | import tempfile |
---|
| 39 | |
---|
| 40 | |
---|
# Placeholder key used for xml_info entries that were given without an
# explicit sff file name.
fake_sff_name = 'fake_sff_name'


# readname as key: lines with matches from SSAHA, one best match
ssahapematches = {}
# linker readname as key: length of linker sequence
linkerlengths = {}

# set to true if something really fishy is going on with the sequences
# NOTE(review): despite the comment above, this is initialised to True,
# not False -- confirm whether the initial value is intentional.
stern_warning = True
---|
| 51 | |
---|
def read_bin_fragment(struct_def, fileh, offset=0, data=None,
                      byte_padding=None):
    '''Read a chunk of a binary file according to struct_def.

    struct_def is a sequence of (field_name, struct_format) pairs; every
    field is read big-endian, starting at offset. An optional dict may be
    supplied and will be populated with the extracted values; fields that
    unpack to a single value are stored unwrapped, not as 1-tuples.
    When byte_padding is given, the reported byte count is rounded up to
    a multiple of it (to skip the pad at the end of the fragment).

    Returns a (bytes_read, data) tuple.
    '''
    if data is None:
        data = {}

    consumed = 0
    for name, fmt in struct_def:
        # position the file on the next field and read it
        fileh.seek(offset + consumed)
        size = struct.calcsize(fmt)
        values = struct.unpack('>' + fmt, fileh.read(size))
        # single-value fields are stored as scalars
        data[name] = values[0] if len(values) == 1 else values
        consumed += size

    if byte_padding is not None:
        # round up to the next multiple of the padding
        pad = byte_padding
        consumed = ((consumed + pad - 1) // pad) * pad

    return (consumed, data)
---|
| 87 | |
---|
| 88 | |
---|
def check_magic(magic):
    '''Verify that magic is the sff magic number; raise RuntimeError if not.'''
    SFF_MAGIC = 779314790  # 0x2E736666, the ascii bytes ".sff"
    if magic == SFF_MAGIC:
        return
    raise RuntimeError('This file does not seems to be an sff file.')
---|
| 93 | |
---|
def check_version(version):
    '''Check that the sff version is the supported one (0.0.0.1).

    Raises RuntimeError for any other version.
    '''
    supported = ('\x00', '\x00', '\x00', '\x01')
    for i, byte in enumerate(version):
        if byte != supported[i]:
            raise RuntimeError('SFF version not supported. Please contact the author of the software.')
---|
| 102 | |
---|
def read_header(fileh):
    '''Read the sff file header and return its fields as a dict.

    Validates the magic number and the file version before reading the
    variable-length part (flow characters and key sequence).
    '''
    # fixed-size part of the header
    fixed_part = [
        ('magic_number', 'I'),
        ('version', 'cccc'),
        ('index_offset', 'Q'),
        ('index_length', 'I'),
        ('number_of_reads', 'I'),
        ('header_length', 'H'),
        ('key_length', 'H'),
        ('number_of_flows_per_read', 'H'),
        ('flowgram_format_code', 'B'),
    ]
    data = {}
    first_bytes, data = read_bin_fragment(struct_def=fixed_part, fileh=fileh,
                                          offset=0, data=data)
    check_magic(data['magic_number'])
    check_version(data['version'])
    # the size of the variable part depends on values just read
    variable_part = [
        ('flow_chars', str(data['number_of_flows_per_read']) + 'c'),
        ('key_sequence', str(data['key_length']) + 'c'),
    ]
    read_bin_fragment(struct_def=variable_part, fileh=fileh,
                      offset=first_bytes, data=data)
    return data
---|
| 132 | |
---|
| 133 | |
---|
def read_sequence(header, fileh, fposition):
    '''Read one read from the sff file located at fposition.

    Returns (bytes_consumed, data) where data maps field names (name,
    bases, quality_scores, the four clip points, flowgram values, ...)
    to their values. The clip points are normalised following the NCBI
    Trace Archive RFC. Honours the global config dict: 'min_leftclip'
    overrides too-small left clips, and 'clip' triggers hard clipping
    via clip_read().

    Raises RuntimeError for unsupported flowgram format codes.
    '''
    # struct for the fixed part of the read header
    read_header_1 = [
        ('read_header_length', 'H'),
        ('name_length', 'H'),
        ('number_of_bases', 'I'),
        ('clip_qual_left', 'H'),
        ('clip_qual_right', 'H'),
        ('clip_adapter_left', 'H'),
        ('clip_adapter_right', 'H'),
    ]

    def read_header_2(name_length):
        '''It returns the struct definition for the second part of the
        header (the read name).'''
        return [('name', str(name_length) + 'c')]

    def read_data(number_of_bases):
        '''It returns the struct definition for the read data section.'''
        if header['flowgram_format_code'] == 1:
            flow_type = 'H'
        else:
            # bug fix: this used to raise the undefined name "Error",
            # which itself was a NameError at runtime
            raise RuntimeError('file version not supported')
        number_of_bases = str(number_of_bases)
        return [
            ('flowgram_values', str(header['number_of_flows_per_read']) +
                                                                flow_type),
            ('flow_index_per_base', number_of_bases + 'B'),
            ('bases', number_of_bases + 'c'),
            ('quality_scores', number_of_bases + 'B'),
        ]

    data = {}
    # the fixed part of the read header
    bytes_read, data = read_bin_fragment(struct_def=read_header_1,
                                         fileh=fileh, offset=fposition,
                                         data=data)
    # the variable part: the read name
    read_bin_fragment(struct_def=read_header_2(data['name_length']),
                      fileh=fileh, offset=fposition + bytes_read, data=data)
    # join the individual letters of the name
    data['name'] = ''.join(data['name'])
    offset = data['read_header_length']
    # the sequence, qualities and flowgram live in an 8-byte-padded section
    read_data_st = read_data(data['number_of_bases'])
    bytes_read, data = read_bin_fragment(struct_def=read_data_st,
                                         fileh=fileh,
                                         offset=fposition + offset,
                                         data=data, byte_padding=8)
    # join the bases
    data['bases'] = ''.join(data['bases'])

    # a right clip <= the left clip means "whole sequence":
    # reset both to 0
    if data['clip_qual_right'] <= data['clip_qual_left']:
        data['clip_qual_right'] = 0
        data['clip_qual_left'] = 0
    if data['clip_adapter_right'] <= data['clip_adapter_left']:
        data['clip_adapter_right'] = 0
        data['clip_adapter_left'] = 0

    # the clipping section follows the NCBI Trace Archive RFC
    # http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=rfc&m=doc&s=rfc
    # if there's no adapter clip: qual -> vector
    # else: qual -> qual, adapter -> vector
    if not data['clip_adapter_left']:
        data['clip_adapter_left'], data['clip_qual_left'] = data['clip_qual_left'], data['clip_adapter_left']
    if not data['clip_adapter_right']:
        data['clip_adapter_right'], data['clip_qual_right'] = data['clip_qual_right'], data['clip_adapter_right']

    # see whether we have to override the minimum left clips
    if config['min_leftclip'] > 0:
        if 0 < data['clip_adapter_left'] < config['min_leftclip']:
            data['clip_adapter_left'] = config['min_leftclip']
        if 0 < data['clip_qual_left'] < config['min_leftclip']:
            data['clip_qual_left'] = config['min_leftclip']

    # for handling the -c (clip) option gently, we already clip here
    # and set all clip points to the sequence end points
    if config['clip']:
        data['bases'], data['quality_scores'] = clip_read(data)

        data['number_of_bases'] = len(data['bases'])
        data['clip_qual_right'] = data['number_of_bases']
        data['clip_adapter_right'] = data['number_of_bases']
        data['clip_qual_left'] = 0
        data['clip_adapter_left'] = 0

    return data['read_header_length'] + bytes_read, data
---|
| 241 | |
---|
| 242 | |
---|
def sequences(fileh, header):
    '''Generator that yields the data dict of every read in the sff file.

    Walks the file from the end of the header, skipping over the index
    section if it is embedded between reads, and stops after
    header['number_of_reads'] reads.
    '''
    fposition = header['header_length']  # position in the file
    reads_read = 0
    while True:
        if fposition == header['index_offset']:
            # we have to skip the index section
            # bug fix: this used to read the undefined local name
            # "index_length" (a NameError whenever the index was hit)
            fposition += header['index_length']
            continue
        else:
            bytes_read, seq_data = read_sequence(header=header, fileh=fileh,
                                                 fposition=fposition)
            yield seq_data
            fposition += bytes_read
            reads_read += 1
            if reads_read >= header['number_of_reads']:
                break
---|
| 261 | |
---|
| 262 | |
---|
def remove_last_xmltag_in_file(fname, tag=None):
    '''Given an xml file name and a tag, it removes the last tag of the
    file if it matches the given tag. Tag removal is performed via file
    truncation.

    If the given tag is not the last in the file, a RuntimeError will be
    raised.

    The resulting xml file will be not xml valid. This function is a hack
    that allows to append records to xml files in a quick and dirty way.
    '''

    # NOTE(review): opened in text mode; the end-relative seeks below rely
    # on python 2 file semantics (py3 text files reject nonzero
    # end-relative seeks).
    fh = open(fname, 'r+')
    #we have to read from the end to the start of the file and keep the
    #string enclosed by </ >
    i = -1
    last_tag = [] #the chars that form the last tag
    start_offset = None #in which byte does the last tag starts?
    end_offset = None #in which byte does the last tag ends?
    while True:
        # i is a negative offset from the end of the file (whence=2)
        fh.seek(i, 2)
        char = fh.read(1)
        if not char.isspace():
            last_tag.append(char)
        if char == '>':
            end_offset = i
        if char == '<':
            start_offset = i
            break
        i -= 1

    #we have read the last tag backwards
    last_tag = ''.join(last_tag[::-1])
    #we remove the </ and >
    last_tag = last_tag.rstrip('>').lstrip('</')

    #we check that we're removing the asked tag
    if tag is not None and tag != last_tag:
        raise RuntimeError("The given xml tag wasn't the last one in the file")

    # while we are at it: also remove all white spaces in that line :-)
    i -= 1
    while True:
        fh.seek(i, 2)
        char = fh.read(1)
        if not char == ' ' and not char == '\t':
            break;
        if fh.tell() == 1:
            break;
        i -= 1

    # the file pointer now sits just before the closing tag (plus its
    # leading whitespace); truncating here drops the tag from the file
    fh.truncate();

    fh.close()
    return last_tag
---|
| 318 | |
---|
| 319 | |
---|
def create_basic_xml_info(readname, fname):
    '''Format a number of read-specific infos into XML.

    Currently formatted: the read name and the extra tags configured on
    the command line (global config['xml_info']), looked up first by the
    sff file name and then under the fake_sff_name fallback key.
    '''
    parts = [' <trace>\n', ' <trace_name>', readname, '</trace_name>\n']

    # do we have extra info configured, and for this file?
    extra = None
    if config['xml_info']:
        xml_info = config['xml_info']
        if fname in xml_info:
            extra = xml_info[fname]
        else:
            # fall back to the entry given without a file name, if any
            extra = xml_info.get(fake_sff_name)

    # append whatever extra info we found
    if extra:
        for key in extra:
            parts.append(' <' + key + '>' + extra[key] +
                         '</' + key + '>\n')

    return ''.join(parts)
---|
| 349 | |
---|
| 350 | |
---|
def create_clip_xml_info(readlen, adapl, adapr, quall, qualr):
    '''Format the clip values of a read into XML.

    Corrects "wrong" values that might have resulted from simplified
    calculations earlier in the conversion (especially during the
    splitting of paired-end reads): right clips at or past the read end
    mean "no clip" and are dropped, and negative values (possible after
    split_paired_end() arithmetic) are clamped to 0. Zero values produce
    no tag at all.
    '''
    # right borders >= read length don't need to be printed
    if adapr >= readlen:
        adapr = 0
    if qualr >= readlen:
        qualr = 0

    # clamp negatives to 0 in one central place instead of scattering
    # if-clauses over the different calculations upstream
    adapl = max(adapl, 0)
    adapr = max(adapr, 0)
    quall = max(quall, 0)
    qualr = max(qualr, 0)

    parts = []
    for value, tagname in ((quall, 'clip_quality_left'),
                           (qualr, 'clip_quality_right'),
                           (adapl, 'clip_vector_left'),
                           (adapr, 'clip_vector_right')):
        if value:
            parts.append(' <%s>%s</%s>\n' % (tagname, value, tagname))
    return ''.join(parts)
---|
| 400 | |
---|
| 401 | |
---|
def create_xml_for_unpaired_read(data, fname):
    '''Return the XML ancillary data block for one unpaired read as a str.'''
    pieces = [create_basic_xml_info(data['name'], fname)]
    # clip points go into the XML only when we do not hard clip
    if not config['clip']:
        pieces.append(create_clip_xml_info(
            data['number_of_bases'],
            data['clip_adapter_left'], data['clip_adapter_right'],
            data['clip_qual_left'], data['clip_qual_right']))
    pieces.append(' </trace>\n')
    return ''.join(pieces)
---|
| 411 | |
---|
| 412 | |
---|
def format_as_fasta(name, seq, qual):
    '''Return (sequence_record, quality_record) as FASTA-formatted strings.'''
    header = '>%s\n' % name
    seqstring = header + seq + '\n'
    qualstring = header + ' '.join(str(q) for q in qual) + '\n'
    return seqstring, qualstring
---|
| 419 | |
---|
def format_as_fastq(name, seq, qual):
    '''Return one FASTQ record (Sanger +33 quality encoding) as a string.'''
    encoded_qual = ''.join(chr(q + 33) for q in qual)
    return '@%s\n%s\n+\n%s\n' % (name, seq, encoded_qual)
---|
| 425 | |
---|
| 426 | |
---|
def get_read_data(data):
    '''Return (seq, qual) for one read.

    The sequence is mixed-case (via sequence_case) when the global
    config['mix_case'] is set, otherwise the raw bases.
    '''
    qual = data['quality_scores']
    if config['mix_case']:
        return sequence_case(data), qual
    return data['bases'], qual
---|
| 439 | |
---|
def extract_read_info(data, fname):
    '''Given the data for one read, return 3 strs: the fasta seq, the
    fasta qual and the xml ancillary data.

    (Cleanup: removed the block of dead, commented-out formatting code
    that duplicated format_as_fasta.)
    '''
    seq, qual = get_read_data(data)
    seqstring, qualstring = format_as_fasta(data['name'], seq, qual)
    xmlstring = create_xml_for_unpaired_read(data, fname)
    return seqstring, qualstring, xmlstring
---|
| 455 | |
---|
def write_sequence(name, seq, qual, seq_fh, qual_fh):
    '''Write one sequence to the output filehandles.

    When qual_fh is None the record is written as FASTQ to seq_fh alone;
    otherwise as FASTA to seq_fh plus FASTA qual to qual_fh.
    Zero-length sequences are silently skipped.
    '''
    if not seq:
        return
    if qual_fh is None:
        seq_fh.write(format_as_fastq(name, seq, qual))
        return
    seqstring, qualstring = format_as_fasta(name, seq, qual)
    seq_fh.write(seqstring)
    qual_fh.write(qualstring)
---|
| 470 | |
---|
def write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh):
    '''Write one unpaired read into the FASTA/FASTA-qual (or FASTQ)
    filehandles plus its ancillary data into the XML filehandle.

    Zero-length sequences are silently skipped.
    '''
    seq, qual = get_read_data(data)
    if not seq:
        return

    write_sequence(data['name'], seq, qual, seq_fh, qual_fh)

    anci = create_xml_for_unpaired_read(data, sff_fh.name)
    if anci is not None:
        xml_fh.write(anci)
---|
| 485 | |
---|
| 486 | |
---|
# Complement lookup for IUPAC nucleotide codes (both cases) plus '*'.
# Hoisted to module level so the table is built once, not on every call.
_COMPLEMENTS = {
    'a': 't', 'c': 'g', 'g': 'c', 't': 'a', 'u': 't',
    'm': 'k', 'r': 'y', 'w': 'w', 's': 's', 'y': 'r', 'k': 'm',
    'v': 'b', 'h': 'd', 'd': 'h', 'b': 'v',
    'x': 'x', 'n': 'n',
    'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'U': 'T',
    'M': 'K', 'R': 'Y', 'W': 'W', 'S': 'S', 'Y': 'R', 'K': 'M',
    'V': 'B', 'H': 'D', 'D': 'H', 'B': 'V',
    'X': 'X', 'N': 'N',
    '*': '*',
}


def reverse_complement(seq):
    '''Return the reverse complement of a DNA sequence as a string.

    Raises KeyError if seq contains a character that has no defined
    complement (same behaviour as before, just with a shared table).
    '''
    complseq = ''.join([_COMPLEMENTS[base] for base in seq])
    # python hack to reverse a list/string/etc
    return complseq[::-1]
---|
| 532 | |
---|
| 533 | |
---|
def mask_sequence(seq, maskchar, fpos, tpos):
    '''Return seq with the positions [fpos, tpos) replaced by maskchar.

    Out-of-range bounds are clamped to the sequence; maskchar must be a
    single character.
    '''
    if len(maskchar) > 1:
        raise RuntimeError("Internal error: more than one character given to mask_sequence")
    fpos = max(fpos, 0)
    tpos = min(tpos, len(seq))
    return seq[:fpos] + maskchar * (tpos - fpos) + seq[tpos:]
---|
| 549 | |
---|
| 550 | |
---|
def fragment_sequences(sequence, qualities, splitchar):
    '''Split a sequence (and its parallel quality list) on runs of splitchar.

    Works like split() on strings, except it operates on the sequence and
    the corresponding quality values at the same time. Returns a list of
    (fragment_sequence, fragment_qualities) tuples; runs of splitchar and
    empty fragments are dropped.

    Raises RuntimeError when sequence and qualities differ in length.
    (Fix: the old code dumped both values to stdout with a bare py2
    print before raising; the exception alone is the right channel.)
    '''
    if len(sequence) != len(qualities):
        raise RuntimeError("Internal error: length of sequence and qualities don't match???")

    fragments = []
    actseq = []
    actqual = []
    for char, qual in zip(sequence, qualities):
        if char != splitchar:
            # inside a fragment: keep accumulating
            actseq.append(char)
            actqual.append(qual)
        elif actseq:
            # hit a splitchar with a non-empty fragment: flush it
            fragments.append((''.join(actseq), actqual))
            actseq = []
            actqual = []

    # flush a trailing fragment, if any
    if actseq:
        fragments.append((''.join(actseq), actqual))

    return fragments
---|
| 594 | |
---|
| 595 | |
---|
def calc_subseq_boundaries(maskedseq, maskchar):
    '''Return the [start, stop) boundaries of alternating masked and
    unmasked stretches of maskedseq.

    E.g.:
    ........xxxxxxxx..........xxxxxxxxxxxxxxxxxxxxx.........
    to
    (0,8),(8,16),(16,26),(26,47),(47,56)
    '''
    bounds = []
    if not maskedseq:
        return bounds

    inmask = maskedseq[0] == maskchar
    start = 0
    for pos, char in enumerate(maskedseq):
        # a transition happens whenever the masked-state of the current
        # character differs from the state of the running stretch
        if inmask != (char == maskchar):
            bounds.append([start, pos])
            start = pos
            inmask = not inmask

    # close the final stretch
    bounds.append([start, len(maskedseq)])

    return bounds
---|
| 625 | |
---|
| 626 | |
---|
def correct_for_smallhits(maskedseq, maskchar, linkername):
    '''If partial linker hits were found, take a preventive measure: grow
    the masked areas by 20 bases in each direction.

    Returns either the unchanged maskedseq or a new sequence with more
    characters masked. Reads the global linkerlengths dict to know how
    long a full hit for linkername would be.

    (Cleanup: removed the dead CEBUG debug machinery, which was py2-only
    print statements behind a constant 0 flag; integer divisions now use
    // so the values stay ints under py2 and py3 alike.)
    '''
    global linkerlengths

    if len(maskedseq) == 0:
        return maskedseq

    growl = 40
    growl2 = growl // 2

    boundaries = calc_subseq_boundaries(maskedseq, maskchar)

    # a masked stretch clearly shorter than the linker counts as a
    # partial hit; a 10% discrepancy is allowed as a safety net for
    # slight sequencing errors in the linker itself. Stretches touching
    # either end of the read are not considered.
    foundpartial = False
    for left, right in boundaries:
        if left == 0 or right == len(maskedseq):
            continue
        if maskedseq[left] == maskchar:
            if right - left < linkerlengths[linkername] - linkerlengths[linkername] // 10:
                foundpartial = True

    if not foundpartial:
        return maskedseq

    # grow each masked stretch by growl2 into the neighbouring clear
    # sequence; clear stretches that would shrink to nothing are masked
    # completely
    newseq = ""
    for left, right in boundaries:
        if maskedseq[left] == maskchar:
            newseq += maskedseq[left:right]
        else:
            clearstart = 0
            if left > 0:
                clearstart = left + growl2
            clearstop = len(maskedseq)
            if right < len(maskedseq):
                clearstop = right - growl2

            if clearstop <= clearstart:
                newseq += maskchar * (right - left)
            else:
                if clearstart != left:
                    newseq += maskchar * growl2
                newseq += maskedseq[clearstart:clearstop]
                if clearstop != right:
                    newseq += maskchar * growl2

    return newseq
---|
| 696 | |
---|
| 697 | |
---|
| 698 | def split_paired_end(data, sff_fh, seq_fh, qual_fh, xml_fh): |
---|
| 699 | '''Splits a paired end read and writes sequences into FASTA, FASTA qual |
---|
| 700 | and XML traceinfo file. Returns the number of sequences created. |
---|
| 701 | |
---|
| 702 | As the linker sequence may be anywhere in the read, including the ends |
---|
| 703 | and overlapping with bad quality sequence, we need to perform some |
---|
| 704 | computing and eventually set new clip points. |
---|
| 705 | |
---|
| 706 | If the resulting split yields only one sequence (because linker |
---|
| 707 | was not present or overlapping with left or right clip), only one |
---|
| 708 | sequence will be written with ".fn" appended to the name. |
---|
| 709 | |
---|
| 710 | If the read can be split, two reads will be written. The side left of |
---|
| 711 | the linker will be named ".r" and will be written in reverse complement |
---|
| 712 | into the file to conform with what approximately all assemblers expect |
---|
| 713 | when reading paired-end data: reads in forward direction in file. The side |
---|
| 714 | right of the linker will be named ".f" |
---|
| 715 | |
---|
| 716 | If SSAHA found partial linker (linker sequences < length of linker), |
---|
| 717 | the sequences will get a "_pl" furthermore be cut back thoroughly. |
---|
| 718 | |
---|
| 719 | If SSAHA found multiple occurences of the linker, the names will get an |
---|
| 720 | additional "_mlc" within the name to show that there was "multiple |
---|
| 721 | linker contamination". |
---|
| 722 | |
---|
| 723 | For multiple or partial linker, the "good" parts of the reads are |
---|
| 724 | stored with a ".part<number>" name, additionally they will not get |
---|
| 725 | template information in the XML |
---|
| 726 | ''' |
---|
| 727 | |
---|
| 728 | global ssahapematches |
---|
| 729 | |
---|
| 730 | CEBUG = 0 |
---|
| 731 | |
---|
| 732 | maskchar = "#" |
---|
| 733 | |
---|
| 734 | if CEBUG : print "Need to split: " + data['name'] |
---|
| 735 | |
---|
| 736 | numseqs = 0; |
---|
| 737 | readname = data['name'] |
---|
| 738 | readlen = data['number_of_bases'] |
---|
| 739 | |
---|
| 740 | leftclip, rightclip = return_merged_clips(data) |
---|
| 741 | seq, qual = get_read_data(data) |
---|
| 742 | |
---|
| 743 | if CEBUG : print "Original read:\n",seq |
---|
| 744 | |
---|
| 745 | maskedseq = seq |
---|
| 746 | if leftclip > 0: |
---|
| 747 | maskedseq = mask_sequence(maskedseq, maskchar, 0, leftclip-1) |
---|
| 748 | if rightclip < len(maskedseq): |
---|
| 749 | maskedseq = mask_sequence(maskedseq, maskchar, rightclip, len(maskedseq)) |
---|
| 750 | |
---|
| 751 | leftclip, rightclip = return_merged_clips(data) |
---|
| 752 | readlen = data['number_of_bases'] |
---|
| 753 | |
---|
| 754 | if CEBUG : print "Readname:", readname |
---|
| 755 | if CEBUG : print "Readlen:", readlen |
---|
| 756 | if CEBUG : print "Num matches:", str(len(ssahapematches[data['name']])) |
---|
| 757 | if CEBUG : print "matches:", ssahapematches[data['name']] |
---|
| 758 | |
---|
| 759 | for match in ssahapematches[data['name']]: |
---|
| 760 | score = int(match[0]) |
---|
| 761 | linkername = match[2] |
---|
| 762 | leftreadhit = int(match[3]) |
---|
| 763 | rightreadhit = int(match[4]) |
---|
| 764 | #leftlinkerhit = int(match[5]) |
---|
| 765 | #rightlinkerhit = int(match[6]) |
---|
| 766 | #direction = match[7] |
---|
| 767 | #hitlen = int(match[8]) |
---|
| 768 | #hitidentity = float(match[9]) |
---|
| 769 | |
---|
| 770 | if CEBUG : print match |
---|
| 771 | if CEBUG : print "Match with score:", score |
---|
| 772 | if CEBUG : print "Read before:\n", maskedseq |
---|
| 773 | maskedseq = mask_sequence(maskedseq, maskchar, leftreadhit-1, rightreadhit) |
---|
| 774 | if CEBUG : print "Masked seq:\n", maskedseq |
---|
| 775 | |
---|
| 776 | correctedseq = correct_for_smallhits(maskedseq, maskchar, linkername) |
---|
| 777 | |
---|
| 778 | if len(maskedseq) != len(correctedseq): |
---|
| 779 | raise RuntimeError("Internal error: maskedseq != correctedseq") |
---|
| 780 | |
---|
| 781 | partialhits = False |
---|
| 782 | if correctedseq != maskedseq: |
---|
| 783 | if CEBUG : print "Partial hits in", readname |
---|
| 784 | if CEBUG : print "Original seq:\n", seq |
---|
| 785 | if CEBUG : print "Masked seq:\n", maskedseq |
---|
| 786 | if CEBUG : print "Corrected seq\n", correctedseq |
---|
| 787 | partialhits = True |
---|
| 788 | readname += "_pl" |
---|
| 789 | maskedseq = correctedseq |
---|
| 790 | |
---|
| 791 | fragments = fragment_sequences(maskedseq, qual, maskchar) |
---|
| 792 | |
---|
| 793 | if CEBUG : print "Fragments (", len(fragments), "): ", fragments |
---|
| 794 | |
---|
| 795 | mlcflag = False |
---|
| 796 | #if len(ssahapematches[data['name']]) > 1: |
---|
| 797 | # #print "Multi linker contamination" |
---|
| 798 | # mlcflag = True |
---|
| 799 | # readname += "_mlc" |
---|
| 800 | |
---|
| 801 | if len(fragments) > 2: |
---|
| 802 | if CEBUG : print "Multi linker contamination" |
---|
| 803 | mlcflag = True |
---|
| 804 | readname += "_mlc" |
---|
| 805 | |
---|
| 806 | |
---|
| 807 | #print fragments |
---|
| 808 | if mlcflag or partialhits: |
---|
| 809 | fragcounter = 1 |
---|
| 810 | readname += ".part" |
---|
| 811 | for frag in fragments: |
---|
| 812 | actseq = frag[0] |
---|
| 813 | if len(actseq) >= 20: |
---|
| 814 | actqual = frag[1] |
---|
| 815 | oname = readname + str(fragcounter) |
---|
| 816 | #seq_fh.write(">"+oname+"\n") |
---|
| 817 | #seq_fh.write(actseq+"\n") |
---|
| 818 | #qual_fh.write(">"+oname+"\n") |
---|
| 819 | #qual_fh.write(' '.join((str(q) for q in actqual))) |
---|
| 820 | #qual_fh.write("\n") |
---|
| 821 | write_sequence(oname,actseq,actqual,seq_fh,qual_fh) |
---|
| 822 | to_print = [create_basic_xml_info(oname,sff_fh.name)] |
---|
| 823 | # No clipping in XML ... the multiple and partial fragments |
---|
| 824 | # are clipped "hard" |
---|
| 825 | # No template ID and trace_end: we don't know the |
---|
| 826 | # orientation of the frahments. Even if it were |
---|
| 827 | # only two, the fact we had multiple linkers |
---|
| 828 | # says something went wrong, so simply do not |
---|
| 829 | # write any paired-end information for all these fragments |
---|
| 830 | to_print.append(' </trace>\n') |
---|
| 831 | xml_fh.write(''.join(to_print)) |
---|
| 832 | numseqs += 1 |
---|
| 833 | fragcounter += 1 |
---|
| 834 | else: |
---|
| 835 | if len(fragments) >2: |
---|
| 836 | raise RuntimeError("Unexpected: more than two fragments detected in " + readname + ". please contact the authors.") |
---|
| 837 | # nothing will happen for 0 fragments |
---|
| 838 | if len(fragments) == 1: |
---|
| 839 | #print "Tada1" |
---|
| 840 | boundaries = calc_subseq_boundaries(maskedseq,maskchar) |
---|
| 841 | if len(boundaries) < 1 or len(boundaries) >3: |
---|
| 842 | raise RuntimeError("Unexpected case: ", str(len(boundaries)), "boundaries for 1 fragment of ", readname) |
---|
| 843 | if len(boundaries) == 3: |
---|
| 844 | # case: mask char on both sides of sequence |
---|
| 845 | #print "bounds3" |
---|
| 846 | data['clip_adapter_left']=1+boundaries[0][1] |
---|
| 847 | data['clip_adapter_right']=boundaries[2][0] |
---|
| 848 | elif len(boundaries) == 2: |
---|
| 849 | # case: mask char left or right of sequence |
---|
| 850 | #print "bounds2", |
---|
| 851 | if maskedseq[0] == maskchar : |
---|
| 852 | # case: mask char left |
---|
| 853 | #print "left" |
---|
| 854 | data['clip_adapter_left']=1+boundaries[0][1] |
---|
| 855 | else: |
---|
| 856 | # case: mask char right |
---|
| 857 | #print "right" |
---|
| 858 | data['clip_adapter_right']=boundaries[1][0] |
---|
| 859 | data['name'] = data['name'] + ".fn" |
---|
| 860 | write_unpaired_read(data, sff_fh, seq_fh, qual_fh, xml_fh) |
---|
| 861 | numseqs = 1 |
---|
| 862 | elif len(fragments) == 2: |
---|
| 863 | #print "Tada2" |
---|
| 864 | oname = readname + ".r" |
---|
| 865 | seq, qual = get_read_data(data) |
---|
| 866 | |
---|
| 867 | startsearch = False |
---|
| 868 | for spos in range(len(maskedseq)): |
---|
| 869 | if maskedseq[spos] != maskchar: |
---|
| 870 | startsearch = True; |
---|
| 871 | else: |
---|
| 872 | if startsearch: |
---|
| 873 | break |
---|
| 874 | |
---|
| 875 | #print "\nspos: ", spos |
---|
| 876 | lseq=seq[:spos] |
---|
| 877 | #print "lseq:", lseq |
---|
| 878 | actseq = reverse_complement(lseq) |
---|
| 879 | lreadlen = len(actseq) |
---|
| 880 | lqual = qual[:spos]; |
---|
| 881 | # python hack to reverse a list/string/etc |
---|
| 882 | lqual = lqual[::-1]; |
---|
| 883 | |
---|
| 884 | #seq_fh.write(">"+oname+"\n") |
---|
| 885 | #seq_fh.write(actseq+"\n") |
---|
| 886 | #qual_fh.write(">"+oname+"\n") |
---|
| 887 | #qual_fh.write(' '.join((str(q) for q in lqual))) |
---|
| 888 | #qual_fh.write("\n") |
---|
| 889 | |
---|
| 890 | write_sequence(oname,actseq,lqual,seq_fh,qual_fh) |
---|
| 891 | |
---|
| 892 | to_print = [create_basic_xml_info(oname,sff_fh.name)] |
---|
| 893 | to_print.append(create_clip_xml_info(lreadlen, 0, lreadlen+1-data['clip_adapter_left'], 0, lreadlen+1-data['clip_qual_left'])); |
---|
| 894 | to_print.append(' <template_id>') |
---|
| 895 | to_print.append(readname) |
---|
| 896 | to_print.append('</template_id>\n') |
---|
| 897 | to_print.append(' <trace_end>r</trace_end>\n') |
---|
| 898 | to_print.append(' </trace>\n') |
---|
| 899 | xml_fh.write(''.join(to_print)) |
---|
| 900 | |
---|
| 901 | oname = readname + ".f" |
---|
| 902 | startsearch = False |
---|
| 903 | for spos in range(len(maskedseq)-1,-1,-1): |
---|
| 904 | if maskedseq[spos] != maskchar: |
---|
| 905 | startsearch = True; |
---|
| 906 | else: |
---|
| 907 | if startsearch: |
---|
| 908 | break |
---|
| 909 | |
---|
| 910 | actseq = seq[spos+1:] |
---|
| 911 | actqual = qual[spos+1:]; |
---|
| 912 | |
---|
| 913 | #print "\nspos: ", spos |
---|
| 914 | #print "rseq:", actseq |
---|
| 915 | |
---|
| 916 | #seq_fh.write(">"+oname+"\n") |
---|
| 917 | #seq_fh.write(actseq+"\n") |
---|
| 918 | #qual_fh.write(">"+oname+"\n") |
---|
| 919 | #qual_fh.write(' '.join((str(q) for q in actqual))) |
---|
| 920 | #qual_fh.write("\n") |
---|
| 921 | write_sequence(oname,actseq,actqual,seq_fh,qual_fh) |
---|
| 922 | |
---|
| 923 | rreadlen = len(actseq) |
---|
| 924 | to_print = [create_basic_xml_info(oname,sff_fh.name)] |
---|
| 925 | to_print.append(create_clip_xml_info(rreadlen, 0, rreadlen-(readlen-data['clip_adapter_right']), 0, rreadlen-(readlen-data['clip_qual_right']))); |
---|
| 926 | to_print.append(' <template_id>') |
---|
| 927 | to_print.append(readname) |
---|
| 928 | to_print.append('</template_id>\n') |
---|
| 929 | to_print.append(' <trace_end>f</trace_end>\n') |
---|
| 930 | to_print.append(' </trace>\n') |
---|
| 931 | xml_fh.write(''.join(to_print)) |
---|
| 932 | numseqs = 2 |
---|
| 933 | |
---|
| 934 | return numseqs |
---|
| 935 | |
---|
| 936 | |
---|
| 937 | |
---|
def extract_reads_from_sff(config, sff_files):
    '''Given the configuration and the list of sff_files it writes the seqs,
    qualities and ancillary data into the output file(s).

    config: dict with the command line options (see read_config()).
    sff_files: list of SFF file names to process.

    If file for paired-end linker was given, first extracts all sequences
    of an SFF and searches these against the linker(s) with SSAHA2 to
    create needed information to split reads.

    Raises RuntimeError when no SFF file was given or one of them is empty.
    '''

    # filled by read_ssaha_data(): read name -> list of linker matches
    global ssahapematches


    if len(sff_files) == 0 :
        raise RuntimeError("No SFF file given?")

    #we go through all input files
    # fail early: make sure every input file is non-empty and readable
    # before any output file is created or truncated
    for sff_file in sff_files:
        if not os.path.getsize(sff_file):
            raise RuntimeError('Empty file? : ' + sff_file)
        fh = open(sff_file, 'r')
        fh.close()

    openmode = 'w'
    if config['append']:
        openmode = 'a'

    seq_fh = open(config['seq_fname'], openmode)
    xml_fh = open(config['xml_fname'], openmode)
    if config['want_fastq']:
        # FASTQ embeds the qualities, so no separate quality file is
        # written; remove a stale one if present (a missing file is fine)
        qual_fh = None
        try:
            os.remove(config['qual_fname'])
        except :
            python_formattingwithoutbracesisdumb_dummy = 1
    else:
        qual_fh = open(config['qual_fname'], openmode)

    if not config['append']:
        xml_fh.write('<?xml version="1.0"?>\n<trace_volume>\n')
    else:
        # appending: re-open the existing XML by dropping its closing tag
        remove_last_xmltag_in_file(config['xml_fname'], "trace_volume")

    #we go through all input files
    for sff_file in sff_files:
        #print "Working on '" + sff_file + "':"
        ssahapematches.clear()

        # first 50 clipped bases of every read, for the sanity check below
        seqcheckstore = ([])

        debug = 0

        if not debug and config['pelinker_fname']:
            # pass 1: dump all reads as FASTA and run SSAHA2 against the
            # linker file to locate paired-end linker hits per read
            #print "Creating temporary sequences from reads in '" + sff_file + "' ... ",
            sys.stdout.flush()

            if 0 :
                # for debugging
                pid = os.getpid()
                tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta'
                tmpfasta_fh = open(tmpfasta_fname, 'w')
            else:
                # NOTE(review): relies on the 'tempfile' module being
                # imported at file level -- not visible in this chunk,
                # confirm against the import block
                tmpfasta_fh = tempfile.NamedTemporaryFile(prefix = 'sffeseqs_',
                                                          suffix = '.fasta')

            sff_fh = open(sff_file, 'rb')
            header_data = read_header(fileh=sff_fh)
            for seq_data in sequences(fileh=sff_fh, header=header_data):
                seq,qual = get_read_data(seq_data)
                seqstring, qualstring = format_as_fasta(seq_data['name'],seq,qual)
                tmpfasta_fh.write(seqstring)
                #seq, qual, anci = extract_read_info(seq_data, sff_fh.name)
                #tmpfasta_fh.write(seq)
            #print "done."
            tmpfasta_fh.seek(0)

            if 0 :
                # for debugging
                tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2'
                tmpssaha_fh = open(tmpssaha_fname, 'w+')
            else:
                tmpssaha_fh = tempfile.NamedTemporaryFile(prefix = 'sffealig_',
                                                          suffix = '.ssaha2')

            launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh)
            tmpfasta_fh.close()

            # parse the SSAHA2 hits into the global ssahapematches dict
            tmpssaha_fh.seek(0)
            read_ssaha_data(tmpssaha_fh)
            tmpssaha_fh.close()

        if debug:
            tmpssaha_fh = open("sffe.tmp.10634.ssaha2", 'r')
            read_ssaha_data(tmpssaha_fh)

        # pass 2: convert all reads, splitting those that had linker hits
        #print "Converting '" + sff_file + "' ... ",
        sys.stdout.flush()
        sff_fh = open(sff_file, 'rb')
        #read_header(infile)
        header_data = read_header(fileh=sff_fh)

        #now convert all reads
        nseqs_sff = 0
        nseqs_out = 0
        for seq_data in sequences(fileh=sff_fh, header=header_data):
            nseqs_sff += 1

            seq, qual = clip_read(seq_data)
            seqcheckstore.append(seq[0:50])

            #if nseqs_sff >1000:
            #    check_for_dubious_startseq(seqcheckstore,sff_file,seq_data)
            #    sys.exit()

            if ssahapematches.has_key(seq_data['name']):
                # this read contains linker hits: split into paired ends
                #print "Paired end:",seq_data['name']
                nseqs_out += split_paired_end(seq_data, sff_fh, seq_fh, qual_fh, xml_fh)
            else:
                #print "Normal:",seq_data['name']
                if config['pelinker_fname']:
                    # ".fn" marks reads in which no linker was found
                    seq_data['name'] = seq_data['name'] + ".fn"
                write_unpaired_read(seq_data, sff_fh, seq_fh, qual_fh, xml_fh)
                nseqs_out += 1
        #print "done."
        #print 'Converted', str(nseqs_sff), 'reads into', str(nseqs_out), 'sequences.'
        sff_fh.close()

        # warn loudly if suspiciously many reads share the same start bases
        check_for_dubious_startseq(seqcheckstore,sff_file,seq_data)
        seqcheckstore = ([])

    xml_fh.write('</trace_volume>\n')

    xml_fh.close()
    seq_fh.close()
    if qual_fh is not None:
        qual_fh.close()

    return
---|
| 1075 | |
---|
| 1076 | def check_for_dubious_startseq(seqcheckstore, sffname,seqdata): |
---|
| 1077 | |
---|
| 1078 | global stern_warning |
---|
| 1079 | |
---|
| 1080 | foundproblem = "" |
---|
| 1081 | for checklen in range(1,len(seqcheckstore[0])): |
---|
| 1082 | foundinloop = False |
---|
| 1083 | seqdict = {} |
---|
| 1084 | for seq in seqcheckstore: |
---|
| 1085 | shortseq = seq[0:checklen] |
---|
| 1086 | if shortseq in seqdict: |
---|
| 1087 | seqdict[shortseq] += 1 |
---|
| 1088 | else: |
---|
| 1089 | seqdict[shortseq] = 1 |
---|
| 1090 | |
---|
| 1091 | for shortseq, count in seqdict.items(): |
---|
| 1092 | if float(count)/len(seqcheckstore) >= 0.5: |
---|
| 1093 | foundinloop = True |
---|
| 1094 | stern_warning |
---|
| 1095 | foundproblem = "\n"+"*" * 80 |
---|
| 1096 | foundproblem += "\nWARNING: " |
---|
| 1097 | foundproblem += "weird sequences in file " + sffname + "\n\n" |
---|
| 1098 | foundproblem += "After applying left clips, " + str(count) + " sequences (=" |
---|
| 1099 | foundproblem += '%.0f'%(100.0*float(count)/len(seqcheckstore)) |
---|
| 1100 | foundproblem += "%) start with these bases:\n" + shortseq |
---|
| 1101 | foundproblem += "\n\nThis does not look sane.\n\n" |
---|
| 1102 | foundproblem += "Countermeasures you *probably* must take:\n" |
---|
| 1103 | foundproblem += "1) Make your sequence provider aware of that problem and ask whether this can be\n corrected in the SFF.\n" |
---|
| 1104 | foundproblem += "2) If you decide that this is not normal and your sequence provider does not\n react, use the --min_left_clip of sff_extract.\n" |
---|
| 1105 | left,right = return_merged_clips(seqdata) |
---|
| 1106 | foundproblem += " (Probably '--min_left_clip="+ str(left+len(shortseq))+"' but you should cross-check that)\n" |
---|
| 1107 | foundproblem += "*" * 80 + "\n" |
---|
| 1108 | if not foundinloop : |
---|
| 1109 | break |
---|
| 1110 | if len(foundproblem): |
---|
| 1111 | print foundproblem |
---|
| 1112 | |
---|
| 1113 | |
---|
def parse_extra_info(info):
    '''It parses the information that will go in the xml file.

    There are two formats accepted for the extra information:
        key1:value1, key2:value2
    or:
        file1.sff{key1:value1, key2:value2};file2.sff{key3:value3}
    '''
    if not info:
        return info
    data_for_files = {}
    # every ';'-separated chunk holds the extra info for one file
    for chunk in info.split(';'):
        # split the optional file name off the key:value list
        pieces = chunk.split('{')
        if len(pieces) == 1:
            # no explicit file name given: use the generic placeholder
            fname, pairs = fake_sff_name, pieces[0]
        else:
            fname, pairs = pieces[0], pieces[1]
        # drop the closing brace and collect the key:value pairs
        pairs = pairs.replace('}', '')
        file_data = {}
        for pair in pairs.split(','):
            key, value = pair.strip().split(':')
            file_data[key.strip()] = value.strip()
        data_for_files[fname] = file_data
    return data_for_files
---|
| 1145 | |
---|
| 1146 | |
---|
def return_merged_clips(data):
    '''It returns the left and right positions to clip.

    data: read data dict with the four SFF clip values and
          'number_of_bases'.

    The merged left clip is the larger of the adapter and quality left
    clips, the merged right clip the smaller of the two right clips --
    always ignoring values of zero, which mean "no clip stored" in SFF.
    When no clip is stored at all, the whole read (1, number_of_bases)
    is returned. Positions are 1-based and inclusive.
    '''
    # the helpers were renamed from max/min: shadowing the builtins made
    # the non-standard "ignore zeros" semantics easy to misread
    def _max_nonzero(a, b):
        '''Max of a and b, treating 0/None as "no value given".'''
        if not a and not b:
            return None
        if not a:
            return b
        if not b:
            return a
        return a if a >= b else b

    def _min_nonzero(a, b):
        '''Min of a and b, treating 0/None as "no value given".'''
        if not a and not b:
            return None
        if not a:
            return b
        if not b:
            return a
        return a if a <= b else b

    left = _max_nonzero(data['clip_adapter_left'], data['clip_qual_left'])
    right = _min_nonzero(data['clip_adapter_right'], data['clip_qual_right'])
    #maybe both clips where zero
    if left is None:
        left = 1
    if right is None:
        right = data['number_of_bases']
    return left, right
---|
| 1187 | |
---|
def sequence_case(data):
    '''Given the data for one read it returns the seq with mixed case.

    The regions to be clipped will be lower case and the rest upper case.
    '''
    left, right = return_merged_clips(data)
    bases = data['bases']
    # lower-case both clipped ends, keep the good middle part unchanged;
    # the clip positions are 1-based and inclusive
    head = bases[:left - 1].lower()
    middle = bases[left - 1:right]
    tail = bases[right:].lower()
    return head + middle + tail
---|
| 1197 | |
---|
def clip_read(data):
    '''Given the data for one read it returns clipped seq and qual.

    data: read data dict with 'bases', 'quality_scores' and the clip values.

    The clip positions come from return_merged_clips() and are 1-based
    and inclusive; they are converted into a python slice here.
    '''
    # the redundant second fetch of data['quality_scores'] was removed
    left, right = return_merged_clips(data)
    seq = data['bases']
    qual = data['quality_scores']
    new_seq = seq[left-1:right]
    new_qual = qual[left-1:right]

    return new_seq, new_qual
---|
| 1209 | |
---|
| 1210 | |
---|
| 1211 | |
---|
def tests_for_ssaha(linker_fname):
    '''Tests whether SSAHA2 can be successfully called.

    linker_fname: unused here; kept for call-site compatibility.

    Raises RuntimeError when launching "ssaha2 -v" fails (typically an
    OSError because the binary is not installed or not in the PATH).
    '''

    try:
        print "Testing whether SSAHA2 is installed and can be launched ... ",
        sys.stdout.flush()
        # discard ssaha2's version output; we only care whether it starts.
        # NOTE(review): retcode is never checked -- only a failure to
        # launch (exception) is detected, not a non-zero exit status
        fh = open('/dev/null', 'w')
        retcode = subprocess.call(["ssaha2", "-v"], stdout = fh)
        fh.close()
        print "ok."
    except :
        print "nope? Uh oh ...\n\n"
        raise RuntimeError('Could not launch ssaha2. Have you installed it? Is it in your path?')
---|
| 1225 | |
---|
| 1226 | |
---|
def load_linker_sequences(linker_fname):
    '''Loads all linker sequences into memory, storing only the length
    of each linker.

    linker_fname: FASTA file with the paired-end linker sequences.

    Fills the global linkerlengths dict (linker name -> sequence length).
    Raises RuntimeError on an empty file, a file without sequences, or
    duplicated linker names.
    '''

    global linkerlengths

    if not os.path.getsize(linker_fname):
        raise RuntimeError("File empty? '" + linker_fname + "'")
    fh = open(linker_fname, 'r')
    try:
        linkerseqs = read_fasta(fh)
        if len(linkerseqs) == 0:
            raise RuntimeError(linker_fname + ": no sequence found?")
        for i in linkerseqs:
            # 'in' instead of the deprecated dict.has_key()
            if i.name in linkerlengths:
                raise RuntimeError(linker_fname + ": sequence '" + i.name + "' present multiple times. Aborting.")
            linkerlengths[i.name] = len(i.sequence)
    finally:
        # close the file even when one of the checks above raises;
        # previously the handle leaked on error
        fh.close()
---|
| 1244 | |
---|
| 1245 | |
---|
def launch_ssaha(linker_fname, query_fname, output_fh):
    '''Launches SSAHA2 on the linker and query file, string SSAHA2 output
    into the output filehandle.

    linker_fname: FASTA file with the linker sequences (SSAHA2 subject).
    query_fname: FASTA file with the read sequences to scan.
    output_fh: open file handle that receives SSAHA2's stdout.

    Raises RuntimeError when SSAHA2 cannot be launched or exits with a
    non-zero status.
    '''

    try:
        print "Searching linker sequences with SSAHA2 (this may take a while) ... ",
        sys.stdout.flush()
        # small kmer/skip so even short linkers are found reliably
        retcode = subprocess.call(["ssaha2", "-output", "ssaha2", "-solexa", "-kmer", "4", "-skip", "1", linker_fname, query_fname], stdout = output_fh)
        if retcode:
            # non-zero exit status: converted into the RuntimeError below
            raise RuntimeError('Ups.')
        else:
            print "ok."
    except:
        print "\n"
        raise RuntimeError('An error occured during the SSAHA2 execution, aborting.')
---|
| 1261 | |
---|
def read_ssaha_data(ssahadata_fh):
    '''Given file handle, reads file generated with SSAHA2 (with default
    output format) and stores all matches as list ssahapematches
    (ssaha paired-end matches) dictionary.

    ssahadata_fh: open file handle with SSAHA2 "ssaha2" format output.

    Only 'ALIGNMENT' lines are parsed; each one is expected to have 12
    whitespace-separated fields. Matches are grouped by read name
    (field 3) in the global ssahapematches dict.
    '''

    global ssahapematches

    print "Parsing SSAHA2 result file ... ",
    sys.stdout.flush()

    for line in ssahadata_fh:
        if line.startswith('ALIGNMENT'):
            ml = line.split()
            if len(ml) != 12 :
                print "\n", line,
                raise RuntimeError('Expected 12 elements in the SSAHA2 line with ALIGMENT keyword, but found ' + str(len(ml)))
            if not ssahapematches.has_key(ml[2]) :
                ssahapematches[ml[2]] = ([])
            if ml[8] == 'F':
                #print line,

                # store everything except the first element (output
                #  format name (ALIGNMENT)) and the last element
                #  (length)
                ssahapematches[ml[2]].append(ml[1:-1])
            else:
                #print ml
                # reverse hit: swap read start/stop so the stored
                # coordinates always run in forward direction
                ml[4],ml[5] = ml[5],ml[4]
                #print ml
                ssahapematches[ml[2]].append(ml[1:-1])

    print "done."
---|
| 1294 | |
---|
| 1295 | |
---|
| 1296 | ########################################################################## |
---|
| 1297 | # |
---|
| 1298 | # BaCh: This block was shamelessly copied from |
---|
| 1299 | # http://python.genedrift.org/2007/07/04/reading-fasta-files-conclusion/ |
---|
| 1300 | # and then subsequently modified to read fasta correctly |
---|
| 1301 | # It's still not fool proof, but should be good enough |
---|
| 1302 | # |
---|
| 1303 | ########################################################################## |
---|
| 1304 | |
---|
class Fasta:
    '''A minimal container for one FASTA entry: a name and its sequence.'''
    def __init__(self, name, sequence):
        # name: the sequence identifier (text after '>' up to whitespace)
        self.name = name
        # sequence: the concatenated sequence lines, no whitespace
        self.sequence = sequence

    def __repr__(self):
        # added for debuggability; instances used to print as bare objects
        return 'Fasta(%r, %r)' % (self.name, self.sequence)
---|
| 1309 | |
---|
def read_fasta(file):
    '''Parses FASTA entries from an open file handle.

    Returns a list of Fasta objects; raises RuntimeError on a header
    without a name or on sequence data appearing before any header.
    '''
    entries = []
    current = Fasta('', '')
    lineno = 0
    for line in file:
        lineno += 1
        if not line.startswith(">"):
            # sequence data is only legal after a header line
            if len(current.name) == 0:
                raise RuntimeError(file.name + ': no sequence header at line ' + str(lineno) + '?')
            current.sequence += line.strip()
            continue
        # header line: flush the previous entry if it collected sequence
        if len(current.sequence):
            entries.append(current)
            current = Fasta('', '')
        # name == all characters until the first whitespace
        #  (split()[0]) but without the starting ">" ([1:])
        current.name = line.split()[0][1:]
        current.sequence = ''
        if len(current.name) == 0:
            raise RuntimeError(file.name + ': no name in line ' + str(lineno) + '?')

    # keep the last entry if it is complete
    if len(current.name) and len(current.sequence):
        entries.append(current)

    return entries
---|
| 1336 | ########################################################################## |
---|
| 1337 | |
---|
def version_string():
    '''Return the program name and version as one string.'''
    return "sff_extract " + __version__
---|
| 1340 | |
---|
def read_config():
    '''It reads the configuration options from the command line arguments and
    it returns a dict with them.

    Returns (config, args): config is a dict with one entry per option
    (also stored in the module-level global 'config'), args is the list
    of remaining positional arguments (the SFF files).

    Default output file names are derived from the last command line
    argument when it ends in '.sff', otherwise from the basename 'reads'.
    '''
    from optparse import OptionParser, OptionGroup
    usage = "usage: %prog [options] sff1 sff2 ..."
    desc = "Extract sequences from 454 SFF files into FASTA, FASTA quality"\
        " and XML traceinfo format. When a paired-end linker sequence"\
        " is given (-l), use SSAHA2 to scan the sequences for the linker,"\
        " then split the sequences, removing the linker."
    parser = OptionParser(usage = usage, version = version_string(), description = desc)
    parser.add_option('-a', '--append', action="store_true", dest='append',
            help='append output to existing files', default=False)
    parser.add_option('-i', '--xml_info', dest='xml_info',
            help='extra info to write in the xml file')
    parser.add_option("-l", "--linker_file", dest="pelinker_fname",
            help="FASTA file with paired-end linker sequences", metavar="FILE")

    # NOTE(review): this first group holds output-content options but its
    # title says "File name options", same as the second group below --
    # looks like a copy/paste slip in the group title
    group = OptionGroup(parser, "File name options","")
    group.add_option('-c', '--clip', action="store_true", dest='clip',
            help='clip (completely remove) ends with low qual and/or adaptor sequence', default=False)
    group.add_option('-u', '--upper_case', action="store_false", dest='mix_case',
            help='all bases in upper case, including clipped ends', default=True)
    group.add_option('', '--min_left_clip', dest='min_leftclip',
            metavar="INTEGER", type = "int",
            help='if the left clip coming from the SFF is smaller than this value, override it', default=0)
    group.add_option('-Q', '--fastq', action="store_true", dest='want_fastq',
            help='store as FASTQ file instead of FASTA + FASTA quality file', default=False)
    parser.add_option_group(group)

    group = OptionGroup(parser, "File name options","")
    group.add_option("-o", "--out_basename", dest="basename",
            help="base name for all output files")
    group.add_option("-s", "--seq_file", dest="seq_fname",
            help="output sequence file name", metavar="FILE")
    group.add_option("-q", "--qual_file", dest="qual_fname",
            help="output quality file name", metavar="FILE")
    group.add_option("-x", "--xml_file", dest="xml_fname",
            help="output ancillary xml file name", metavar="FILE")
    parser.add_option_group(group)

    #default fnames
    #is there an sff file?
    basename = 'reads'
    if sys.argv[-1][-4:].lower() == '.sff':
        basename = sys.argv[-1][:-4]
    def_seq_fname = basename + '.fasta'
    def_qual_fname = basename + '.fasta.qual'
    def_xml_fname = basename + '.xml'
    def_pelinker_fname = ''
    parser.set_defaults(seq_fname = def_seq_fname)
    parser.set_defaults(qual_fname = def_qual_fname)
    parser.set_defaults(xml_fname = def_xml_fname)
    parser.set_defaults(pelinker_fname = def_pelinker_fname)

    #we parse the cmd line
    (options, args) = parser.parse_args()

    #we put the result in a dict
    global config
    config = {}
    # copy every option attribute, skipping optparse's own methods and
    # private names
    for property in dir(options):
        if property[0] == '_' or property in ('ensure_value', 'read_file',
                'read_module'):
            continue
        config[property] = getattr(options, property)

    if config['basename'] is None:
        config['basename']=basename

    #if we have not set a file name with -s, -q or -x we set the basename
    #based file name
    if config['want_fastq']:
        # FASTQ mode: qualities go into the sequence file itself
        config['qual_fname'] = ''
        if config['seq_fname'] == def_seq_fname:
            config['seq_fname'] = config['basename'] + '.fastq'
    else:
        if config['seq_fname'] == def_seq_fname:
            config['seq_fname'] = config['basename'] + '.fasta'
        if config['qual_fname'] == def_qual_fname:
            config['qual_fname'] = config['basename'] + '.fasta.qual'

    if config['xml_fname'] == def_xml_fname:
        config['xml_fname'] = config['basename'] + '.xml'

    #we parse the extra info for the xml file
    config['xml_info'] = parse_extra_info(config['xml_info'])
    return config, args
---|
| 1428 | |
---|
| 1429 | |
---|
| 1430 | |
---|
| 1431 | ########################################################################## |
---|
| 1432 | |
---|
| 1433 | |
---|
| 1434 | def testsome(): |
---|
| 1435 | sys.exit() |
---|
| 1436 | return |
---|
| 1437 | |
---|
| 1438 | |
---|
def debug():
    '''Ad-hoc developer debugging routine.

    Uses hardcoded file names and a hardcoded PID from an earlier run;
    always exits the process before reaching extract_reads_from_sff().
    Not called from main() (the call there is commented out).
    '''
    try:
        dummy = 1
        #debug()
        #testsome()

        config, args = read_config()
        load_linker_sequences(config['pelinker_fname'])

        #pid = os.getpid()
        # hardcoded PID of an earlier run whose temp files are reused
        pid = 15603

        #tmpfasta_fname = 'sffe.tmp.'+ str(pid)+'.fasta'
        #tmpfasta_fh = open(tmpfasta_fname, 'w')
        tmpfasta_fname = 'FLVI58L05.fa'
        tmpfasta_fh = open(tmpfasta_fname, 'r')

        tmpssaha_fname = 'sffe.tmp.'+str(pid)+'.ssaha2'
        tmpssaha_fh = open(tmpssaha_fname, 'w')

        launch_ssaha(config['pelinker_fname'], tmpfasta_fh.name, tmpssaha_fh)

        # re-open the SSAHA2 result read-only and parse it
        tmpssaha_fh = open("sffe.tmp.15603.ssaha2", 'r')
        read_ssaha_data(tmpssaha_fh)

        sys.exit()

        # unreachable: kept for quickly re-enabling a full run while debugging
        extract_reads_from_sff(config, args)

    except (OSError, IOError, RuntimeError), errval:
        print errval
        sys.exit()

    sys.exit()
---|
| 1473 | |
---|
| 1474 | |
---|
def main():
    '''Entry point: parse options, optionally load linkers, extract reads.

    Returns 0 on success, 1 on error or when a stern warning about
    dubious read start sequences was raised during extraction.
    '''

    argv = sys.argv
    if len(argv) == 1:
        # no arguments at all: show the help text and quit
        sys.argv.append('-h')
        read_config()
        sys.exit()
    try:
        #debug();

        config, args = read_config()

        if config['pelinker_fname']:
            #tests_for_ssaha(config['pelinker_fname'])
            load_linker_sequences(config['pelinker_fname'])
        if len(args) == 0:
            raise RuntimeError("No SFF file given?")
        extract_reads_from_sff(config, args)
    except (OSError, IOError, RuntimeError), errval:
        print errval
        return 1

    # set by check_for_dubious_startseq() when reads look suspicious
    if stern_warning:
        return 1

    return 0
---|
| 1501 | |
---|
| 1502 | |
---|
| 1503 | |
---|
# script entry point: the process exit status is main()'s return value
if __name__ == "__main__":
    sys.exit(main())
---|