[2] | 1 | #!/usr/bin/env python |
---|
| 2 | """ |
---|
| 3 | convert SOLiD color-space data to nucleotide sequence |
---|
| 4 | example: T011213122200221123032111221021210131332222101 |
---|
| 5 | TTGTCATGAGAAAGACAGCCGACACTCAAGTCAACGTATCTCTGGT |
---|
| 6 | """ |
---|
| 7 | |
---|
| 8 | import sys, os |
---|
| 9 | |
---|
def stop_err(msg):
    """Report a fatal error on stderr and terminate the program.

    Writes ``msg`` followed by a newline to standard error, then raises
    ``SystemExit`` with status 1 so the calling shell or workflow engine
    sees the run as failed.
    """
    sys.stderr.write(msg)
    sys.stderr.write('\n')
    # A bare sys.exit() would report success (status 0) despite the error.
    sys.exit(1)
---|
| 15 | |
---|
def color2base(color_seq):
    """Decode one color-encoded read into nucleotides.

    ``color_seq`` is a leading nucleotide (A, C, G or T, any case) followed
    by color digits 0-3.  Each digit selects the base that follows the
    previously decoded base, so decoding walks the read left to right.

    Returns a ``(prefix, nuc_seq)`` tuple: the upper-cased leading
    nucleotide and the decoded bases.  The prefix itself is not included in
    ``nuc_seq``.  An empty ``color_seq`` yields ``('', '')``.

    Calls ``stop_err`` (which terminates the program) when the leading
    character is not one of the four nucleotides or when a color is not one
    of 0, 1, 2, 3.
    """
    # Nothing to decode; avoid an IndexError on color_seq[0].
    if not color_seq:
        return '', ''

    bases = ('A', 'C', 'G', 'T')
    base_index = dict((base, pos) for pos, base in enumerate(bases))

    # transitions[color][i] is the base that follows bases[i] for that color.
    transitions = {
        '0': ('A', 'C', 'G', 'T'),
        '1': ('C', 'A', 'T', 'G'),
        '2': ('G', 'T', 'A', 'C'),
        '3': ('T', 'G', 'C', 'A'),
    }

    seq_prefix = color_seq[0].upper()
    if seq_prefix not in base_index:
        stop_err('The leading nucleotide is invalid. Must be one of the four nucleotides: A, T, C, G.\nThe file contains a %s' % seq_prefix)

    decoded = []
    current = seq_prefix
    for code in color_seq[1:]:
        if code not in transitions:
            stop_err('Expect digits (0, 1, 2, 3) in the color-coding data. File contains numbers other than the set.\nThe file contains a %s' % code)

        # The newly decoded base becomes the context for the next color.
        current = transitions[code][base_index[current]]
        decoded.append(current)

    # Join once instead of growing a string one character at a time.
    return seq_prefix, ''.join(decoded)
---|
| 45 | |
---|
def _write_record(outfile, title, color_seq, keep_prefix):
    """Decode one record with ``color2base`` and write its title and sequence."""
    prefix, nuc_seq = color2base(color_seq)

    if keep_prefix == 'yes':
        nuc_seq = prefix + nuc_seq

    outfile.write(title + '\n')
    outfile.write(nuc_seq + '\n')


def __main__():
    """Convert a color-encoded FASTA-style file to nucleotide sequences.

    Command-line arguments:
        sys.argv[1] -- input file; lines starting with ">" are record
                       titles, lines starting with "#" and blank lines are
                       skipped, all other lines are sequence data (a record's
                       sequence may span several lines).
        sys.argv[2] -- "yes" (case-insensitive) to keep the leading
                       nucleotide in the output; any other value drops it.
        sys.argv[3] -- output file; each record is written as its title
                       line followed by the decoded sequence on one line.

    Terminates through ``stop_err`` when arguments are missing or when
    sequence data appears before the first title line.
    """
    if len(sys.argv) < 4:
        stop_err('Usage: %s <input_file> <keep_prefix: yes|no> <output_file>' % sys.argv[0])

    infilename = sys.argv[1]
    keep_prefix = sys.argv[2].lower()
    outfilename = sys.argv[3]

    title = None
    color_lines = []

    # "with" closes both files even when stop_err() raises SystemExit.
    with open(outfilename, 'w') as outfile:
        # open() replaces the Python 2-only file() builtin.
        with open(infilename) as infile:
            for line in infile:
                line = line.rstrip('\r\n')

                if not line or line.startswith('#'):
                    continue

                if line.startswith('>'):
                    # A new title ends the previous record; flush it first.
                    if color_lines:
                        _write_record(outfile, title, ''.join(color_lines), keep_prefix)
                    title = line
                    color_lines = []
                else:
                    if title is None:
                        stop_err('Sequence data found before the first ">" title line.')
                    color_lines.append(line)

        # The last record has no following title line to trigger its flush.
        if color_lines:
            _write_record(outfile, title, ''.join(color_lines), keep_prefix)


if __name__ == '__main__':
    __main__()
---|