1 | #!/usr/bin/python2.6 |
---|
2 | |
---|
3 | """ |
---|
4 | Returns a bed-like translation of a CDS in which each record corresponds to |
---|
5 | a single site in the CDS and includes additional fields for site degenaracy, |
---|
6 | position ind CDS, and amino acid encoded. |
---|
7 | |
---|
8 | usage: %prog nibdir genefile [options] |
---|
9 | -o, --outfile=o: output file |
---|
10 | -f, --format=f: format bed (default), or gtf|gff |
---|
11 | -a, --allpositions: 1st, 2nd and 3rd positions are evaluated for degeneracy given the sequence at the other two positions. Many 1d sites in 1st codon positions become 2d sites when considered this way. |
---|
12 | -n, --include_name: include the 'name' or 'id' field from the source file on every line of output |
---|
13 | """ |
---|
14 | |
---|
15 | import re |
---|
16 | import sys |
---|
17 | import os |
---|
18 | import string |
---|
19 | from bx.seq import nib |
---|
20 | from bx.bitset import * |
---|
21 | from bx.bitset_builders import * |
---|
22 | from bx.bitset_utils import * |
---|
23 | from bx.gene_reader import * |
---|
24 | from bx.cookbook import doc_optparse |
---|
25 | |
---|
26 | GENETIC_CODE = """ |
---|
27 | TTT (Phe/F)Phenylalanine |
---|
28 | TTC (Phe/F)Phenylalanine |
---|
29 | TTA (Leu/L)Leucine |
---|
30 | TTG (Leu/L)Leucine, Start |
---|
31 | TCT (Ser/S)Serine |
---|
32 | TCC (Ser/S)Serine |
---|
33 | TCA (Ser/S)Serine |
---|
34 | TCG (Ser/S)Serine |
---|
35 | TAT (Tyr/Y)Tyrosine |
---|
36 | TAC (Tyr/Y)Tyrosine |
---|
37 | TAA Ochre (Stop) |
---|
38 | TAG Amber (Stop) |
---|
39 | TGT (Cys/C)Cysteine |
---|
40 | TGC (Cys/C)Cysteine |
---|
41 | TGA Opal (Stop) |
---|
42 | TGG (Trp/W)Tryptophan |
---|
43 | CTT (Leu/L)Leucine |
---|
44 | CTC (Leu/L)Leucine |
---|
45 | CTA (Leu/L)Leucine |
---|
46 | CTG (Leu/L)Leucine, Start |
---|
47 | CCT (Pro/P)Proline |
---|
48 | CCC (Pro/P)Proline |
---|
49 | CCA (Pro/P)Proline |
---|
50 | CCG (Pro/P)Proline |
---|
51 | CAT (His/H)Histidine |
---|
52 | CAC (His/H)Histidine |
---|
53 | CAA (Gln/Q)Glutamine |
---|
54 | CAG (Gln/Q)Glutamine |
---|
55 | CGT (Arg/R)Arginine |
---|
56 | CGC (Arg/R)Arginine |
---|
57 | CGA (Arg/R)Arginine |
---|
58 | CGG (Arg/R)Arginine |
---|
59 | ATT (Ile/I)Isoleucine, Start2 |
---|
60 | ATC (Ile/I)Isoleucine |
---|
61 | ATA (Ile/I)Isoleucine |
---|
62 | ATG (Met/M)Methionine, Start1 |
---|
63 | ACT (Thr/T)Threonine |
---|
64 | ACC (Thr/T)Threonine |
---|
65 | ACA (Thr/T)Threonine |
---|
66 | ACG (Thr/T)Threonine |
---|
67 | AAT (Asn/N)Asparagine |
---|
68 | AAC (Asn/N)Asparagine |
---|
69 | AAA (Lys/K)Lysine |
---|
70 | AAG (Lys/K)Lysine |
---|
71 | AGT (Ser/S)Serine |
---|
72 | AGC (Ser/S)Serine |
---|
73 | AGA (Arg/R)Arginine |
---|
74 | AGG (Arg/R)Arginine |
---|
75 | GTT (Val/V)Valine |
---|
76 | GTC (Val/V)Valine |
---|
77 | GTA (Val/V)Valine |
---|
78 | GTG (Val/V)Valine, Start2 |
---|
79 | GCT (Ala/A)Alanine |
---|
80 | GCC (Ala/A)Alanine |
---|
81 | GCA (Ala/A)Alanine |
---|
82 | GCG (Ala/A)Alanine |
---|
83 | GAT (Asp/D)Aspartic acid |
---|
84 | GAC (Asp/D)Aspartic acid |
---|
85 | GAA (Glu/E)Glutamic acid |
---|
86 | GAG (Glu/E)Glutamic acid |
---|
87 | GGT (Gly/G)Glycine |
---|
88 | GGC (Gly/G)Glycine |
---|
89 | GGA (Gly/G)Glycine |
---|
90 | GGG (Gly/G)Glycine |
---|
91 | """ |
---|
92 | |
---|
93 | def translate( codon, genetic_code): |
---|
94 | c1,c2,c3 = codon |
---|
95 | return genetic_code[c1][c2][c3] |
---|
96 | |
---|
97 | """ parse the doc string to hash the genetic code""" |
---|
98 | GEN_CODE = {} |
---|
99 | for line in GENETIC_CODE.split('\n'): |
---|
100 | if line.strip() == '': continue |
---|
101 | f = re.split('\s|\(|\)|\/',line) |
---|
102 | codon = f[0] |
---|
103 | c1,c2,c3 = codon |
---|
104 | aminoacid = f[3] |
---|
105 | if c1 not in GEN_CODE: GEN_CODE[c1] = {} |
---|
106 | if c2 not in GEN_CODE[c1]: GEN_CODE[c1][c2] = {} |
---|
107 | |
---|
108 | GEN_CODE[c1][c2][c3] = aminoacid |
---|
109 | |
---|
110 | def getnib( nibdir ): |
---|
111 | seqs = {} |
---|
112 | for nibf in os.listdir( nibdir ): |
---|
113 | if not nibf.endswith('.nib'): continue |
---|
114 | chr = nibf.replace('.nib','') |
---|
115 | file = os.path.join( nibdir, nibf ) |
---|
116 | seqs[chr] = nib.NibFile( open(file) ) |
---|
117 | |
---|
118 | return seqs |
---|
119 | |
---|
120 | REVMAP = string.maketrans("ACGTacgt","TGCAtgca") |
---|
121 | def revComp(seq): |
---|
122 | return seq[::-1].translate(REVMAP) |
---|
123 | |
---|
124 | def Comp(seq): |
---|
125 | return seq.translate(REVMAP) |
---|
126 | |
---|
127 | def codon_degeneracy( codon, position=3 ): |
---|
128 | aa = translate( codon, GEN_CODE ) |
---|
129 | if position==1: |
---|
130 | degeneracy1 = [GEN_CODE[ k ][ codon[1] ][ codon[2] ] for k in all].count(aa) |
---|
131 | elif position==2: |
---|
132 | degeneracy2 = [GEN_CODE[ codon[0] ][ k ][ codon[2] ] for k in all].count(aa) |
---|
133 | elif position==3: |
---|
134 | degeneracy = GEN_CODE[ codon[0] ][ codon[1] ].values().count(aa) |
---|
135 | return degeneracy |
---|
136 | |
---|
137 | def main(): |
---|
138 | |
---|
139 | options, args = doc_optparse.parse( __doc__ ) |
---|
140 | try: |
---|
141 | if options.outfile: |
---|
142 | out = open( options.outfile, "w") |
---|
143 | else: |
---|
144 | out = sys.stdout |
---|
145 | if options.format: |
---|
146 | format = options.format |
---|
147 | else: |
---|
148 | format = 'bed' |
---|
149 | |
---|
150 | allpositions = bool( options.allpositions ) |
---|
151 | include_name = bool( options.include_name ) |
---|
152 | nibdir = args[0] |
---|
153 | bedfile = args[1] |
---|
154 | except: |
---|
155 | doc_optparse.exit() |
---|
156 | |
---|
157 | nibs = getnib(nibdir) |
---|
158 | |
---|
159 | for chrom, strand, cds_exons, name in CDSReader( open(bedfile), format=format): |
---|
160 | |
---|
161 | cds_seq = '' |
---|
162 | |
---|
163 | # genome_seq_index maps the position in CDS to position on the genome |
---|
164 | genome_seq_index = [] |
---|
165 | for (c_start, c_end) in cds_exons: |
---|
166 | cds_seq += nibs[chrom].get( c_start, c_end-c_start ) |
---|
167 | for i in range(c_start,c_end): |
---|
168 | genome_seq_index.append(i) |
---|
169 | |
---|
170 | cds_seq = cds_seq.upper() |
---|
171 | |
---|
172 | if strand == '+': |
---|
173 | frsts = range( 0, len(cds_seq), 3) |
---|
174 | offsign = 1 |
---|
175 | else: |
---|
176 | cds_seq = Comp( cds_seq ) |
---|
177 | frsts = range( 2, len(cds_seq), 3) |
---|
178 | offsign = -1 |
---|
179 | |
---|
180 | offone = 1 * offsign |
---|
181 | offtwo = 2 * offsign |
---|
182 | |
---|
183 | all = ['A','C','G','T'] |
---|
184 | |
---|
185 | for first_pos in frsts: |
---|
186 | c1 = first_pos |
---|
187 | c2 = first_pos + offone |
---|
188 | c3 = first_pos + offtwo |
---|
189 | try: |
---|
190 | assert c3 < len(cds_seq) |
---|
191 | except AssertionError: |
---|
192 | print >>sys.stderr, "out of sequence at %d for %s, %d" % (c3, chrom, genome_seq_index[ first_pos ]) |
---|
193 | continue |
---|
194 | codon = cds_seq[c1], cds_seq[c2], cds_seq[c3] |
---|
195 | aa = translate( codon, GEN_CODE ) |
---|
196 | degeneracy3 = str(GEN_CODE[ codon[0] ][ codon[1] ].values().count(aa)) + "d" |
---|
197 | |
---|
198 | if not include_name: name_text = '' |
---|
199 | else: |
---|
200 | name_text = name.replace(' ','_') |
---|
201 | |
---|
202 | if allpositions: |
---|
203 | try: |
---|
204 | degeneracy1 = str([GEN_CODE[ k ][ codon[1] ][ codon[2] ] for k in all].count(aa)) + "d" |
---|
205 | degeneracy2 = str([GEN_CODE[ codon[0] ][ k ][ codon[2] ] for k in all].count(aa)) + "d" |
---|
206 | except TypeError, s: |
---|
207 | print >>sys.stderr, GEN_CODE.values() |
---|
208 | raise TypeError, s |
---|
209 | |
---|
210 | if strand == '+': |
---|
211 | print >>out, chrom, genome_seq_index[c1], genome_seq_index[c1] + 1, cds_seq[c1], degeneracy1, aa, name_text |
---|
212 | print >>out, chrom, genome_seq_index[c2], genome_seq_index[c2] + 1, cds_seq[c2], degeneracy2, aa, name_text |
---|
213 | print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text |
---|
214 | else: |
---|
215 | print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text |
---|
216 | print >>out, chrom, genome_seq_index[c2], genome_seq_index[c2] + 1, cds_seq[c2], degeneracy2, aa, name_text |
---|
217 | print >>out, chrom, genome_seq_index[c1], genome_seq_index[c1] + 1, cds_seq[c1], degeneracy1, aa, name_text |
---|
218 | else: |
---|
219 | if strand == '+': |
---|
220 | for b in c1,c2: |
---|
221 | print >>out, chrom, genome_seq_index[b], genome_seq_index[b] + 1, cds_seq[b], "1d", aa, name_text |
---|
222 | print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text |
---|
223 | else: |
---|
224 | print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text |
---|
225 | for b in c2,c1: |
---|
226 | print >>out, chrom, genome_seq_index[b], genome_seq_index[b] + 1, cds_seq[b], "1d", aa, name_text |
---|
227 | out.close() |
---|
228 | |
---|
229 | if __name__ == '__main__': |
---|
230 | main() |
---|
231 | #format = sys.argv[1] |
---|
232 | #file = sys.argv[2] |
---|
233 | #for chr, strand, cds_exons in CDSReader( open(file), format=format): |
---|
234 | # s_points = [ "%d,%d" % (a[0],a[1]) for a in cds_exons ] |
---|
235 | # print chr, strand, len(cds_exons), "\t".join(s_points) |
---|
236 | |
---|