| 1 | #!/usr/bin/python2.6 |
|---|
| 2 | |
|---|
| 3 | """ |
|---|
| 4 | Returns a bed-like translation of a CDS in which each record corresponds to |
|---|
| 5 | a single site in the CDS and includes additional fields for site degenaracy, |
|---|
| 6 | position ind CDS, and amino acid encoded. |
|---|
| 7 | |
|---|
| 8 | usage: %prog nibdir genefile [options] |
|---|
| 9 | -o, --outfile=o: output file |
|---|
| 10 | -f, --format=f: format bed (default), or gtf|gff |
|---|
| 11 | -a, --allpositions: 1st, 2nd and 3rd positions are evaluated for degeneracy given the sequence at the other two positions. Many 1d sites in 1st codon positions become 2d sites when considered this way. |
|---|
| 12 | -n, --include_name: include the 'name' or 'id' field from the source file on every line of output |
|---|
| 13 | """ |
|---|
| 14 | |
|---|
| 15 | import re |
|---|
| 16 | import sys |
|---|
| 17 | import os |
|---|
| 18 | import string |
|---|
| 19 | from bx.seq import nib |
|---|
| 20 | from bx.bitset import * |
|---|
| 21 | from bx.bitset_builders import * |
|---|
| 22 | from bx.bitset_utils import * |
|---|
| 23 | from bx.gene_reader import * |
|---|
| 24 | from bx.cookbook import doc_optparse |
|---|
| 25 | |
|---|
| 26 | GENETIC_CODE = """ |
|---|
| 27 | TTT (Phe/F)Phenylalanine |
|---|
| 28 | TTC (Phe/F)Phenylalanine |
|---|
| 29 | TTA (Leu/L)Leucine |
|---|
| 30 | TTG (Leu/L)Leucine, Start |
|---|
| 31 | TCT (Ser/S)Serine |
|---|
| 32 | TCC (Ser/S)Serine |
|---|
| 33 | TCA (Ser/S)Serine |
|---|
| 34 | TCG (Ser/S)Serine |
|---|
| 35 | TAT (Tyr/Y)Tyrosine |
|---|
| 36 | TAC (Tyr/Y)Tyrosine |
|---|
| 37 | TAA Ochre (Stop) |
|---|
| 38 | TAG Amber (Stop) |
|---|
| 39 | TGT (Cys/C)Cysteine |
|---|
| 40 | TGC (Cys/C)Cysteine |
|---|
| 41 | TGA Opal (Stop) |
|---|
| 42 | TGG (Trp/W)Tryptophan |
|---|
| 43 | CTT (Leu/L)Leucine |
|---|
| 44 | CTC (Leu/L)Leucine |
|---|
| 45 | CTA (Leu/L)Leucine |
|---|
| 46 | CTG (Leu/L)Leucine, Start |
|---|
| 47 | CCT (Pro/P)Proline |
|---|
| 48 | CCC (Pro/P)Proline |
|---|
| 49 | CCA (Pro/P)Proline |
|---|
| 50 | CCG (Pro/P)Proline |
|---|
| 51 | CAT (His/H)Histidine |
|---|
| 52 | CAC (His/H)Histidine |
|---|
| 53 | CAA (Gln/Q)Glutamine |
|---|
| 54 | CAG (Gln/Q)Glutamine |
|---|
| 55 | CGT (Arg/R)Arginine |
|---|
| 56 | CGC (Arg/R)Arginine |
|---|
| 57 | CGA (Arg/R)Arginine |
|---|
| 58 | CGG (Arg/R)Arginine |
|---|
| 59 | ATT (Ile/I)Isoleucine, Start2 |
|---|
| 60 | ATC (Ile/I)Isoleucine |
|---|
| 61 | ATA (Ile/I)Isoleucine |
|---|
| 62 | ATG (Met/M)Methionine, Start1 |
|---|
| 63 | ACT (Thr/T)Threonine |
|---|
| 64 | ACC (Thr/T)Threonine |
|---|
| 65 | ACA (Thr/T)Threonine |
|---|
| 66 | ACG (Thr/T)Threonine |
|---|
| 67 | AAT (Asn/N)Asparagine |
|---|
| 68 | AAC (Asn/N)Asparagine |
|---|
| 69 | AAA (Lys/K)Lysine |
|---|
| 70 | AAG (Lys/K)Lysine |
|---|
| 71 | AGT (Ser/S)Serine |
|---|
| 72 | AGC (Ser/S)Serine |
|---|
| 73 | AGA (Arg/R)Arginine |
|---|
| 74 | AGG (Arg/R)Arginine |
|---|
| 75 | GTT (Val/V)Valine |
|---|
| 76 | GTC (Val/V)Valine |
|---|
| 77 | GTA (Val/V)Valine |
|---|
| 78 | GTG (Val/V)Valine, Start2 |
|---|
| 79 | GCT (Ala/A)Alanine |
|---|
| 80 | GCC (Ala/A)Alanine |
|---|
| 81 | GCA (Ala/A)Alanine |
|---|
| 82 | GCG (Ala/A)Alanine |
|---|
| 83 | GAT (Asp/D)Aspartic acid |
|---|
| 84 | GAC (Asp/D)Aspartic acid |
|---|
| 85 | GAA (Glu/E)Glutamic acid |
|---|
| 86 | GAG (Glu/E)Glutamic acid |
|---|
| 87 | GGT (Gly/G)Glycine |
|---|
| 88 | GGC (Gly/G)Glycine |
|---|
| 89 | GGA (Gly/G)Glycine |
|---|
| 90 | GGG (Gly/G)Glycine |
|---|
| 91 | """ |
|---|
| 92 | |
|---|
| 93 | def translate( codon, genetic_code): |
|---|
| 94 | c1,c2,c3 = codon |
|---|
| 95 | return genetic_code[c1][c2][c3] |
|---|
| 96 | |
|---|
| 97 | """ parse the doc string to hash the genetic code""" |
|---|
| 98 | GEN_CODE = {} |
|---|
| 99 | for line in GENETIC_CODE.split('\n'): |
|---|
| 100 | if line.strip() == '': continue |
|---|
| 101 | f = re.split('\s|\(|\)|\/',line) |
|---|
| 102 | codon = f[0] |
|---|
| 103 | c1,c2,c3 = codon |
|---|
| 104 | aminoacid = f[3] |
|---|
| 105 | if c1 not in GEN_CODE: GEN_CODE[c1] = {} |
|---|
| 106 | if c2 not in GEN_CODE[c1]: GEN_CODE[c1][c2] = {} |
|---|
| 107 | |
|---|
| 108 | GEN_CODE[c1][c2][c3] = aminoacid |
|---|
| 109 | |
|---|
| 110 | def getnib( nibdir ): |
|---|
| 111 | seqs = {} |
|---|
| 112 | for nibf in os.listdir( nibdir ): |
|---|
| 113 | if not nibf.endswith('.nib'): continue |
|---|
| 114 | chr = nibf.replace('.nib','') |
|---|
| 115 | file = os.path.join( nibdir, nibf ) |
|---|
| 116 | seqs[chr] = nib.NibFile( open(file) ) |
|---|
| 117 | |
|---|
| 118 | return seqs |
|---|
| 119 | |
|---|
| 120 | REVMAP = string.maketrans("ACGTacgt","TGCAtgca") |
|---|
| 121 | def revComp(seq): |
|---|
| 122 | return seq[::-1].translate(REVMAP) |
|---|
| 123 | |
|---|
| 124 | def Comp(seq): |
|---|
| 125 | return seq.translate(REVMAP) |
|---|
| 126 | |
|---|
| 127 | def codon_degeneracy( codon, position=3 ): |
|---|
| 128 | aa = translate( codon, GEN_CODE ) |
|---|
| 129 | if position==1: |
|---|
| 130 | degeneracy1 = [GEN_CODE[ k ][ codon[1] ][ codon[2] ] for k in all].count(aa) |
|---|
| 131 | elif position==2: |
|---|
| 132 | degeneracy2 = [GEN_CODE[ codon[0] ][ k ][ codon[2] ] for k in all].count(aa) |
|---|
| 133 | elif position==3: |
|---|
| 134 | degeneracy = GEN_CODE[ codon[0] ][ codon[1] ].values().count(aa) |
|---|
| 135 | return degeneracy |
|---|
| 136 | |
|---|
| 137 | def main(): |
|---|
| 138 | |
|---|
| 139 | options, args = doc_optparse.parse( __doc__ ) |
|---|
| 140 | try: |
|---|
| 141 | if options.outfile: |
|---|
| 142 | out = open( options.outfile, "w") |
|---|
| 143 | else: |
|---|
| 144 | out = sys.stdout |
|---|
| 145 | if options.format: |
|---|
| 146 | format = options.format |
|---|
| 147 | else: |
|---|
| 148 | format = 'bed' |
|---|
| 149 | |
|---|
| 150 | allpositions = bool( options.allpositions ) |
|---|
| 151 | include_name = bool( options.include_name ) |
|---|
| 152 | nibdir = args[0] |
|---|
| 153 | bedfile = args[1] |
|---|
| 154 | except: |
|---|
| 155 | doc_optparse.exit() |
|---|
| 156 | |
|---|
| 157 | nibs = getnib(nibdir) |
|---|
| 158 | |
|---|
| 159 | for chrom, strand, cds_exons, name in CDSReader( open(bedfile), format=format): |
|---|
| 160 | |
|---|
| 161 | cds_seq = '' |
|---|
| 162 | |
|---|
| 163 | # genome_seq_index maps the position in CDS to position on the genome |
|---|
| 164 | genome_seq_index = [] |
|---|
| 165 | for (c_start, c_end) in cds_exons: |
|---|
| 166 | cds_seq += nibs[chrom].get( c_start, c_end-c_start ) |
|---|
| 167 | for i in range(c_start,c_end): |
|---|
| 168 | genome_seq_index.append(i) |
|---|
| 169 | |
|---|
| 170 | cds_seq = cds_seq.upper() |
|---|
| 171 | |
|---|
| 172 | if strand == '+': |
|---|
| 173 | frsts = range( 0, len(cds_seq), 3) |
|---|
| 174 | offsign = 1 |
|---|
| 175 | else: |
|---|
| 176 | cds_seq = Comp( cds_seq ) |
|---|
| 177 | frsts = range( 2, len(cds_seq), 3) |
|---|
| 178 | offsign = -1 |
|---|
| 179 | |
|---|
| 180 | offone = 1 * offsign |
|---|
| 181 | offtwo = 2 * offsign |
|---|
| 182 | |
|---|
| 183 | all = ['A','C','G','T'] |
|---|
| 184 | |
|---|
| 185 | for first_pos in frsts: |
|---|
| 186 | c1 = first_pos |
|---|
| 187 | c2 = first_pos + offone |
|---|
| 188 | c3 = first_pos + offtwo |
|---|
| 189 | try: |
|---|
| 190 | assert c3 < len(cds_seq) |
|---|
| 191 | except AssertionError: |
|---|
| 192 | print >>sys.stderr, "out of sequence at %d for %s, %d" % (c3, chrom, genome_seq_index[ first_pos ]) |
|---|
| 193 | continue |
|---|
| 194 | codon = cds_seq[c1], cds_seq[c2], cds_seq[c3] |
|---|
| 195 | aa = translate( codon, GEN_CODE ) |
|---|
| 196 | degeneracy3 = str(GEN_CODE[ codon[0] ][ codon[1] ].values().count(aa)) + "d" |
|---|
| 197 | |
|---|
| 198 | if not include_name: name_text = '' |
|---|
| 199 | else: |
|---|
| 200 | name_text = name.replace(' ','_') |
|---|
| 201 | |
|---|
| 202 | if allpositions: |
|---|
| 203 | try: |
|---|
| 204 | degeneracy1 = str([GEN_CODE[ k ][ codon[1] ][ codon[2] ] for k in all].count(aa)) + "d" |
|---|
| 205 | degeneracy2 = str([GEN_CODE[ codon[0] ][ k ][ codon[2] ] for k in all].count(aa)) + "d" |
|---|
| 206 | except TypeError, s: |
|---|
| 207 | print >>sys.stderr, GEN_CODE.values() |
|---|
| 208 | raise TypeError, s |
|---|
| 209 | |
|---|
| 210 | if strand == '+': |
|---|
| 211 | print >>out, chrom, genome_seq_index[c1], genome_seq_index[c1] + 1, cds_seq[c1], degeneracy1, aa, name_text |
|---|
| 212 | print >>out, chrom, genome_seq_index[c2], genome_seq_index[c2] + 1, cds_seq[c2], degeneracy2, aa, name_text |
|---|
| 213 | print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text |
|---|
| 214 | else: |
|---|
| 215 | print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text |
|---|
| 216 | print >>out, chrom, genome_seq_index[c2], genome_seq_index[c2] + 1, cds_seq[c2], degeneracy2, aa, name_text |
|---|
| 217 | print >>out, chrom, genome_seq_index[c1], genome_seq_index[c1] + 1, cds_seq[c1], degeneracy1, aa, name_text |
|---|
| 218 | else: |
|---|
| 219 | if strand == '+': |
|---|
| 220 | for b in c1,c2: |
|---|
| 221 | print >>out, chrom, genome_seq_index[b], genome_seq_index[b] + 1, cds_seq[b], "1d", aa, name_text |
|---|
| 222 | print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text |
|---|
| 223 | else: |
|---|
| 224 | print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text |
|---|
| 225 | for b in c2,c1: |
|---|
| 226 | print >>out, chrom, genome_seq_index[b], genome_seq_index[b] + 1, cds_seq[b], "1d", aa, name_text |
|---|
| 227 | out.close() |
|---|
| 228 | |
|---|
| 229 | if __name__ == '__main__': |
|---|
| 230 | main() |
|---|
| 231 | #format = sys.argv[1] |
|---|
| 232 | #file = sys.argv[2] |
|---|
| 233 | #for chr, strand, cds_exons in CDSReader( open(file), format=format): |
|---|
| 234 | # s_points = [ "%d,%d" % (a[0],a[1]) for a in cds_exons ] |
|---|
| 235 | # print chr, strand, len(cds_exons), "\t".join(s_points) |
|---|
| 236 | |
|---|