root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/EGG-INFO/scripts/gene_fourfold_sites.py

リビジョン 3, 7.5 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1#!/usr/bin/python2.6
2
3"""
4Returns a bed-like translation of a CDS in which each record corresponds to
5a single site in the CDS and includes additional fields for site degenaracy,
6position ind CDS, and amino acid encoded.
7
8usage: %prog nibdir genefile [options]
9    -o, --outfile=o:      output file
10    -f, --format=f:       format bed (default), or gtf|gff
11    -a, --allpositions: 1st, 2nd and 3rd positions are evaluated for degeneracy given the sequence at the other two positions.  Many 1d sites in 1st codon positions become 2d sites when considered this way.
12    -n, --include_name: include the 'name' or 'id' field from the source file on every line of output
13"""
14
15import re
16import sys
17import os
18import string
19from bx.seq import nib
20from bx.bitset import *
21from bx.bitset_builders import *
22from bx.bitset_utils import *
23from bx.gene_reader import *
24from bx.cookbook import doc_optparse
25
26GENETIC_CODE = """
27TTT (Phe/F)Phenylalanine
28TTC (Phe/F)Phenylalanine
29TTA (Leu/L)Leucine
30TTG (Leu/L)Leucine, Start
31TCT (Ser/S)Serine
32TCC (Ser/S)Serine
33TCA (Ser/S)Serine
34TCG (Ser/S)Serine
35TAT (Tyr/Y)Tyrosine
36TAC (Tyr/Y)Tyrosine
37TAA Ochre (Stop)
38TAG Amber (Stop)
39TGT (Cys/C)Cysteine
40TGC (Cys/C)Cysteine
41TGA Opal (Stop)
42TGG (Trp/W)Tryptophan
43CTT (Leu/L)Leucine
44CTC (Leu/L)Leucine
45CTA (Leu/L)Leucine
46CTG (Leu/L)Leucine, Start
47CCT (Pro/P)Proline
48CCC (Pro/P)Proline
49CCA (Pro/P)Proline
50CCG (Pro/P)Proline
51CAT (His/H)Histidine
52CAC (His/H)Histidine
53CAA (Gln/Q)Glutamine
54CAG (Gln/Q)Glutamine
55CGT (Arg/R)Arginine
56CGC (Arg/R)Arginine
57CGA (Arg/R)Arginine
58CGG (Arg/R)Arginine
59ATT (Ile/I)Isoleucine, Start2
60ATC (Ile/I)Isoleucine
61ATA (Ile/I)Isoleucine
62ATG (Met/M)Methionine, Start1
63ACT (Thr/T)Threonine
64ACC (Thr/T)Threonine
65ACA (Thr/T)Threonine
66ACG (Thr/T)Threonine
67AAT (Asn/N)Asparagine
68AAC (Asn/N)Asparagine
69AAA (Lys/K)Lysine
70AAG (Lys/K)Lysine
71AGT (Ser/S)Serine
72AGC (Ser/S)Serine
73AGA (Arg/R)Arginine
74AGG (Arg/R)Arginine
75GTT (Val/V)Valine
76GTC (Val/V)Valine
77GTA (Val/V)Valine
78GTG (Val/V)Valine, Start2
79GCT (Ala/A)Alanine
80GCC (Ala/A)Alanine
81GCA (Ala/A)Alanine
82GCG (Ala/A)Alanine
83GAT (Asp/D)Aspartic acid
84GAC (Asp/D)Aspartic acid
85GAA (Glu/E)Glutamic acid
86GAG (Glu/E)Glutamic acid
87GGT (Gly/G)Glycine
88GGC (Gly/G)Glycine
89GGA (Gly/G)Glycine
90GGG (Gly/G)Glycine
91"""
92
93def translate( codon, genetic_code):
94    c1,c2,c3 = codon
95    return genetic_code[c1][c2][c3]
96
97""" parse the doc string to hash the genetic code"""
98GEN_CODE = {}
99for line in GENETIC_CODE.split('\n'):
100    if line.strip() == '': continue
101    f = re.split('\s|\(|\)|\/',line)
102    codon = f[0]
103    c1,c2,c3 = codon
104    aminoacid = f[3]
105    if c1 not in GEN_CODE: GEN_CODE[c1] = {}
106    if c2 not in GEN_CODE[c1]: GEN_CODE[c1][c2] = {}
107
108    GEN_CODE[c1][c2][c3] = aminoacid
109
110def getnib( nibdir ):
111    seqs = {}
112    for nibf in os.listdir( nibdir ):
113        if not nibf.endswith('.nib'): continue
114        chr = nibf.replace('.nib','')
115        file = os.path.join( nibdir, nibf )
116        seqs[chr] = nib.NibFile( open(file) )
117
118    return seqs
119
120REVMAP = string.maketrans("ACGTacgt","TGCAtgca")
121def revComp(seq):
122    return seq[::-1].translate(REVMAP)
123
124def Comp(seq):
125    return seq.translate(REVMAP)
126
127def codon_degeneracy( codon, position=3 ):
128    aa = translate( codon, GEN_CODE )
129    if position==1:
130        degeneracy1 = [GEN_CODE[ k ][ codon[1] ][ codon[2] ] for k in all].count(aa)
131    elif position==2:
132        degeneracy2 = [GEN_CODE[ codon[0] ][ k ][ codon[2] ] for k in all].count(aa)
133    elif position==3:
134        degeneracy = GEN_CODE[ codon[0] ][ codon[1] ].values().count(aa)
135    return degeneracy
136
137def main():
138
139    options, args = doc_optparse.parse( __doc__ )
140    try:
141        if options.outfile:
142            out = open( options.outfile, "w")
143        else:
144            out = sys.stdout
145        if options.format:
146            format = options.format
147        else:
148            format = 'bed'
149
150        allpositions = bool( options.allpositions )
151        include_name = bool( options.include_name )
152        nibdir = args[0]
153        bedfile = args[1]
154    except:
155        doc_optparse.exit()
156
157    nibs = getnib(nibdir)
158
159    for chrom, strand, cds_exons, name in CDSReader( open(bedfile), format=format):
160
161        cds_seq = ''
162
163        # genome_seq_index maps the position in CDS to position on the genome
164        genome_seq_index = []
165        for (c_start, c_end) in cds_exons:
166            cds_seq += nibs[chrom].get( c_start, c_end-c_start )
167            for i in range(c_start,c_end):
168                genome_seq_index.append(i)
169
170        cds_seq = cds_seq.upper()
171
172        if strand == '+':
173            frsts = range( 0, len(cds_seq), 3)
174            offsign = 1
175        else:
176            cds_seq = Comp( cds_seq )
177            frsts = range( 2, len(cds_seq), 3)
178            offsign = -1
179
180        offone = 1 * offsign
181        offtwo = 2 * offsign
182
183        all = ['A','C','G','T']
184
185        for first_pos in frsts:
186            c1 = first_pos
187            c2 = first_pos + offone
188            c3 = first_pos + offtwo
189            try:
190                assert c3 < len(cds_seq)
191            except AssertionError:
192                print >>sys.stderr, "out of sequence at %d for %s, %d" % (c3, chrom, genome_seq_index[ first_pos ])
193                continue
194            codon = cds_seq[c1], cds_seq[c2], cds_seq[c3]
195            aa = translate( codon, GEN_CODE )
196            degeneracy3 = str(GEN_CODE[ codon[0] ][ codon[1] ].values().count(aa)) + "d"
197
198            if not include_name: name_text = ''
199            else:
200                name_text = name.replace(' ','_')
201
202            if allpositions:
203                try:
204                    degeneracy1 = str([GEN_CODE[ k ][ codon[1] ][ codon[2] ] for k in all].count(aa)) + "d"
205                    degeneracy2 = str([GEN_CODE[ codon[0] ][ k ][ codon[2] ] for k in all].count(aa)) + "d"
206                except TypeError, s:
207                    print >>sys.stderr, GEN_CODE.values()
208                    raise TypeError, s
209
210                if strand == '+':
211                    print >>out, chrom, genome_seq_index[c1], genome_seq_index[c1] + 1, cds_seq[c1], degeneracy1, aa, name_text
212                    print >>out, chrom, genome_seq_index[c2], genome_seq_index[c2] + 1, cds_seq[c2], degeneracy2, aa, name_text
213                    print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text
214                else:
215                    print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text
216                    print >>out, chrom, genome_seq_index[c2], genome_seq_index[c2] + 1, cds_seq[c2], degeneracy2, aa, name_text
217                    print >>out, chrom, genome_seq_index[c1], genome_seq_index[c1] + 1, cds_seq[c1], degeneracy1, aa, name_text
218            else:
219                if strand == '+':
220                    for b in c1,c2:
221                        print >>out, chrom, genome_seq_index[b], genome_seq_index[b] + 1, cds_seq[b], "1d", aa, name_text
222                    print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text
223                else:
224                    print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text
225                    for b in c2,c1:
226                        print >>out, chrom, genome_seq_index[b], genome_seq_index[b] + 1, cds_seq[b], "1d", aa, name_text
227    out.close()
228
229if __name__ == '__main__':
230    main()
231    #format = sys.argv[1]
232    #file = sys.argv[2]
233    #for chr, strand, cds_exons in CDSReader( open(file), format=format):
234    #    s_points = [ "%d,%d" % (a[0],a[1]) for a in cds_exons ]
235    #    print chr, strand, len(cds_exons), "\t".join(s_points)
236
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。