Context Navigation

gene_fourfold_sites.py @ 3

リビジョン 3, 7.5 KB (コミッタ: kohda, 14 年前)
Install Unix tools http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号
1	#!/usr/bin/python2.6
2
3	"""
4	Returns a bed-like translation of a CDS in which each record corresponds to
5	a single site in the CDS and includes additional fields for site degenaracy,
6	position ind CDS, and amino acid encoded.
7
8	usage: %prog nibdir genefile [options]
9	-o, --outfile=o: output file
10	-f, --format=f: format bed (default), or gtf\|gff
11	-a, --allpositions: 1st, 2nd and 3rd positions are evaluated for degeneracy given the sequence at the other two positions. Many 1d sites in 1st codon positions become 2d sites when considered this way.
12	-n, --include_name: include the 'name' or 'id' field from the source file on every line of output
13	"""
14
15	import re
16	import sys
17	import os
18	import string
19	from bx.seq import nib
20	from bx.bitset import *
21	from bx.bitset_builders import *
22	from bx.bitset_utils import *
23	from bx.gene_reader import *
24	from bx.cookbook import doc_optparse
25
26	GENETIC_CODE = """
27	TTT (Phe/F)Phenylalanine
28	TTC (Phe/F)Phenylalanine
29	TTA (Leu/L)Leucine
30	TTG (Leu/L)Leucine, Start
31	TCT (Ser/S)Serine
32	TCC (Ser/S)Serine
33	TCA (Ser/S)Serine
34	TCG (Ser/S)Serine
35	TAT (Tyr/Y)Tyrosine
36	TAC (Tyr/Y)Tyrosine
37	TAA Ochre (Stop)
38	TAG Amber (Stop)
39	TGT (Cys/C)Cysteine
40	TGC (Cys/C)Cysteine
41	TGA Opal (Stop)
42	TGG (Trp/W)Tryptophan
43	CTT (Leu/L)Leucine
44	CTC (Leu/L)Leucine
45	CTA (Leu/L)Leucine
46	CTG (Leu/L)Leucine, Start
47	CCT (Pro/P)Proline
48	CCC (Pro/P)Proline
49	CCA (Pro/P)Proline
50	CCG (Pro/P)Proline
51	CAT (His/H)Histidine
52	CAC (His/H)Histidine
53	CAA (Gln/Q)Glutamine
54	CAG (Gln/Q)Glutamine
55	CGT (Arg/R)Arginine
56	CGC (Arg/R)Arginine
57	CGA (Arg/R)Arginine
58	CGG (Arg/R)Arginine
59	ATT (Ile/I)Isoleucine, Start2
60	ATC (Ile/I)Isoleucine
61	ATA (Ile/I)Isoleucine
62	ATG (Met/M)Methionine, Start1
63	ACT (Thr/T)Threonine
64	ACC (Thr/T)Threonine
65	ACA (Thr/T)Threonine
66	ACG (Thr/T)Threonine
67	AAT (Asn/N)Asparagine
68	AAC (Asn/N)Asparagine
69	AAA (Lys/K)Lysine
70	AAG (Lys/K)Lysine
71	AGT (Ser/S)Serine
72	AGC (Ser/S)Serine
73	AGA (Arg/R)Arginine
74	AGG (Arg/R)Arginine
75	GTT (Val/V)Valine
76	GTC (Val/V)Valine
77	GTA (Val/V)Valine
78	GTG (Val/V)Valine, Start2
79	GCT (Ala/A)Alanine
80	GCC (Ala/A)Alanine
81	GCA (Ala/A)Alanine
82	GCG (Ala/A)Alanine
83	GAT (Asp/D)Aspartic acid
84	GAC (Asp/D)Aspartic acid
85	GAA (Glu/E)Glutamic acid
86	GAG (Glu/E)Glutamic acid
87	GGT (Gly/G)Glycine
88	GGC (Gly/G)Glycine
89	GGA (Gly/G)Glycine
90	GGG (Gly/G)Glycine
91	"""
92
93	def translate( codon, genetic_code):
94	c1,c2,c3 = codon
95	return genetic_code[c1][c2][c3]
96
97	""" parse the doc string to hash the genetic code"""
98	GEN_CODE = {}
99	for line in GENETIC_CODE.split('\n'):
100	if line.strip() == '': continue
101	f = re.split('\s\|\(\|\)\|\/',line)
102	codon = f[0]
103	c1,c2,c3 = codon
104	aminoacid = f[3]
105	if c1 not in GEN_CODE: GEN_CODE[c1] = {}
106	if c2 not in GEN_CODE[c1]: GEN_CODE[c1][c2] = {}
107
108	GEN_CODE[c1][c2][c3] = aminoacid
109
110	def getnib( nibdir ):
111	seqs = {}
112	for nibf in os.listdir( nibdir ):
113	if not nibf.endswith('.nib'): continue
114	chr = nibf.replace('.nib','')
115	file = os.path.join( nibdir, nibf )
116	seqs[chr] = nib.NibFile( open(file) )
117
118	return seqs
119
120	REVMAP = string.maketrans("ACGTacgt","TGCAtgca")
121	def revComp(seq):
122	return seq[::-1].translate(REVMAP)
123
124	def Comp(seq):
125	return seq.translate(REVMAP)
126
127	def codon_degeneracy( codon, position=3 ):
128	aa = translate( codon, GEN_CODE )
129	if position==1:
130	degeneracy1 = [GEN_CODE[ k ][ codon[1] ][ codon[2] ] for k in all].count(aa)
131	elif position==2:
132	degeneracy2 = [GEN_CODE[ codon[0] ][ k ][ codon[2] ] for k in all].count(aa)
133	elif position==3:
134	degeneracy = GEN_CODE[ codon[0] ][ codon[1] ].values().count(aa)
135	return degeneracy
136
137	def main():
138
139	options, args = doc_optparse.parse( __doc__ )
140	try:
141	if options.outfile:
142	out = open( options.outfile, "w")
143	else:
144	out = sys.stdout
145	if options.format:
146	format = options.format
147	else:
148	format = 'bed'
149
150	allpositions = bool( options.allpositions )
151	include_name = bool( options.include_name )
152	nibdir = args[0]
153	bedfile = args[1]
154	except:
155	doc_optparse.exit()
156
157	nibs = getnib(nibdir)
158
159	for chrom, strand, cds_exons, name in CDSReader( open(bedfile), format=format):
160
161	cds_seq = ''
162
163	# genome_seq_index maps the position in CDS to position on the genome
164	genome_seq_index = []
165	for (c_start, c_end) in cds_exons:
166	cds_seq += nibs[chrom].get( c_start, c_end-c_start )
167	for i in range(c_start,c_end):
168	genome_seq_index.append(i)
169
170	cds_seq = cds_seq.upper()
171
172	if strand == '+':
173	frsts = range( 0, len(cds_seq), 3)
174	offsign = 1
175	else:
176	cds_seq = Comp( cds_seq )
177	frsts = range( 2, len(cds_seq), 3)
178	offsign = -1
179
180	offone = 1 * offsign
181	offtwo = 2 * offsign
182
183	all = ['A','C','G','T']
184
185	for first_pos in frsts:
186	c1 = first_pos
187	c2 = first_pos + offone
188	c3 = first_pos + offtwo
189	try:
190	assert c3 < len(cds_seq)
191	except AssertionError:
192	print >>sys.stderr, "out of sequence at %d for %s, %d" % (c3, chrom, genome_seq_index[ first_pos ])
193	continue
194	codon = cds_seq[c1], cds_seq[c2], cds_seq[c3]
195	aa = translate( codon, GEN_CODE )
196	degeneracy3 = str(GEN_CODE[ codon[0] ][ codon[1] ].values().count(aa)) + "d"
197
198	if not include_name: name_text = ''
199	else:
200	name_text = name.replace(' ','_')
201
202	if allpositions:
203	try:
204	degeneracy1 = str([GEN_CODE[ k ][ codon[1] ][ codon[2] ] for k in all].count(aa)) + "d"
205	degeneracy2 = str([GEN_CODE[ codon[0] ][ k ][ codon[2] ] for k in all].count(aa)) + "d"
206	except TypeError, s:
207	print >>sys.stderr, GEN_CODE.values()
208	raise TypeError, s
209
210	if strand == '+':
211	print >>out, chrom, genome_seq_index[c1], genome_seq_index[c1] + 1, cds_seq[c1], degeneracy1, aa, name_text
212	print >>out, chrom, genome_seq_index[c2], genome_seq_index[c2] + 1, cds_seq[c2], degeneracy2, aa, name_text
213	print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text
214	else:
215	print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text
216	print >>out, chrom, genome_seq_index[c2], genome_seq_index[c2] + 1, cds_seq[c2], degeneracy2, aa, name_text
217	print >>out, chrom, genome_seq_index[c1], genome_seq_index[c1] + 1, cds_seq[c1], degeneracy1, aa, name_text
218	else:
219	if strand == '+':
220	for b in c1,c2:
221	print >>out, chrom, genome_seq_index[b], genome_seq_index[b] + 1, cds_seq[b], "1d", aa, name_text
222	print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text
223	else:
224	print >>out, chrom, genome_seq_index[c3], genome_seq_index[c3] + 1, cds_seq[c3], degeneracy3, aa, name_text
225	for b in c2,c1:
226	print >>out, chrom, genome_seq_index[b], genome_seq_index[b] + 1, cds_seq[b], "1d", aa, name_text
227	out.close()
228
229	if __name__ == '__main__':
230	main()
231	#format = sys.argv[1]
232	#file = sys.argv[2]
233	#for chr, strand, cds_exons in CDSReader( open(file), format=format):
234	# s_points = [ "%d,%d" % (a[0],a[1]) for a in cds_exons ]
235	# print chr, strand, len(cds_exons), "\t".join(s_points)
236

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/eggs/bx_python-0.5.0_dev_f74aec067563-py2.6-macosx-10.6-universal-ucs2.egg/EGG-INFO/scripts/gene_fourfold_sites.py @ 3

異なるフォーマットでダウンロード: