Context Navigation

codingSnps.pl @ 3

リビジョン 2, 16.6 KB (コミッタ: hatakeyama, 14 年前)
import galaxy-central
属性 svn:executable の設定値 ``*

行番号
1	#!/usr/bin/env perl
2
3	use strict;
4	use warnings;
5
6	#########################################################################
7	# codingSnps.pl
8	# This takes a bed file with the names being / separated nts
9	# and a gene bed file with cds start and stop.
10	# It then checks for changes in coding regions, reporting
11	# those that cause a frameshift or substitution in the amino acid.
12	#########################################################################
13
14	if (!@ARGV or scalar @ARGV < 3) {
15	print "Usage: codingSnps.pl snps.bed genes.bed (/dir/nib/\|Galaxy build= loc=) [chr=# start=# end=# snp=#] > codingSnps.txt\n";
16	exit;
17	}
18	my $uniq = 0; #flag for whether want uniq positions
19	my $syn = 0; #flag for if want synonomous changes rather than non-syn
20	my $snpFile = shift @ARGV;
21	my $geneFile = shift @ARGV;
22	my $nibDir = shift @ARGV;
23	if ($nibDir eq 'Galaxy') { getGalaxyInfo(); }
24	my $seqFlag = "2bit"; #flag to set sequence type 2bit\|nib
25	my $col0 = 0; #bed like columns in default positions
26	my $col1 = 1;
27	my $col2 = 2;
28	my $col3 = 3;
29	#column positions 1 based coming in (for Galaxy)
30	foreach (@ARGV) {
31	if (/chr=(\d+)/) { $col0 = $1 -1; }
32	elsif (/start=(\d+)/) { $col1 = $1 -1; }
33	elsif (/end=(\d+)/) { $col2 = $1 -1; }
34	elsif (/snp=(\d+)/) { $col3 = $1 -1; }
35	}
36	if ($col0 < 0 \|\| $col1 < 0 \|\| $col2 < 0 \|\| $col3 < 0) {
37	print STDERR "ERROR column numbers are given with origin 1\n";
38	exit 1;
39	}
40	my @genes; #bed lines for genes, sorted by chrom and start
41	my %chrSt; #index in array where each chrom starts
42	my %codon; #hash of codon amino acid conversions
43	my $ends = 0; #ends vs sizes in bed 11 position, starts relative to chrom
44	my $ignoreN = 1; #skip N
45
46	my %amb = (
47	"R" => "A/G",
48	"Y" => "C/T",
49	"S" => "C/G",
50	"W" => "A/T",
51	"K" => "G/T",
52	"M" => "A/C",
53	"B" => "C/G/T",
54	"D" => "A/G/T",
55	"H" => "A/C/T",
56	"V" => "A/C/G",
57	"N" => "A/C/G/T"
58	);
59	fill_codon();
60	open(FH, "cat $geneFile \| sort -k1,1 -k2,2n \|")
61	or die "Couldn't open and sort $geneFile, $!\n";
62	my $i = 0;
63	while(<FH>) {
64	chomp;
65	if (/refGene.cdsEnd\|ccdsGene.exonEnds/) { $ends = 1; next; }
66	push(@genes, "$_");
67	my @f = split(/\t/);
68	if (!exists $chrSt{$f[0]}) { $chrSt{$f[0]} = $i; }
69	$i++;
70	}
71	close FH or die "Couldn't close $geneFile, $!\n";
72
73	if ($ends) { print STDERR "WARNING using block ends rather than sizes\n"; }
74
75	#open snps sorted as well
76	my $s1 = $col0 + 1; #sort order is origin 1
77	my $s2 = $col1 + 1;
78	open(FH, "cat $snpFile \| sort -k$s1,$s1 -k$s2,${s2}n \|")
79	or die "Couldn't open and sort $snpFile, $!\n";
80	$i = 0;
81	my @g; #one genes fields, should be used repeatedly
82	my %done;
83	while(<FH>) {
84	chomp;
85	if (/^\s*#/) { next; } #comment
86	my @s = split(/\t/); #SNP fields
87	if (!@s or !$s[$col0]) { die "ERROR missing SNP data, $_\n"; }
88	my $size = $#s;
89	if ($col0 > $size \|\| $col1 > $size \|\| $col2 > $size \|\| $col3 > $size) {
90	print STDERR "ERROR file has fewer columns than requested, requested columns (0 based) $col0 $col1 $col2 $col3, file has $size\n";
91	exit 1;
92	}
93	if ($s[$col1] =~ /\D/) {
94	print STDERR "ERROR the start point must be an integer not $s[$col1]\n";
95	exit 1;
96	}
97	if ($s[$col2] =~ /\D/) {
98	print STDERR "ERROR the start point must be an integer not $s[$col2]\n";
99	exit 1;
100	}
101	if ($s[$col3] eq 'N' && $ignoreN) { next; }
102	if (exists $amb{$s[$col3]}) { $s[$col3] = $amb{$s[$col3]}; }
103	if (!@g && exists $chrSt{$s[$col0]}) { #need to fetch first gene row
104	$i = $chrSt{$s[$col0]};
105	@g = split(/\t/, $genes[$i]);
106	}elsif (!@g) {
107	next; #no gene for this chrom
108	}elsif ($s[$col0] ne $g[0] && exists $chrSt{$s[$col0]}) { #new chrom
109	$i = $chrSt{$s[$col0]};
110	@g = split(/\t/, $genes[$i]);
111	}elsif ($s[$col0] ne $g[0]) {
112	next; #no gene for this chrom
113	}elsif ($s[$col1] < $g[1] && $i == $chrSt{$s[$col0]}) {
114	next; #before any genes
115	}elsif ($s[$col1] > $g[2] && ($i == $#genes or $genes[$i+1] !~ $s[$col0])) {
116	next; #after all genes on chr
117	}else {
118	while ($s[$col1] > $g[2] && $i < $#genes) {
119	$i++;
120	@g = split(/\t/, $genes[$i]);
121	if ($s[$col0] ne $g[0]) { last; } #end of gene
122	}
123	if ($s[$col0] ne $g[0] or $s[$col1] < $g[1] or $s[$col1] > $g[2]) {
124	next; #no overlap with genes
125	}
126	}
127
128	processSnp(\@s, \@g);
129	if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { next; }
130
131	my $k = $i + 1; #check for more genes without losing data of first
132	if ($k <= $#genes) {
133	my @g2 = split(/\t/, $genes[$k]);
134	while (@g2 && $k <= $#genes) {
135	@g2 = split(/\t/, $genes[$k]);
136	if ($s[$col0] ne $g2[0]) {
137	undef @g2;
138	last; #not same chrom
139	}else {
140	while ($s[$col1] > $g2[2] && $k <= $#genes) {
141	$k++;
142	@g2 = split(/\t/, $genes[$k]);
143	if ($s[$col0] ne $g2[0]) { last; } #end of chrom
144	}
145	if ($s[$col0] ne $g2[0] or $s[$col1] < $g2[1] or $s[$col1] > $g2[2]) {
146	undef @g2;
147	last; #no overlap with more genes
148	}
149	processSnp(\@s, \@g2);
150	if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { last; }
151	}
152	$k++;
153	}
154	}
155	}
156	close FH or die "Couldn't close $snpFile, $!\n";
157
158	exit;
159
160	########################################################################
161	sub processSnp {
162	my $sref = shift;
163	my $gref = shift;
164	#overlaps gene, but maybe not coding seq
165	#inside cds
166	if ($sref->[$col1] + 1 < $gref->[6] or $sref->[$col2] > $gref->[7]) {
167	return; #outside of coding
168	}
169	#now check exon
170	my $i = 0;
171	my @st = split(/,/, $gref->[11]);
172	my @size = split(/,/, $gref->[10]);
173	if (scalar @st ne $gref->[9]) { return; } #cant do this gene #die "bad gene $gref->[3]\n"; }
174	my @pos;
175	my $in = 0;
176	for($i = 0; $i < $gref->[9]; $i++) {
177	my $sta = $gref->[1] + $st[$i] + 1; #1 based position
178	my $end = $sta + $size[$i] - 1; #
179	if ($ends) { $end = $size[$i]; $sta = $st[$i] + 1; } #ends instead of sizes
180	if ($end < $gref->[6]) { next; } #utr only
181	if ($sta > $gref->[7]) { next; } #utr only
182	#shorten to coding only
183	if ($sta < $gref->[6]) { $sta = $gref->[6] + 1; }
184	if ($end > $gref->[7]) { $end = $gref->[7]; }
185	if ($sref->[$col1] + 1 >= $sta && $sref->[$col2] <= $end) { $in = 1; }
186	elsif ($sref->[$col1] == $sref->[$col2] && $sref->[$col2] <= $end && $sref->[$col2] >= $sta) { $in = 1; }
187	push(@pos, ($sta .. $end)); #add exon worth of positions
188	}
189	#@pos has coding positions for whole gene (chr coors),
190	#and $in has whether we need to continue
191	if (!$in) { return; } #not in coding exon
192	if ((scalar @pos) % 3 != 0) { return; } #partial gene? not even codons
193	if ($sref->[$col3] =~ /^-+\/[ACTG]+$/ or $sref->[$col3] =~ /^[ACTG]+\/-+$/ or
194	$sref->[$col3] =~ /^-+$/) { #indel or del
195	my $copy = $sref->[$col3];
196	my $c = ($copy =~ tr/-//);
197	if ($c % 3 == 0) { return; } #not frameshift
198	#handle bed4 to bed4 + 4 (pgSnp)
199	print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
200	#if ($sref->[4]) { print "\t$sref->[4]"; }
201	#if ($sref->[5]) { print "\t$sref->[5]"; }
202	#if ($sref->[6]) { print "\t$sref->[6]"; }
203	#if ($sref->[7]) { print "\t$sref->[7]"; }
204	print "\t$gref->[3]\tframeshift\n";
205	$done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++;
206	return;
207	}elsif ($sref->[$col1] == $sref->[$col2]) { #insertion
208	my $copy = $sref->[$col3];
209	my $c = ($copy =~ tr/\[ACTG]+//);
210	if ($c % 3 == 0) { return; } #not frameshift
211	#handle bed4 to bed4 + 4 (pgSnp)
212	print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
213	#if ($sref->[4]) { print "\t$sref->[4]"; }
214	#if ($sref->[5]) { print "\t$sref->[5]"; }
215	#if ($sref->[6]) { print "\t$sref->[6]"; }
216	#if ($sref->[7]) { print "\t$sref->[7]"; }
217	print "\t$gref->[3]\tframeshift\n";
218	$done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++;
219	return;
220	}
221	#check for amino acid substitutions
222	my $s = $sref->[$col1] + 1;
223	my $e = $sref->[$col2];
224	my $len = $sref->[$col2] - $sref->[$col1];
225	if ($gref->[5] eq '-') {
226	@pos = reverse(@pos);
227	my $t = $s;
228	$s = $e;
229	$e = $t;
230	}
231	$i = 0;
232	my $found = 0;
233	foreach (@pos) {
234	if ($s == $_) {
235	$found = 1;
236	last;
237	}
238	$i++;
239	}
240	if ($found) {
241	my $fs = $i; #keep original start index
242	#have index where substitution starts
243	my $cp = $i % 3;
244	$i -= $cp; #i is now first position in codon
245	my $cdNum = int($i / 3) + 1;
246	my $ls = $i;
247	if (!defined $ls) { die "ERROR not defined ls for $fs $sref->[$col2]\n"; }
248	if (!@pos) { die "ERROR not defined array pos\n"; }
249	if (!defined $pos[$ls]) { die "ERROR not defined pos at $ls\n"; }
250	if (!defined $e) { die "ERROR not defined e for $pos[0] $pos[1] $pos[2]\n"; }
251	while ($ls <= $#pos && $pos[$ls] ne $e) {
252	$ls++;
253	}
254	my $i2 = $ls + (2 - ($ls % 3));
255	if ($i2 > $#pos) { return; } #not a full codon, partial gene?
256
257	if ($i2 - $i < 2) { die "not a full codon positions $i to $i2 for $sref->[3]\n"; }
258	my $oldnts = getnts($sref->[$col0], @pos[$i..$i2]);
259	if (!$oldnts) { die "Failed to get sequence for $sref->[$col0] $pos[$i] .. $pos[$i2]\n"; }
260	my @vars = split(/\//, $sref->[$col3]);
261	if ($gref->[5] eq '-') { #complement oldnts and revcomp vars
262	$oldnts = compl($oldnts);
263	if (!$oldnts) { return; } #skip this one
264	$oldnts = join('', (reverse(split(/ */, $oldnts))));
265	foreach (@vars) {
266	$_ = reverse(split(/ */));
267	$_ = compl($_);
268	}
269	}
270	my $r = $fs - $i; #difference in old indexes gives new index
271	my @newnts;
272	my $changed = '';
273	foreach my $v (@vars) {
274	if (!$v or length($v) != 1) { return; } #only simple changes
275	my @new = split(/ */, $oldnts);
276	$changed = splice(@new, $r, $len, split(/ */, $v));
277	#should only change single nt
278	push(@newnts, join("", @new));
279	}
280	#now compute amino acids
281	my $oldaa = getaa($oldnts);
282	my @newaa;
283	my $change = 0; #flag for if there is a change
284	foreach my $v (@newnts) {
285	my $t = getaa($v);
286	if ($t ne $oldaa) { $change = 1; }
287	push(@newaa, $t);
288	}
289	if (!$change && $syn) {
290	print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
291	print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\n";
292	return;
293	}elsif ($syn) { return; } #only want synonymous changes
294	if (!$change) { return; } #no change in amino acids
295	#if (abs($pos[$i] - $pos[$i2]) > 200) {
296	#print STDERR "TESTING found mutation at splice site $sref->[0]\t$sref->[1]\t$sref->[2]\n";
297	#print STDERR "old $oldaa, new ", join(', ', @newaa), "\n";
298	#print STDERR "oldnt $oldnts, strand $gref->[5]\n";
299	#exit;
300	#}
301	print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
302	#if (defined $sref->[4]) { print "\t$sref->[4]"; }
303	#if (defined $sref->[5]) { print "\t$sref->[5]"; }
304	#if (defined $sref->[6]) { print "\t$sref->[6]"; }
305	#if (defined $sref->[7]) { print "\t$sref->[7]"; }
306	if ($gref->[5] eq '-') { $changed = compl($changed); } #use plus for ref
307	if (!$changed) { return; } #skip this one
308	print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\n";
309	$done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++;
310	}
311	}
312
313	sub getnts {
314	my $chr = shift;
315	my @pos = @_; #list of positions not necessarily in order
316	#list may be reversed or have gaps(introns), at least 3 bps
317	my $seq = '';
318	if (scalar @pos < 3) { die "too small region for $chr $pos[0]\n"; }
319	if ($pos[0] < $pos[1]) { #not reversed
320	my $s = $pos[0];
321	for(my $i = 1; $i <= $#pos; $i++) {
322	if ($pos[$i] == $pos[$i-1] + 1) { next; }
323	if ($seqFlag eq '2bit') {
324	$seq .= fetchSeq2bit($chr, $s, $pos[$i-1]);
325	}else {
326	$seq .= fetchSeqNib($chr, $s, $pos[$i-1]);
327	}
328	$s = $pos[$i];
329	}
330	if (length $seq != scalar @pos) { #still need to fetch seq
331	#if (abs($pos[$#pos]-$pos[0]) > 200) {
332	#print STDERR "TESTING have split codon $chr $pos[0] $pos[$#pos]\n";
333	#exit;
334	#}
335	if ($seqFlag eq '2bit') {
336	$seq .= fetchSeq2bit($chr, $s, $pos[$#pos]);
337	}else {
338	$seq .= fetchSeqNib($chr, $s, $pos[$#pos]);
339	}
340	}
341	}else { #reversed
342	my $s = $pos[$#pos];
343	for(my $i = $#pos -1; $i >= 0; $i--) {
344	if ($pos[$i] == $pos[$i+1] + 1) { next; }
345	if ($seqFlag eq '2bit') {
346	$seq .= fetchSeq2bit($chr, $s, $pos[$i+1]);
347	}else {
348	$seq .= fetchSeqNib($chr, $s, $pos[$i+1]);
349	}
350	$s = $pos[$i];
351	}
352	if (length $seq != scalar @pos) { #still need to fetch seq
353	#if (abs($pos[$#pos]-$pos[0]) > 200) {
354	#print STDERR "TESTING have split codon $pos[0] .. $pos[$#pos]\n";
355	#}
356	if ($seqFlag eq '2bit') {
357	$seq .= fetchSeq2bit($chr, $s, $pos[0]);
358	}else {
359	$seq .= fetchSeqNib($chr, $s, $pos[0]);
360	}
361	}
362	}
363	}
364
365	sub fetchSeq2bit {
366	my $chr = shift;
367	my $st = shift;
368	my $end = shift;
369	my $strand = '+';
370	$st--; #change to UCSC numbering
371	open (BIT, "twoBitToFa -seq=$chr -start=$st -end=$end $nibDir stdout \|") or
372	die "Couldn't run twoBitToFa, $!\n";
373	my $seq = '';
374	while (<BIT>) {
375	chomp;
376	if (/^>/) { next; } #header
377	$seq .= $_;
378	}
379	close BIT or die "Couldn't finish twoBitToFa on $chr $st $end, $!\n";
380	return $seq;
381	}
382
383	sub fetchSeqNib {
384	my $chr = shift;
385	my $st = shift;
386	my $end = shift;
387	my $strand = '+';
388	$st--; #change to UCSC numbering
389	open (NIB, "nibFrag -upper $nibDir/${chr}.nib $st $end $strand stdout \|") or die "Couldn't run nibFrag, $!\n";
390	my $seq = '';
391	while (<NIB>) {
392	chomp;
393	if (/^>/) { next; } #header
394	$seq .= $_;
395	}
396	close NIB or die "Couldn't finish nibFrag on $chr $st $end, $!\n";
397	return $seq;
398	}
399
400	sub compl {
401	my $nts = shift;
402	my $comp = '';
403	foreach my $n (split(/ */, $nts)) {
404	if ($n eq 'A') { $comp .= 'T'; }
405	elsif ($n eq 'T') { $comp .= 'A'; }
406	elsif ($n eq 'C') { $comp .= 'G'; }
407	elsif ($n eq 'G') { $comp .= 'C'; }
408	elsif ($n eq 'N') { $comp .= 'N'; }
409	elsif ($n eq '-') { $comp .= '-'; } #deletion
410	else { $comp = undef; }
411	}
412	return $comp;
413	}
414
415	sub getaa {
416	my $nts = shift; #in multiples of 3
417	my $aa = '';
418	my @n = split(/ */, $nts);
419	while (@n) {
420	my @t = splice(@n, 0, 3);
421	my $n = uc(join("", @t));
422	if (!exists $codon{$n}) { $aa .= 'N'; next; }
423	$aa .= $codon{$n};
424	}
425	return $aa;
426	}
427
428	sub fill_codon {
429	$codon{GCA} = 'Ala';
430	$codon{GCC} = 'Ala';
431	$codon{GCG} = 'Ala';
432	$codon{GCT} = 'Ala';
433	$codon{CGG} = 'Arg';
434	$codon{CGT} = 'Arg';
435	$codon{CGC} = 'Arg';
436	$codon{AGA} = 'Arg';
437	$codon{AGG} = 'Arg';
438	$codon{CGA} = 'Arg';
439	$codon{AAC} = 'Asn';
440	$codon{AAT} = 'Asn';
441	$codon{GAC} = 'Asp';
442	$codon{GAT} = 'Asp';
443	$codon{TGC} = 'Cys';
444	$codon{TGT} = 'Cys';
445	$codon{CAG} = 'Gln';
446	$codon{CAA} = 'Gln';
447	$codon{GAA} = 'Glu';
448	$codon{GAG} = 'Glu';
449	$codon{GGG} = 'Gly';
450	$codon{GGA} = 'Gly';
451	$codon{GGC} = 'Gly';
452	$codon{GGT} = 'Gly';
453	$codon{CAC} = 'His';
454	$codon{CAT} = 'His';
455	$codon{ATA} = 'Ile';
456	$codon{ATT} = 'Ile';
457	$codon{ATC} = 'Ile';
458	$codon{CTA} = 'Leu';
459	$codon{CTC} = 'Leu';
460	$codon{CTG} = 'Leu';
461	$codon{CTT} = 'Leu';
462	$codon{TTG} = 'Leu';
463	$codon{TTA} = 'Leu';
464	$codon{AAA} = 'Lys';
465	$codon{AAG} = 'Lys';
466	$codon{ATG} = 'Met';
467	$codon{TTC} = 'Phe';
468	$codon{TTT} = 'Phe';
469	$codon{CCT} = 'Pro';
470	$codon{CCA} = 'Pro';
471	$codon{CCC} = 'Pro';
472	$codon{CCG} = 'Pro';
473	$codon{TCA} = 'Ser';
474	$codon{AGC} = 'Ser';
475	$codon{AGT} = 'Ser';
476	$codon{TCC} = 'Ser';
477	$codon{TCT} = 'Ser';
478	$codon{TCG} = 'Ser';
479	$codon{TGA} = 'Stop';
480	$codon{TAG} = 'Stop';
481	$codon{TAA} = 'Stop';
482	$codon{ACT} = 'Thr';
483	$codon{ACA} = 'Thr';
484	$codon{ACC} = 'Thr';
485	$codon{ACG} = 'Thr';
486	$codon{TGG} = 'Trp';
487	$codon{TAT} = 'Tyr';
488	$codon{TAC} = 'Tyr';
489	$codon{GTC} = 'Val';
490	$codon{GTA} = 'Val';
491	$codon{GTG} = 'Val';
492	$codon{GTT} = 'Val';
493	}
494
495	sub getGalaxyInfo {
496	my $build;
497	my $locFile;
498	foreach (@ARGV) {
499	if (/build=(.*)/) { $build = $1; }
500	elsif (/loc=(.*)/) { $locFile = $1; }
501	}
502	if (!$build or !$locFile) {
503	print STDERR "ERROR missing build or locfile for Galaxy input\n";
504	exit 1;
505	}
506	# read $locFile to get $nibDir (ignoring commets)
507	open(LF, "< $locFile") \|\| die "open($locFile): $!\n";
508	while(<LF>) {
509	s/#.*$//;
510	s/(?:^\s+\|\s+$)//g;
511	next if (/^$/);
512
513	my @t = split(/\t/);
514	if ($t[0] eq $build) { $nibDir = $t[1]; }
515	}
516	close(LF);
517	if ($nibDir eq 'Galaxy') {
518	print STDERR "Failed to find sequence directory in locfile $locFile\n";
519	}
520	$nibDir .= "/$build.2bit"; #we want full path and filename
521	}

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/evolution/codingSnps.pl @ 3

異なるフォーマットでダウンロード: