1 | #!/usr/bin/env perl |
---|
2 | |
---|
3 | use strict; |
---|
4 | use warnings; |
---|
5 | |
---|
6 | ######################################################################### |
---|
7 | # codingSnps.pl |
---|
8 | # This takes a bed file with the names being / separated nts |
---|
9 | # and a gene bed file with cds start and stop. |
---|
10 | # It then checks for changes in coding regions, reporting |
---|
11 | # those that cause a frameshift or substitution in the amino acid. |
---|
12 | ######################################################################### |
---|
13 | |
---|
14 | if (!@ARGV or scalar @ARGV < 3) { |
---|
15 | print "Usage: codingSnps.pl snps.bed genes.bed (/dir/nib/|Galaxy build= loc=) [chr=# start=# end=# snp=#] > codingSnps.txt\n"; |
---|
16 | exit; |
---|
17 | } |
---|
18 | my $uniq = 0; #flag for whether want uniq positions |
---|
19 | my $syn = 0; #flag for if want synonomous changes rather than non-syn |
---|
20 | my $snpFile = shift @ARGV; |
---|
21 | my $geneFile = shift @ARGV; |
---|
22 | my $nibDir = shift @ARGV; |
---|
23 | if ($nibDir eq 'Galaxy') { getGalaxyInfo(); } |
---|
24 | my $seqFlag = "2bit"; #flag to set sequence type 2bit|nib |
---|
25 | my $col0 = 0; #bed like columns in default positions |
---|
26 | my $col1 = 1; |
---|
27 | my $col2 = 2; |
---|
28 | my $col3 = 3; |
---|
29 | #column positions 1 based coming in (for Galaxy) |
---|
30 | foreach (@ARGV) { |
---|
31 | if (/chr=(\d+)/) { $col0 = $1 -1; } |
---|
32 | elsif (/start=(\d+)/) { $col1 = $1 -1; } |
---|
33 | elsif (/end=(\d+)/) { $col2 = $1 -1; } |
---|
34 | elsif (/snp=(\d+)/) { $col3 = $1 -1; } |
---|
35 | } |
---|
36 | if ($col0 < 0 || $col1 < 0 || $col2 < 0 || $col3 < 0) { |
---|
37 | print STDERR "ERROR column numbers are given with origin 1\n"; |
---|
38 | exit 1; |
---|
39 | } |
---|
40 | my @genes; #bed lines for genes, sorted by chrom and start |
---|
41 | my %chrSt; #index in array where each chrom starts |
---|
42 | my %codon; #hash of codon amino acid conversions |
---|
43 | my $ends = 0; #ends vs sizes in bed 11 position, starts relative to chrom |
---|
44 | my $ignoreN = 1; #skip N |
---|
45 | |
---|
46 | my %amb = ( |
---|
47 | "R" => "A/G", |
---|
48 | "Y" => "C/T", |
---|
49 | "S" => "C/G", |
---|
50 | "W" => "A/T", |
---|
51 | "K" => "G/T", |
---|
52 | "M" => "A/C", |
---|
53 | "B" => "C/G/T", |
---|
54 | "D" => "A/G/T", |
---|
55 | "H" => "A/C/T", |
---|
56 | "V" => "A/C/G", |
---|
57 | "N" => "A/C/G/T" |
---|
58 | ); |
---|
59 | fill_codon(); |
---|
60 | open(FH, "cat $geneFile | sort -k1,1 -k2,2n |") |
---|
61 | or die "Couldn't open and sort $geneFile, $!\n"; |
---|
62 | my $i = 0; |
---|
63 | while(<FH>) { |
---|
64 | chomp; |
---|
65 | if (/refGene.cdsEnd|ccdsGene.exonEnds/) { $ends = 1; next; } |
---|
66 | push(@genes, "$_"); |
---|
67 | my @f = split(/\t/); |
---|
68 | if (!exists $chrSt{$f[0]}) { $chrSt{$f[0]} = $i; } |
---|
69 | $i++; |
---|
70 | } |
---|
71 | close FH or die "Couldn't close $geneFile, $!\n"; |
---|
72 | |
---|
73 | if ($ends) { print STDERR "WARNING using block ends rather than sizes\n"; } |
---|
74 | |
---|
75 | #open snps sorted as well |
---|
76 | my $s1 = $col0 + 1; #sort order is origin 1 |
---|
77 | my $s2 = $col1 + 1; |
---|
78 | open(FH, "cat $snpFile | sort -k$s1,$s1 -k$s2,${s2}n |") |
---|
79 | or die "Couldn't open and sort $snpFile, $!\n"; |
---|
80 | $i = 0; |
---|
81 | my @g; #one genes fields, should be used repeatedly |
---|
82 | my %done; |
---|
83 | while(<FH>) { |
---|
84 | chomp; |
---|
85 | if (/^\s*#/) { next; } #comment |
---|
86 | my @s = split(/\t/); #SNP fields |
---|
87 | if (!@s or !$s[$col0]) { die "ERROR missing SNP data, $_\n"; } |
---|
88 | my $size = $#s; |
---|
89 | if ($col0 > $size || $col1 > $size || $col2 > $size || $col3 > $size) { |
---|
90 | print STDERR "ERROR file has fewer columns than requested, requested columns (0 based) $col0 $col1 $col2 $col3, file has $size\n"; |
---|
91 | exit 1; |
---|
92 | } |
---|
93 | if ($s[$col1] =~ /\D/) { |
---|
94 | print STDERR "ERROR the start point must be an integer not $s[$col1]\n"; |
---|
95 | exit 1; |
---|
96 | } |
---|
97 | if ($s[$col2] =~ /\D/) { |
---|
98 | print STDERR "ERROR the start point must be an integer not $s[$col2]\n"; |
---|
99 | exit 1; |
---|
100 | } |
---|
101 | if ($s[$col3] eq 'N' && $ignoreN) { next; } |
---|
102 | if (exists $amb{$s[$col3]}) { $s[$col3] = $amb{$s[$col3]}; } |
---|
103 | if (!@g && exists $chrSt{$s[$col0]}) { #need to fetch first gene row |
---|
104 | $i = $chrSt{$s[$col0]}; |
---|
105 | @g = split(/\t/, $genes[$i]); |
---|
106 | }elsif (!@g) { |
---|
107 | next; #no gene for this chrom |
---|
108 | }elsif ($s[$col0] ne $g[0] && exists $chrSt{$s[$col0]}) { #new chrom |
---|
109 | $i = $chrSt{$s[$col0]}; |
---|
110 | @g = split(/\t/, $genes[$i]); |
---|
111 | }elsif ($s[$col0] ne $g[0]) { |
---|
112 | next; #no gene for this chrom |
---|
113 | }elsif ($s[$col1] < $g[1] && $i == $chrSt{$s[$col0]}) { |
---|
114 | next; #before any genes |
---|
115 | }elsif ($s[$col1] > $g[2] && ($i == $#genes or $genes[$i+1] !~ $s[$col0])) { |
---|
116 | next; #after all genes on chr |
---|
117 | }else { |
---|
118 | while ($s[$col1] > $g[2] && $i < $#genes) { |
---|
119 | $i++; |
---|
120 | @g = split(/\t/, $genes[$i]); |
---|
121 | if ($s[$col0] ne $g[0]) { last; } #end of gene |
---|
122 | } |
---|
123 | if ($s[$col0] ne $g[0] or $s[$col1] < $g[1] or $s[$col1] > $g[2]) { |
---|
124 | next; #no overlap with genes |
---|
125 | } |
---|
126 | } |
---|
127 | |
---|
128 | processSnp(\@s, \@g); |
---|
129 | if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { next; } |
---|
130 | |
---|
131 | my $k = $i + 1; #check for more genes without losing data of first |
---|
132 | if ($k <= $#genes) { |
---|
133 | my @g2 = split(/\t/, $genes[$k]); |
---|
134 | while (@g2 && $k <= $#genes) { |
---|
135 | @g2 = split(/\t/, $genes[$k]); |
---|
136 | if ($s[$col0] ne $g2[0]) { |
---|
137 | undef @g2; |
---|
138 | last; #not same chrom |
---|
139 | }else { |
---|
140 | while ($s[$col1] > $g2[2] && $k <= $#genes) { |
---|
141 | $k++; |
---|
142 | @g2 = split(/\t/, $genes[$k]); |
---|
143 | if ($s[$col0] ne $g2[0]) { last; } #end of chrom |
---|
144 | } |
---|
145 | if ($s[$col0] ne $g2[0] or $s[$col1] < $g2[1] or $s[$col1] > $g2[2]) { |
---|
146 | undef @g2; |
---|
147 | last; #no overlap with more genes |
---|
148 | } |
---|
149 | processSnp(\@s, \@g2); |
---|
150 | if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { last; } |
---|
151 | } |
---|
152 | $k++; |
---|
153 | } |
---|
154 | } |
---|
155 | } |
---|
156 | close FH or die "Couldn't close $snpFile, $!\n"; |
---|
157 | |
---|
158 | exit; |
---|
159 | |
---|
160 | ######################################################################## |
---|
161 | sub processSnp { |
---|
162 | my $sref = shift; |
---|
163 | my $gref = shift; |
---|
164 | #overlaps gene, but maybe not coding seq |
---|
165 | #inside cds |
---|
166 | if ($sref->[$col1] + 1 < $gref->[6] or $sref->[$col2] > $gref->[7]) { |
---|
167 | return; #outside of coding |
---|
168 | } |
---|
169 | #now check exon |
---|
170 | my $i = 0; |
---|
171 | my @st = split(/,/, $gref->[11]); |
---|
172 | my @size = split(/,/, $gref->[10]); |
---|
173 | if (scalar @st ne $gref->[9]) { return; } #cant do this gene #die "bad gene $gref->[3]\n"; } |
---|
174 | my @pos; |
---|
175 | my $in = 0; |
---|
176 | for($i = 0; $i < $gref->[9]; $i++) { |
---|
177 | my $sta = $gref->[1] + $st[$i] + 1; #1 based position |
---|
178 | my $end = $sta + $size[$i] - 1; # |
---|
179 | if ($ends) { $end = $size[$i]; $sta = $st[$i] + 1; } #ends instead of sizes |
---|
180 | if ($end < $gref->[6]) { next; } #utr only |
---|
181 | if ($sta > $gref->[7]) { next; } #utr only |
---|
182 | #shorten to coding only |
---|
183 | if ($sta < $gref->[6]) { $sta = $gref->[6] + 1; } |
---|
184 | if ($end > $gref->[7]) { $end = $gref->[7]; } |
---|
185 | if ($sref->[$col1] + 1 >= $sta && $sref->[$col2] <= $end) { $in = 1; } |
---|
186 | elsif ($sref->[$col1] == $sref->[$col2] && $sref->[$col2] <= $end && $sref->[$col2] >= $sta) { $in = 1; } |
---|
187 | push(@pos, ($sta .. $end)); #add exon worth of positions |
---|
188 | } |
---|
189 | #@pos has coding positions for whole gene (chr coors), |
---|
190 | #and $in has whether we need to continue |
---|
191 | if (!$in) { return; } #not in coding exon |
---|
192 | if ((scalar @pos) % 3 != 0) { return; } #partial gene? not even codons |
---|
193 | if ($sref->[$col3] =~ /^-+\/[ACTG]+$/ or $sref->[$col3] =~ /^[ACTG]+\/-+$/ or |
---|
194 | $sref->[$col3] =~ /^-+$/) { #indel or del |
---|
195 | my $copy = $sref->[$col3]; |
---|
196 | my $c = ($copy =~ tr/-//); |
---|
197 | if ($c % 3 == 0) { return; } #not frameshift |
---|
198 | #handle bed4 to bed4 + 4 (pgSnp) |
---|
199 | print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; |
---|
200 | #if ($sref->[4]) { print "\t$sref->[4]"; } |
---|
201 | #if ($sref->[5]) { print "\t$sref->[5]"; } |
---|
202 | #if ($sref->[6]) { print "\t$sref->[6]"; } |
---|
203 | #if ($sref->[7]) { print "\t$sref->[7]"; } |
---|
204 | print "\t$gref->[3]\tframeshift\n"; |
---|
205 | $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; |
---|
206 | return; |
---|
207 | }elsif ($sref->[$col1] == $sref->[$col2]) { #insertion |
---|
208 | my $copy = $sref->[$col3]; |
---|
209 | my $c = ($copy =~ tr/\[ACTG]+//); |
---|
210 | if ($c % 3 == 0) { return; } #not frameshift |
---|
211 | #handle bed4 to bed4 + 4 (pgSnp) |
---|
212 | print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; |
---|
213 | #if ($sref->[4]) { print "\t$sref->[4]"; } |
---|
214 | #if ($sref->[5]) { print "\t$sref->[5]"; } |
---|
215 | #if ($sref->[6]) { print "\t$sref->[6]"; } |
---|
216 | #if ($sref->[7]) { print "\t$sref->[7]"; } |
---|
217 | print "\t$gref->[3]\tframeshift\n"; |
---|
218 | $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; |
---|
219 | return; |
---|
220 | } |
---|
221 | #check for amino acid substitutions |
---|
222 | my $s = $sref->[$col1] + 1; |
---|
223 | my $e = $sref->[$col2]; |
---|
224 | my $len = $sref->[$col2] - $sref->[$col1]; |
---|
225 | if ($gref->[5] eq '-') { |
---|
226 | @pos = reverse(@pos); |
---|
227 | my $t = $s; |
---|
228 | $s = $e; |
---|
229 | $e = $t; |
---|
230 | } |
---|
231 | $i = 0; |
---|
232 | my $found = 0; |
---|
233 | foreach (@pos) { |
---|
234 | if ($s == $_) { |
---|
235 | $found = 1; |
---|
236 | last; |
---|
237 | } |
---|
238 | $i++; |
---|
239 | } |
---|
240 | if ($found) { |
---|
241 | my $fs = $i; #keep original start index |
---|
242 | #have index where substitution starts |
---|
243 | my $cp = $i % 3; |
---|
244 | $i -= $cp; #i is now first position in codon |
---|
245 | my $cdNum = int($i / 3) + 1; |
---|
246 | my $ls = $i; |
---|
247 | if (!defined $ls) { die "ERROR not defined ls for $fs $sref->[$col2]\n"; } |
---|
248 | if (!@pos) { die "ERROR not defined array pos\n"; } |
---|
249 | if (!defined $pos[$ls]) { die "ERROR not defined pos at $ls\n"; } |
---|
250 | if (!defined $e) { die "ERROR not defined e for $pos[0] $pos[1] $pos[2]\n"; } |
---|
251 | while ($ls <= $#pos && $pos[$ls] ne $e) { |
---|
252 | $ls++; |
---|
253 | } |
---|
254 | my $i2 = $ls + (2 - ($ls % 3)); |
---|
255 | if ($i2 > $#pos) { return; } #not a full codon, partial gene? |
---|
256 | |
---|
257 | if ($i2 - $i < 2) { die "not a full codon positions $i to $i2 for $sref->[3]\n"; } |
---|
258 | my $oldnts = getnts($sref->[$col0], @pos[$i..$i2]); |
---|
259 | if (!$oldnts) { die "Failed to get sequence for $sref->[$col0] $pos[$i] .. $pos[$i2]\n"; } |
---|
260 | my @vars = split(/\//, $sref->[$col3]); |
---|
261 | if ($gref->[5] eq '-') { #complement oldnts and revcomp vars |
---|
262 | $oldnts = compl($oldnts); |
---|
263 | if (!$oldnts) { return; } #skip this one |
---|
264 | $oldnts = join('', (reverse(split(/ */, $oldnts)))); |
---|
265 | foreach (@vars) { |
---|
266 | $_ = reverse(split(/ */)); |
---|
267 | $_ = compl($_); |
---|
268 | } |
---|
269 | } |
---|
270 | my $r = $fs - $i; #difference in old indexes gives new index |
---|
271 | my @newnts; |
---|
272 | my $changed = ''; |
---|
273 | foreach my $v (@vars) { |
---|
274 | if (!$v or length($v) != 1) { return; } #only simple changes |
---|
275 | my @new = split(/ */, $oldnts); |
---|
276 | $changed = splice(@new, $r, $len, split(/ */, $v)); |
---|
277 | #should only change single nt |
---|
278 | push(@newnts, join("", @new)); |
---|
279 | } |
---|
280 | #now compute amino acids |
---|
281 | my $oldaa = getaa($oldnts); |
---|
282 | my @newaa; |
---|
283 | my $change = 0; #flag for if there is a change |
---|
284 | foreach my $v (@newnts) { |
---|
285 | my $t = getaa($v); |
---|
286 | if ($t ne $oldaa) { $change = 1; } |
---|
287 | push(@newaa, $t); |
---|
288 | } |
---|
289 | if (!$change && $syn) { |
---|
290 | print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; |
---|
291 | print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\n"; |
---|
292 | return; |
---|
293 | }elsif ($syn) { return; } #only want synonymous changes |
---|
294 | if (!$change) { return; } #no change in amino acids |
---|
295 | #if (abs($pos[$i] - $pos[$i2]) > 200) { |
---|
296 | #print STDERR "TESTING found mutation at splice site $sref->[0]\t$sref->[1]\t$sref->[2]\n"; |
---|
297 | #print STDERR "old $oldaa, new ", join(', ', @newaa), "\n"; |
---|
298 | #print STDERR "oldnt $oldnts, strand $gref->[5]\n"; |
---|
299 | #exit; |
---|
300 | #} |
---|
301 | print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; |
---|
302 | #if (defined $sref->[4]) { print "\t$sref->[4]"; } |
---|
303 | #if (defined $sref->[5]) { print "\t$sref->[5]"; } |
---|
304 | #if (defined $sref->[6]) { print "\t$sref->[6]"; } |
---|
305 | #if (defined $sref->[7]) { print "\t$sref->[7]"; } |
---|
306 | if ($gref->[5] eq '-') { $changed = compl($changed); } #use plus for ref |
---|
307 | if (!$changed) { return; } #skip this one |
---|
308 | print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\n"; |
---|
309 | $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; |
---|
310 | } |
---|
311 | } |
---|
312 | |
---|
313 | sub getnts { |
---|
314 | my $chr = shift; |
---|
315 | my @pos = @_; #list of positions not necessarily in order |
---|
316 | #list may be reversed or have gaps(introns), at least 3 bps |
---|
317 | my $seq = ''; |
---|
318 | if (scalar @pos < 3) { die "too small region for $chr $pos[0]\n"; } |
---|
319 | if ($pos[0] < $pos[1]) { #not reversed |
---|
320 | my $s = $pos[0]; |
---|
321 | for(my $i = 1; $i <= $#pos; $i++) { |
---|
322 | if ($pos[$i] == $pos[$i-1] + 1) { next; } |
---|
323 | if ($seqFlag eq '2bit') { |
---|
324 | $seq .= fetchSeq2bit($chr, $s, $pos[$i-1]); |
---|
325 | }else { |
---|
326 | $seq .= fetchSeqNib($chr, $s, $pos[$i-1]); |
---|
327 | } |
---|
328 | $s = $pos[$i]; |
---|
329 | } |
---|
330 | if (length $seq != scalar @pos) { #still need to fetch seq |
---|
331 | #if (abs($pos[$#pos]-$pos[0]) > 200) { |
---|
332 | #print STDERR "TESTING have split codon $chr $pos[0] $pos[$#pos]\n"; |
---|
333 | #exit; |
---|
334 | #} |
---|
335 | if ($seqFlag eq '2bit') { |
---|
336 | $seq .= fetchSeq2bit($chr, $s, $pos[$#pos]); |
---|
337 | }else { |
---|
338 | $seq .= fetchSeqNib($chr, $s, $pos[$#pos]); |
---|
339 | } |
---|
340 | } |
---|
341 | }else { #reversed |
---|
342 | my $s = $pos[$#pos]; |
---|
343 | for(my $i = $#pos -1; $i >= 0; $i--) { |
---|
344 | if ($pos[$i] == $pos[$i+1] + 1) { next; } |
---|
345 | if ($seqFlag eq '2bit') { |
---|
346 | $seq .= fetchSeq2bit($chr, $s, $pos[$i+1]); |
---|
347 | }else { |
---|
348 | $seq .= fetchSeqNib($chr, $s, $pos[$i+1]); |
---|
349 | } |
---|
350 | $s = $pos[$i]; |
---|
351 | } |
---|
352 | if (length $seq != scalar @pos) { #still need to fetch seq |
---|
353 | #if (abs($pos[$#pos]-$pos[0]) > 200) { |
---|
354 | #print STDERR "TESTING have split codon $pos[0] .. $pos[$#pos]\n"; |
---|
355 | #} |
---|
356 | if ($seqFlag eq '2bit') { |
---|
357 | $seq .= fetchSeq2bit($chr, $s, $pos[0]); |
---|
358 | }else { |
---|
359 | $seq .= fetchSeqNib($chr, $s, $pos[0]); |
---|
360 | } |
---|
361 | } |
---|
362 | } |
---|
363 | } |
---|
364 | |
---|
365 | sub fetchSeq2bit { |
---|
366 | my $chr = shift; |
---|
367 | my $st = shift; |
---|
368 | my $end = shift; |
---|
369 | my $strand = '+'; |
---|
370 | $st--; #change to UCSC numbering |
---|
371 | open (BIT, "twoBitToFa -seq=$chr -start=$st -end=$end $nibDir stdout |") or |
---|
372 | die "Couldn't run twoBitToFa, $!\n"; |
---|
373 | my $seq = ''; |
---|
374 | while (<BIT>) { |
---|
375 | chomp; |
---|
376 | if (/^>/) { next; } #header |
---|
377 | $seq .= $_; |
---|
378 | } |
---|
379 | close BIT or die "Couldn't finish twoBitToFa on $chr $st $end, $!\n"; |
---|
380 | return $seq; |
---|
381 | } |
---|
382 | |
---|
383 | sub fetchSeqNib { |
---|
384 | my $chr = shift; |
---|
385 | my $st = shift; |
---|
386 | my $end = shift; |
---|
387 | my $strand = '+'; |
---|
388 | $st--; #change to UCSC numbering |
---|
389 | open (NIB, "nibFrag -upper $nibDir/${chr}.nib $st $end $strand stdout |") or die "Couldn't run nibFrag, $!\n"; |
---|
390 | my $seq = ''; |
---|
391 | while (<NIB>) { |
---|
392 | chomp; |
---|
393 | if (/^>/) { next; } #header |
---|
394 | $seq .= $_; |
---|
395 | } |
---|
396 | close NIB or die "Couldn't finish nibFrag on $chr $st $end, $!\n"; |
---|
397 | return $seq; |
---|
398 | } |
---|
399 | |
---|
400 | sub compl { |
---|
401 | my $nts = shift; |
---|
402 | my $comp = ''; |
---|
403 | foreach my $n (split(/ */, $nts)) { |
---|
404 | if ($n eq 'A') { $comp .= 'T'; } |
---|
405 | elsif ($n eq 'T') { $comp .= 'A'; } |
---|
406 | elsif ($n eq 'C') { $comp .= 'G'; } |
---|
407 | elsif ($n eq 'G') { $comp .= 'C'; } |
---|
408 | elsif ($n eq 'N') { $comp .= 'N'; } |
---|
409 | elsif ($n eq '-') { $comp .= '-'; } #deletion |
---|
410 | else { $comp = undef; } |
---|
411 | } |
---|
412 | return $comp; |
---|
413 | } |
---|
414 | |
---|
415 | sub getaa { |
---|
416 | my $nts = shift; #in multiples of 3 |
---|
417 | my $aa = ''; |
---|
418 | my @n = split(/ */, $nts); |
---|
419 | while (@n) { |
---|
420 | my @t = splice(@n, 0, 3); |
---|
421 | my $n = uc(join("", @t)); |
---|
422 | if (!exists $codon{$n}) { $aa .= 'N'; next; } |
---|
423 | $aa .= $codon{$n}; |
---|
424 | } |
---|
425 | return $aa; |
---|
426 | } |
---|
427 | |
---|
428 | sub fill_codon { |
---|
429 | $codon{GCA} = 'Ala'; |
---|
430 | $codon{GCC} = 'Ala'; |
---|
431 | $codon{GCG} = 'Ala'; |
---|
432 | $codon{GCT} = 'Ala'; |
---|
433 | $codon{CGG} = 'Arg'; |
---|
434 | $codon{CGT} = 'Arg'; |
---|
435 | $codon{CGC} = 'Arg'; |
---|
436 | $codon{AGA} = 'Arg'; |
---|
437 | $codon{AGG} = 'Arg'; |
---|
438 | $codon{CGA} = 'Arg'; |
---|
439 | $codon{AAC} = 'Asn'; |
---|
440 | $codon{AAT} = 'Asn'; |
---|
441 | $codon{GAC} = 'Asp'; |
---|
442 | $codon{GAT} = 'Asp'; |
---|
443 | $codon{TGC} = 'Cys'; |
---|
444 | $codon{TGT} = 'Cys'; |
---|
445 | $codon{CAG} = 'Gln'; |
---|
446 | $codon{CAA} = 'Gln'; |
---|
447 | $codon{GAA} = 'Glu'; |
---|
448 | $codon{GAG} = 'Glu'; |
---|
449 | $codon{GGG} = 'Gly'; |
---|
450 | $codon{GGA} = 'Gly'; |
---|
451 | $codon{GGC} = 'Gly'; |
---|
452 | $codon{GGT} = 'Gly'; |
---|
453 | $codon{CAC} = 'His'; |
---|
454 | $codon{CAT} = 'His'; |
---|
455 | $codon{ATA} = 'Ile'; |
---|
456 | $codon{ATT} = 'Ile'; |
---|
457 | $codon{ATC} = 'Ile'; |
---|
458 | $codon{CTA} = 'Leu'; |
---|
459 | $codon{CTC} = 'Leu'; |
---|
460 | $codon{CTG} = 'Leu'; |
---|
461 | $codon{CTT} = 'Leu'; |
---|
462 | $codon{TTG} = 'Leu'; |
---|
463 | $codon{TTA} = 'Leu'; |
---|
464 | $codon{AAA} = 'Lys'; |
---|
465 | $codon{AAG} = 'Lys'; |
---|
466 | $codon{ATG} = 'Met'; |
---|
467 | $codon{TTC} = 'Phe'; |
---|
468 | $codon{TTT} = 'Phe'; |
---|
469 | $codon{CCT} = 'Pro'; |
---|
470 | $codon{CCA} = 'Pro'; |
---|
471 | $codon{CCC} = 'Pro'; |
---|
472 | $codon{CCG} = 'Pro'; |
---|
473 | $codon{TCA} = 'Ser'; |
---|
474 | $codon{AGC} = 'Ser'; |
---|
475 | $codon{AGT} = 'Ser'; |
---|
476 | $codon{TCC} = 'Ser'; |
---|
477 | $codon{TCT} = 'Ser'; |
---|
478 | $codon{TCG} = 'Ser'; |
---|
479 | $codon{TGA} = 'Stop'; |
---|
480 | $codon{TAG} = 'Stop'; |
---|
481 | $codon{TAA} = 'Stop'; |
---|
482 | $codon{ACT} = 'Thr'; |
---|
483 | $codon{ACA} = 'Thr'; |
---|
484 | $codon{ACC} = 'Thr'; |
---|
485 | $codon{ACG} = 'Thr'; |
---|
486 | $codon{TGG} = 'Trp'; |
---|
487 | $codon{TAT} = 'Tyr'; |
---|
488 | $codon{TAC} = 'Tyr'; |
---|
489 | $codon{GTC} = 'Val'; |
---|
490 | $codon{GTA} = 'Val'; |
---|
491 | $codon{GTG} = 'Val'; |
---|
492 | $codon{GTT} = 'Val'; |
---|
493 | } |
---|
494 | |
---|
495 | sub getGalaxyInfo { |
---|
496 | my $build; |
---|
497 | my $locFile; |
---|
498 | foreach (@ARGV) { |
---|
499 | if (/build=(.*)/) { $build = $1; } |
---|
500 | elsif (/loc=(.*)/) { $locFile = $1; } |
---|
501 | } |
---|
502 | if (!$build or !$locFile) { |
---|
503 | print STDERR "ERROR missing build or locfile for Galaxy input\n"; |
---|
504 | exit 1; |
---|
505 | } |
---|
506 | # read $locFile to get $nibDir (ignoring commets) |
---|
507 | open(LF, "< $locFile") || die "open($locFile): $!\n"; |
---|
508 | while(<LF>) { |
---|
509 | s/#.*$//; |
---|
510 | s/(?:^\s+|\s+$)//g; |
---|
511 | next if (/^$/); |
---|
512 | |
---|
513 | my @t = split(/\t/); |
---|
514 | if ($t[0] eq $build) { $nibDir = $t[1]; } |
---|
515 | } |
---|
516 | close(LF); |
---|
517 | if ($nibDir eq 'Galaxy') { |
---|
518 | print STDERR "Failed to find sequence directory in locfile $locFile\n"; |
---|
519 | } |
---|
520 | $nibDir .= "/$build.2bit"; #we want full path and filename |
---|
521 | } |
---|