root/galaxy-central/tools/evolution/codingSnps.pl

リビジョン 2, 16.6 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

  • 属性 svn:executable の設定値 *
行番号 
1#!/usr/bin/env perl
2
3use strict;
4use warnings;
5
6#########################################################################
7#       codingSnps.pl
8#       This takes a bed file with the names being / separated nts
9#       and a gene bed file with cds start and stop.
10#       It then checks for changes in coding regions, reporting
11#       those that cause a frameshift or substitution in the amino acid.
12#########################################################################
13
14if (!@ARGV or scalar @ARGV < 3) {
15   print "Usage: codingSnps.pl snps.bed genes.bed (/dir/nib/|Galaxy build= loc=) [chr=# start=# end=# snp=#] > codingSnps.txt\n";
16   exit;
17}
18my $uniq = 0; #flag for whether want uniq positions
19my $syn = 0;  #flag for if want synonomous changes rather than non-syn
20my $snpFile = shift @ARGV;
21my $geneFile = shift @ARGV;
22my $nibDir = shift @ARGV;
23if ($nibDir eq 'Galaxy') { getGalaxyInfo(); }
24my $seqFlag = "2bit"; #flag to set sequence type 2bit|nib
25my $col0 = 0; #bed like columns in default positions
26my $col1 = 1;
27my $col2 = 2;
28my $col3 = 3;
29#column positions 1 based coming in (for Galaxy)
30foreach (@ARGV) {
31   if (/chr=(\d+)/) { $col0 = $1 -1; }
32   elsif (/start=(\d+)/) { $col1 = $1 -1; }
33   elsif (/end=(\d+)/) { $col2 = $1 -1; }
34   elsif (/snp=(\d+)/) { $col3 = $1 -1; }
35}
36if ($col0 < 0 || $col1 < 0 || $col2 < 0 || $col3 < 0) {
37   print STDERR "ERROR column numbers are given with origin 1\n";
38   exit 1;
39}
40my @genes; #bed lines for genes, sorted by chrom and start
41my %chrSt; #index in array where each chrom starts
42my %codon; #hash of codon amino acid conversions
43my $ends = 0; #ends vs sizes in bed 11 position, starts relative to chrom
44my $ignoreN = 1; #skip N
45
46my %amb = (
47"R" => "A/G",
48"Y" => "C/T",
49"S" => "C/G",
50"W" => "A/T",
51"K" => "G/T",
52"M" => "A/C",
53"B" => "C/G/T",
54"D" => "A/G/T",
55"H" => "A/C/T",
56"V" => "A/C/G",
57"N" => "A/C/G/T"
58);
59fill_codon();
60open(FH, "cat $geneFile | sort -k1,1 -k2,2n |")
61   or die "Couldn't open and sort $geneFile, $!\n";
62my $i = 0;
63while(<FH>) {
64   chomp;
65   if (/refGene.cdsEnd|ccdsGene.exonEnds/) { $ends = 1; next; }
66   push(@genes, "$_");
67   my @f = split(/\t/);
68   if (!exists $chrSt{$f[0]}) { $chrSt{$f[0]} = $i; }
69   $i++;
70}
71close FH or die "Couldn't close $geneFile, $!\n";
72
73if ($ends) { print STDERR "WARNING using block ends rather than sizes\n"; }
74
75#open snps sorted as well
76my $s1 = $col0 + 1; #sort order is origin 1
77my $s2 = $col1 + 1;
78open(FH, "cat $snpFile | sort -k$s1,$s1 -k$s2,${s2}n |")
79   or die "Couldn't open and sort $snpFile, $!\n";
80$i = 0;
81my @g; #one genes fields, should be used repeatedly
82my %done;
83while(<FH>) {
84   chomp;
85   if (/^\s*#/) { next; } #comment
86   my @s = split(/\t/); #SNP fields
87   if (!@s or !$s[$col0]) { die "ERROR missing SNP data, $_\n"; }
88   my $size = $#s;
89   if ($col0 > $size || $col1 > $size || $col2 > $size || $col3 > $size) {
90      print STDERR "ERROR file has fewer columns than requested, requested columns (0 based) $col0 $col1 $col2 $col3, file has $size\n";
91      exit 1;
92   }
93   if ($s[$col1] =~ /\D/) {
94      print STDERR "ERROR the start point must be an integer not $s[$col1]\n";
95      exit 1;
96   }
97   if ($s[$col2] =~ /\D/) {
98      print STDERR "ERROR the start point must be an integer not $s[$col2]\n";
99      exit 1;
100   }
101   if ($s[$col3] eq 'N' && $ignoreN) { next; }
102   if (exists $amb{$s[$col3]}) { $s[$col3] = $amb{$s[$col3]}; }
103   if (!@g && exists $chrSt{$s[$col0]}) { #need to fetch first gene row
104      $i = $chrSt{$s[$col0]};
105      @g = split(/\t/, $genes[$i]);
106   }elsif (!@g) {
107      next; #no gene for this chrom
108   }elsif ($s[$col0] ne $g[0] && exists $chrSt{$s[$col0]}) { #new chrom
109      $i = $chrSt{$s[$col0]};
110      @g = split(/\t/, $genes[$i]);
111   }elsif ($s[$col0] ne $g[0]) {
112      next; #no gene for this chrom
113   }elsif ($s[$col1] < $g[1] && $i == $chrSt{$s[$col0]}) {
114      next; #before any genes
115   }elsif ($s[$col1] > $g[2] && ($i == $#genes or $genes[$i+1] !~ $s[$col0])) {
116      next; #after all genes on chr
117   }else {
118      while ($s[$col1] > $g[2] && $i < $#genes) {
119         $i++;
120         @g = split(/\t/, $genes[$i]);
121         if ($s[$col0] ne $g[0]) { last; } #end of gene
122      }
123      if ($s[$col0] ne $g[0] or $s[$col1] < $g[1] or $s[$col1] > $g[2]) {
124         next; #no overlap with genes
125      }
126   }
127
128   processSnp(\@s, \@g);
129   if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { next; }
130
131   my $k = $i + 1; #check for more genes without losing data of first
132   if ($k <= $#genes) {
133      my @g2 = split(/\t/, $genes[$k]);
134      while (@g2 && $k <= $#genes) {
135         @g2 = split(/\t/, $genes[$k]);
136         if ($s[$col0] ne $g2[0]) {
137            undef @g2;
138            last; #not same chrom
139         }else {
140            while ($s[$col1] > $g2[2] && $k <= $#genes) {
141               $k++;
142               @g2 = split(/\t/, $genes[$k]);
143               if ($s[$col0] ne $g2[0]) { last; } #end of chrom
144            }
145            if ($s[$col0] ne $g2[0] or $s[$col1] < $g2[1] or $s[$col1] > $g2[2]) {
146               undef @g2;
147               last; #no overlap with more genes
148            }
149            processSnp(\@s, \@g2);
150            if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { last; }
151         }     
152         $k++;
153      }
154   }
155}
156close FH or die "Couldn't close $snpFile, $!\n";
157
158exit;
159
160########################################################################
161sub processSnp {
162   my $sref = shift;
163   my $gref = shift;
164   #overlaps gene, but maybe not coding seq
165   #inside cds
166   if ($sref->[$col1] + 1 < $gref->[6] or $sref->[$col2] > $gref->[7]) {
167      return; #outside of coding
168   }
169   #now check exon
170   my $i = 0;
171   my @st = split(/,/, $gref->[11]);
172   my @size = split(/,/, $gref->[10]);
173   if (scalar @st ne $gref->[9]) { return; } #cant do this gene #die "bad gene $gref->[3]\n"; }
174   my @pos;
175   my $in = 0;
176   for($i = 0; $i < $gref->[9]; $i++) {
177      my $sta = $gref->[1] + $st[$i] + 1; #1 based position
178      my $end = $sta + $size[$i] - 1; #
179      if ($ends) { $end = $size[$i]; $sta = $st[$i] + 1; } #ends instead of sizes
180      if ($end < $gref->[6]) { next; } #utr only
181      if ($sta > $gref->[7]) { next; } #utr only
182      #shorten to coding only
183      if ($sta < $gref->[6]) { $sta = $gref->[6] + 1; }
184      if ($end > $gref->[7]) { $end = $gref->[7]; }
185      if ($sref->[$col1] + 1 >= $sta && $sref->[$col2] <= $end) { $in = 1; }
186      elsif ($sref->[$col1] == $sref->[$col2] && $sref->[$col2] <= $end && $sref->[$col2] >= $sta) { $in = 1; }
187      push(@pos, ($sta .. $end)); #add exon worth of positions
188   }
189   #@pos has coding positions for whole gene (chr coors),
190   #and $in has whether we need to continue
191   if (!$in) { return; } #not in coding exon
192   if ((scalar @pos) % 3 != 0) { return; } #partial gene? not even codons
193   if ($sref->[$col3] =~ /^-+\/[ACTG]+$/ or $sref->[$col3] =~ /^[ACTG]+\/-+$/ or
194       $sref->[$col3] =~ /^-+$/) { #indel or del
195      my $copy = $sref->[$col3];
196      my $c = ($copy =~ tr/-//);
197      if ($c % 3 == 0) { return; } #not frameshift
198      #handle bed4 to bed4 + 4 (pgSnp)
199      print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
200      #if ($sref->[4]) { print "\t$sref->[4]"; }
201      #if ($sref->[5]) { print "\t$sref->[5]"; }
202      #if ($sref->[6]) { print "\t$sref->[6]"; }
203      #if ($sref->[7]) { print "\t$sref->[7]"; }
204      print "\t$gref->[3]\tframeshift\n";
205      $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++;
206      return;
207   }elsif ($sref->[$col1] == $sref->[$col2]) { #insertion
208      my $copy = $sref->[$col3];
209      my $c = ($copy =~ tr/\[ACTG]+//);
210      if ($c % 3 == 0) { return; } #not frameshift
211      #handle bed4 to bed4 + 4 (pgSnp)
212      print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
213      #if ($sref->[4]) { print "\t$sref->[4]"; }
214      #if ($sref->[5]) { print "\t$sref->[5]"; }
215      #if ($sref->[6]) { print "\t$sref->[6]"; }
216      #if ($sref->[7]) { print "\t$sref->[7]"; }
217      print "\t$gref->[3]\tframeshift\n";
218      $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++;
219      return;
220   }
221   #check for amino acid substitutions
222   my $s = $sref->[$col1] + 1;
223   my $e = $sref->[$col2];
224   my $len = $sref->[$col2] - $sref->[$col1];
225   if ($gref->[5] eq '-') {
226      @pos = reverse(@pos);
227      my $t = $s;
228      $s = $e;
229      $e = $t;
230   }
231   $i = 0;
232   my $found = 0;
233   foreach (@pos) {
234      if ($s == $_) {
235         $found = 1;
236         last;
237      }
238      $i++;
239   }
240   if ($found) {
241      my $fs = $i; #keep original start index
242      #have index where substitution starts
243      my $cp = $i % 3;
244      $i -= $cp; #i is now first position in codon
245      my $cdNum = int($i / 3) + 1;
246      my $ls = $i;
247      if (!defined $ls) { die "ERROR not defined ls for $fs $sref->[$col2]\n"; }
248      if (!@pos) { die "ERROR not defined array pos\n"; }
249      if (!defined $pos[$ls]) { die "ERROR not defined pos at $ls\n"; }
250      if (!defined $e) { die "ERROR not defined e for $pos[0] $pos[1] $pos[2]\n"; }
251      while ($ls <= $#pos && $pos[$ls] ne $e) {
252         $ls++;
253      }
254      my $i2 = $ls + (2 - ($ls % 3));
255      if ($i2 > $#pos) { return; } #not a full codon, partial gene?
256
257      if ($i2 - $i < 2) { die "not a full codon positions $i to $i2 for $sref->[3]\n"; }
258      my $oldnts = getnts($sref->[$col0], @pos[$i..$i2]);
259      if (!$oldnts) { die "Failed to get sequence for $sref->[$col0] $pos[$i] .. $pos[$i2]\n"; }
260      my @vars = split(/\//, $sref->[$col3]);
261      if ($gref->[5] eq '-') { #complement oldnts and revcomp vars
262         $oldnts = compl($oldnts);
263         if (!$oldnts) { return; } #skip this one
264         $oldnts = join('', (reverse(split(/ */, $oldnts))));
265         foreach (@vars) {
266            $_ = reverse(split(/ */));
267            $_ = compl($_);
268         }
269      }
270      my $r = $fs - $i; #difference in old indexes gives new index
271      my @newnts;
272      my $changed = '';
273      foreach my $v (@vars) {
274         if (!$v or length($v) != 1) { return; } #only simple changes
275         my @new = split(/ */, $oldnts);
276         $changed = splice(@new, $r, $len, split(/ */, $v));
277         #should only change single nt
278         push(@newnts, join("", @new));
279      }
280      #now compute amino acids
281      my $oldaa = getaa($oldnts);
282      my @newaa;
283      my $change = 0; #flag for if there is a change
284      foreach my $v (@newnts) {
285         my $t = getaa($v);
286         if ($t ne $oldaa) { $change = 1; }
287         push(@newaa, $t);
288      }
289      if (!$change && $syn) {
290          print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
291          print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\n";
292          return;
293      }elsif ($syn) { return; } #only want synonymous changes
294      if (!$change) { return; } #no change in amino acids
295#if (abs($pos[$i] - $pos[$i2]) > 200) {
296#print STDERR "TESTING found mutation at splice site $sref->[0]\t$sref->[1]\t$sref->[2]\n";
297#print STDERR "old $oldaa, new ", join(', ', @newaa), "\n";
298#print STDERR "oldnt $oldnts, strand $gref->[5]\n";
299#exit;
300#}
301      print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]";
302      #if (defined $sref->[4]) { print "\t$sref->[4]"; }
303      #if (defined $sref->[5]) { print "\t$sref->[5]"; }
304      #if (defined $sref->[6]) { print "\t$sref->[6]"; }
305      #if (defined $sref->[7]) { print "\t$sref->[7]"; }
306      if ($gref->[5] eq '-') { $changed = compl($changed); } #use plus for ref
307      if (!$changed) { return; } #skip this one
308      print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\n";
309      $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++;
310   }
311}
312
313sub getnts {
314   my $chr = shift;
315   my @pos = @_; #list of positions not necessarily in order
316   #list may be reversed or have gaps(introns), at least 3 bps
317   my $seq = '';
318   if (scalar @pos < 3) { die "too small region for $chr $pos[0]\n"; }
319   if ($pos[0] < $pos[1]) { #not reversed
320      my $s = $pos[0];
321      for(my $i = 1; $i <= $#pos; $i++) {
322         if ($pos[$i] == $pos[$i-1] + 1) { next; }
323         if ($seqFlag eq '2bit') {
324            $seq .= fetchSeq2bit($chr, $s, $pos[$i-1]);
325         }else {
326            $seq .= fetchSeqNib($chr, $s, $pos[$i-1]);
327         }
328         $s = $pos[$i];
329      }
330      if (length $seq != scalar @pos) { #still need to fetch seq
331#if (abs($pos[$#pos]-$pos[0]) > 200) {
332#print STDERR "TESTING have split codon $chr $pos[0] $pos[$#pos]\n";
333#exit;
334#}
335         if ($seqFlag eq '2bit') {
336            $seq .= fetchSeq2bit($chr, $s, $pos[$#pos]);
337         }else {
338            $seq .= fetchSeqNib($chr, $s, $pos[$#pos]);
339         }
340      }
341   }else { #reversed
342      my $s = $pos[$#pos];
343      for(my $i = $#pos -1; $i >= 0; $i--) {
344         if ($pos[$i] == $pos[$i+1] + 1) { next; }
345         if ($seqFlag eq '2bit') {
346            $seq .= fetchSeq2bit($chr, $s, $pos[$i+1]);
347         }else {
348            $seq .= fetchSeqNib($chr, $s, $pos[$i+1]);
349         }
350         $s = $pos[$i];
351      }
352      if (length $seq != scalar @pos) { #still need to fetch seq
353#if (abs($pos[$#pos]-$pos[0]) > 200) {
354#print STDERR "TESTING have split codon $pos[0] .. $pos[$#pos]\n";
355#}
356         if ($seqFlag eq '2bit') {
357            $seq .= fetchSeq2bit($chr, $s, $pos[0]);
358         }else {
359            $seq .= fetchSeqNib($chr, $s, $pos[0]);
360         }
361      }
362   }
363}
364
365sub fetchSeq2bit {
366   my $chr = shift;
367   my $st = shift;
368   my $end = shift;
369   my $strand = '+';
370   $st--; #change to UCSC numbering
371   open (BIT, "twoBitToFa -seq=$chr -start=$st -end=$end $nibDir stdout |") or
372      die "Couldn't run twoBitToFa, $!\n";
373   my $seq = '';
374   while (<BIT>) {
375      chomp;
376      if (/^>/) { next; } #header
377      $seq .= $_;
378   }
379   close BIT or die "Couldn't finish twoBitToFa on $chr $st $end, $!\n";
380   return $seq;
381}
382
383sub fetchSeqNib {
384   my $chr = shift;
385   my $st = shift;
386   my $end = shift;
387   my $strand = '+';
388   $st--; #change to UCSC numbering
389   open (NIB, "nibFrag -upper $nibDir/${chr}.nib $st $end $strand stdout |") or die "Couldn't run nibFrag, $!\n";
390   my $seq = '';
391   while (<NIB>) {
392      chomp;
393      if (/^>/) { next; } #header
394      $seq .= $_;
395   }
396   close NIB or die "Couldn't finish nibFrag on $chr $st $end, $!\n";
397   return $seq;
398}
399
400sub compl {
401   my $nts = shift;
402   my $comp = '';
403   foreach my $n (split(/ */, $nts)) {
404      if ($n eq 'A') { $comp .= 'T'; }
405      elsif ($n eq 'T') { $comp .= 'A'; }
406      elsif ($n eq 'C') { $comp .= 'G'; }
407      elsif ($n eq 'G') { $comp .= 'C'; }
408      elsif ($n eq 'N') { $comp .= 'N'; }
409      elsif ($n eq '-') { $comp .= '-'; } #deletion
410      else { $comp = undef; }
411   }
412   return $comp;
413}
414
415sub getaa {
416   my $nts = shift;  #in multiples of 3
417   my $aa = '';
418   my @n = split(/ */, $nts);
419   while (@n) {
420      my @t = splice(@n, 0, 3);
421      my $n = uc(join("", @t));
422      if (!exists $codon{$n}) { $aa .= 'N'; next; }
423      $aa .= $codon{$n};
424   }
425   return $aa;
426}
427
428sub fill_codon {
429$codon{GCA} = 'Ala';
430$codon{GCC} = 'Ala';
431$codon{GCG} = 'Ala';
432$codon{GCT} = 'Ala';
433$codon{CGG} = 'Arg';
434$codon{CGT} = 'Arg';
435$codon{CGC} = 'Arg';
436$codon{AGA} = 'Arg';
437$codon{AGG} = 'Arg';
438$codon{CGA} = 'Arg';
439$codon{AAC} = 'Asn';
440$codon{AAT} = 'Asn';
441$codon{GAC} = 'Asp';
442$codon{GAT} = 'Asp';
443$codon{TGC} = 'Cys';
444$codon{TGT} = 'Cys';
445$codon{CAG} = 'Gln';
446$codon{CAA} = 'Gln';
447$codon{GAA} = 'Glu';
448$codon{GAG} = 'Glu';
449$codon{GGG} = 'Gly';
450$codon{GGA} = 'Gly';
451$codon{GGC} = 'Gly';
452$codon{GGT} = 'Gly';
453$codon{CAC} = 'His';
454$codon{CAT} = 'His';
455$codon{ATA} = 'Ile';
456$codon{ATT} = 'Ile';
457$codon{ATC} = 'Ile';
458$codon{CTA} = 'Leu';
459$codon{CTC} = 'Leu';
460$codon{CTG} = 'Leu';
461$codon{CTT} = 'Leu';
462$codon{TTG} = 'Leu';
463$codon{TTA} = 'Leu';
464$codon{AAA} = 'Lys';
465$codon{AAG} = 'Lys';
466$codon{ATG} = 'Met';
467$codon{TTC} = 'Phe';
468$codon{TTT} = 'Phe';
469$codon{CCT} = 'Pro';
470$codon{CCA} = 'Pro';
471$codon{CCC} = 'Pro';
472$codon{CCG} = 'Pro';
473$codon{TCA} = 'Ser';
474$codon{AGC} = 'Ser';
475$codon{AGT} = 'Ser';
476$codon{TCC} = 'Ser';
477$codon{TCT} = 'Ser';
478$codon{TCG} = 'Ser';
479$codon{TGA} = 'Stop';
480$codon{TAG} = 'Stop';
481$codon{TAA} = 'Stop';
482$codon{ACT} = 'Thr';
483$codon{ACA} = 'Thr';
484$codon{ACC} = 'Thr';
485$codon{ACG} = 'Thr';
486$codon{TGG} = 'Trp';
487$codon{TAT} = 'Tyr';
488$codon{TAC} = 'Tyr';
489$codon{GTC} = 'Val';
490$codon{GTA} = 'Val';
491$codon{GTG} = 'Val';
492$codon{GTT} = 'Val';
493}
494
495sub getGalaxyInfo {
496   my $build;
497   my $locFile;
498   foreach (@ARGV) {
499      if (/build=(.*)/) { $build = $1; }
500      elsif (/loc=(.*)/) { $locFile = $1; }
501   }
502   if (!$build or !$locFile) {
503      print STDERR "ERROR missing build or locfile for Galaxy input\n";
504      exit 1;
505   }
506   # read $locFile to get $nibDir (ignoring commets)
507   open(LF, "< $locFile") || die "open($locFile): $!\n";
508   while(<LF>) {
509      s/#.*$//;
510      s/(?:^\s+|\s+$)//g;
511      next if (/^$/);
512   
513      my @t = split(/\t/);
514      if ($t[0] eq $build) { $nibDir = $t[1]; }
515   }
516   close(LF);
517   if ($nibDir eq 'Galaxy') {
518      print STDERR "Failed to find sequence directory in locfile $locFile\n";
519   }
520   $nibDir .= "/$build.2bit";  #we want full path and filename
521}
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。