1 | #!/usr/bin/perl |
---|
2 | |
---|
3 | ########################################################################### |
---|
4 | # Purpose: To calculate the correlation of two sets of scores in one file. |
---|
5 | # Usage: correlation.pl infile.bed output.txt column1 column2 |
---|
6 | # (column start from 1) |
---|
7 | # Written by: Yi Zhang (June, 2005) |
---|
8 | ########################################################################### |
---|
9 | if (!$ARGV[0] || !$ARGV[1] || !defined($ARGV[2]) || !defined($ARGV[3]) ) { |
---|
10 | print STDERR "Usage: correlation.pl infile.bed output.txt column1 column2\n"; |
---|
11 | print STDERR " (column start from 1)\n"; |
---|
12 | exit; |
---|
13 | } |
---|
14 | my $file = $ARGV[0]; |
---|
15 | my $out = $ARGV[1]; |
---|
16 | |
---|
17 | die "<font color=\"yellow\">The input columns contain numerical values: $ARGV[2], $ARGV[3]</font>.\n" if ($ARGV[2] =~ /[a-zA-Z]+/ || $ARGV[3] =~ /[a-zA-Z]+/); |
---|
18 | |
---|
19 | my $col1 = $ARGV[2] - 1; |
---|
20 | my $col2 = $ARGV[3] - 1; |
---|
21 | |
---|
22 | my ($f, $o); |
---|
23 | my (@a, @b); |
---|
24 | |
---|
25 | my $n_t = 0; |
---|
26 | open($f, $file) or die "Could't open $file, $!\n"; |
---|
27 | while(<$f>) { |
---|
28 | chomp; |
---|
29 | my @t = split(/\t/); |
---|
30 | if ($n_t == 0) { |
---|
31 | $n_t = scalar(@t) - 1; |
---|
32 | die "<font color=\"yellow\">The input column number exceeds the size of the file: $col1, $col2, $n_t</font>\n" if ( $col1 > $n_t || $col2 > $n_t ); |
---|
33 | } |
---|
34 | die "<font color=\"yellow\">The columns you have selected contain non numeric characters:$t[$col1] and $t[$col2] \n</font>" if ($t[$col1] =~ /[a-zA-Z]+/ || $t[$col2] =~ /[a-zA-Z]+/); |
---|
35 | push(@a, $t[$col1]); |
---|
36 | push(@b, $t[$col2]); |
---|
37 | } |
---|
38 | close($f); |
---|
39 | |
---|
40 | my $result = correlation(\@a, \@b); |
---|
41 | |
---|
42 | open($o, ">$out") or die "Couldn't open $out, $!\n"; |
---|
43 | $col1 = $col1 + 1; |
---|
44 | $col2 = $col2 + 1; |
---|
45 | print $o "The correlation of column $col1 and $col2 is $result\n"; |
---|
46 | close($o); |
---|
47 | print "The correlation of column $col1 and $col2 is $result\n"; |
---|
48 | |
---|
49 | sub correlation { |
---|
50 | my ($array1ref, $array2ref) = @_; |
---|
51 | my ($sum1, $sum2); |
---|
52 | my ($sum1_squared, $sum2_squared); |
---|
53 | foreach (@$array1ref) { $sum1 += $_; $sum1_squared += $_**2; } |
---|
54 | foreach (@$array2ref) { $sum2 += $_; $sum2_squared += $_**2; } |
---|
55 | my $numerator = (@$array1ref**2) * covariance($array1ref, $array2ref); |
---|
56 | my $denominator = sqrt(((@$array1ref * $sum1_squared) - ($sum1**2)) * |
---|
57 | ((@$array1ref * $sum2_squared) - ($sum2**2))); |
---|
58 | my $r; |
---|
59 | if ($denominator == 0) { |
---|
60 | print STDERR "The denominator is 0.\n"; |
---|
61 | exit 0; |
---|
62 | } else { |
---|
63 | $r = $numerator / $denominator; |
---|
64 | } |
---|
65 | return $r; |
---|
66 | } |
---|
67 | |
---|
68 | sub covariance { |
---|
69 | my ($array1ref, $array2ref) = @_; |
---|
70 | my ($i, $result); |
---|
71 | for ($i = 0; $i < @$array1ref; $i++) { |
---|
72 | $result += $array1ref->[$i] * $array2ref->[$i]; |
---|
73 | } |
---|
74 | $result /= @$array1ref; |
---|
75 | $result -= mean($array1ref) * mean($array2ref); |
---|
76 | } |
---|
77 | |
---|
78 | sub mean { |
---|
79 | my ($arrayref) = @_; |
---|
80 | my $result; |
---|
81 | foreach (@$arrayref) { $result += $_; } |
---|
82 | return $result/@$arrayref; |
---|
83 | } |
---|
84 | |
---|