root/galaxy-central/tools/regVariation/compute_motifs_frequency.pl

リビジョン 2, 10.1 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

  • 属性 svn:executable の設定値 *
行番号 
1#!/usr/bin/perl -w
2
3# a program to compute the frequency of each motif at each window in both upstream and downstream sequences flanking indels
4# in a chromosome/genome.
5# the first input is a TABULAR format file containing the motif names and sequences, such that the file consists of two
6# columns: the left column represents the motif names and the right column represents the motif sequence, one line per motif.
7# the second input is a TABULAR format file containing the upstream and downstream sequences flanking indels, one line per indel.
8# the fourth input is an integer number representing the window size according to which the upstream and downstream sequences
9# flanking each indel will be divided.
10# the first output is a TABULAR format file containing the windows into which both upstream and downstream sequences flanking
11# indels are divided.
12# the second output is a TABULAR format file containing the motifs and their corresponding frequencies at each window in both
13# upstream and downstream sequences flanking indels, one line per motif.
14 
15use strict;
16use warnings;
17
18#variable to handle the falnking sequences information
19my $sequence = "";
20my $upstreamFlankingSequence = "";
21my $downstreamFlankingSequence = "";
22my $discardedSequenceLength = 0;
23my $lengthOfDownstreamFlankingSequenceAfterTrimming = 0;
24
25#variable to handle the window information
26my $window = "";
27my $windowStartIndex = 0;
28my $windowNumber = 0;
29my $totalWindowsNumber = 0;
30my $totalNumberOfWindowsInUpstreamSequence = 0;
31my $totalNumberOfWindowsInDownstreamSequence = 0;
32my $totalWindowsNumberInBothFlankingSequences = 0;
33my $totalWindowsNumberInMotifCountersTwoDimArray = 0;
34my $upstreamAndDownstreamFlankingSequencesWindows = "";
35
36#variable to handle the motif information
37my $motif = "";
38my $motifSequence = "";
39my $motifNumber = 0;
40my $totalMotifsNumber = 0;
41
42#arrays to sotre window and motif data
43my @windowsArray = ();
44my @motifNamesArray = ();
45my @motifSequencesArray = ();
46my @motifCountersTwoDimArray = ();
47
48#variables to store line counter values
49my $lineCounter1 = 0;
50my $lineCounter2 = 0;
51
52# check to make sure having correct files
53my $usage = "usage: compute_motifs_frequency.pl [TABULAR.in] [TABULAR.in] [windowSize] [TABULAR.out] [TABULAR.out]\n";
54die $usage unless @ARGV == 5;
55
56#get the input and output arguments
57my $motifsInputFile = $ARGV[0];
58my $indelFlankingSequencesInputFile = $ARGV[1];
59my $windowSize = $ARGV[2];
60my $indelFlankingSequencesWindowsOutputFile = $ARGV[3];
61my $motifFrequenciesOutputFile = $ARGV[4];
62
63#open the input and output files
64open (INPUT1, "<", $motifsInputFile) || die("Could not open file $motifsInputFile \n");
65open (INPUT2, "<", $indelFlankingSequencesInputFile) || die("Could not open file $indelFlankingSequencesInputFile \n");
66open (OUTPUT1, ">", $indelFlankingSequencesWindowsOutputFile) || die("Could not open file $indelFlankingSequencesWindowsOutputFile \n");   
67open (OUTPUT2, ">", $motifFrequenciesOutputFile) || die("Could not open file $motifFrequenciesOutputFile \n");
68
69#store the motifs input file in the array @motifsData
70my @motifsData = <INPUT1>;
71
72#iterated through the motifs (lines) of the motifs input file
73foreach $motif (@motifsData){
74        chomp ($motif);
75        #print ($motif . "\n");
76       
77        #split the motif data into its name and its sequence
78        my @motifNameAndSequenceArray = split(/\t/, $motif);
79       
80        #store the name of the motif into the array @motifNamesArray
81        push @motifNamesArray, $motifNameAndSequenceArray[0];
82       
83        #store the sequence of the motif into the array @motifSequencesArray
84        push @motifSequencesArray, $motifNameAndSequenceArray[1];
85}
86
87#compute the size of the motif names array
88$totalMotifsNumber = @motifNamesArray;
89
90#store the input file in the array @sequencesData
91my @sequencesData = <INPUT2>;
92
93#iterated through the sequences of the second input file in order to create windwos file
94foreach $sequence (@sequencesData){
95        chomp ($sequence);
96        $lineCounter1++;
97       
98        my @indelAndSequenceArray = split(/\t/, $sequence);
99       
100        #get the upstream falnking sequence
101        $upstreamFlankingSequence = $indelAndSequenceArray[3];
102       
103        #if the window size is 0, then the whole upstream will be one window only
104        if ($windowSize == 0){
105                $totalNumberOfWindowsInUpstreamSequence = 1;
106                $windowSize = length ($upstreamFlankingSequence);
107        }
108        else{
109                #compute the total number of windows into which the upstream flanking sequence will be divided
110                $totalNumberOfWindowsInUpstreamSequence = length ($upstreamFlankingSequence) / $windowSize;
111               
112                #compute the length of the subsequence to be discared from the upstream flanking sequence if any
113                $discardedSequenceLength = length ($upstreamFlankingSequence) % $windowSize;
114               
115                #check if the sequence could be split into windows of equal sizes
116            if ($discardedSequenceLength != 0) {
117                #trim the upstream flanking sequence
118                        $upstreamFlankingSequence = substr($upstreamFlankingSequence, $discardedSequenceLength);
119                }
120        }
121               
122        #split the upstream flanking sequence into windows
123        for ($windowNumber = 0; $windowNumber < $totalNumberOfWindowsInUpstreamSequence; $windowNumber++){
124                $windowStartIndex = $windowNumber * $windowSize;
125                print OUTPUT1 (substr($upstreamFlankingSequence, $windowStartIndex, $windowSize) . "\t");
126        }
127       
128        #add a column representing the indel
129        print OUTPUT1 ("indel" . "\t");
130       
131        #get the downstream falnking sequence
132        $downstreamFlankingSequence = $indelAndSequenceArray[4];
133       
134        #if the window size is 0, then the whole upstream will be one window only
135        if ($windowSize == 0){
136                $totalNumberOfWindowsInDownstreamSequence = 1;
137                $windowSize = length ($downstreamFlankingSequence);
138        }
139        else{
140                #compute the total number of windows into which the downstream flanking sequence will be divided
141                $totalNumberOfWindowsInDownstreamSequence = length ($downstreamFlankingSequence) / $windowSize;
142               
143                #compute the length of the subsequence to be discared from the upstream flanking sequence if any
144                $discardedSequenceLength = length ($downstreamFlankingSequence) % $windowSize;
145               
146                #check if the sequence could be split into windows of equal sizes
147            if ($discardedSequenceLength != 0) {
148                #compute the length of the sequence to be discarded
149                $lengthOfDownstreamFlankingSequenceAfterTrimming = length ($downstreamFlankingSequence) - $discardedSequenceLength;
150               
151                #trim the downstream flanking sequence
152                        $downstreamFlankingSequence = substr($downstreamFlankingSequence, 0, $lengthOfDownstreamFlankingSequenceAfterTrimming);
153                }
154        }
155       
156        #split the downstream flanking sequence into windows
157        for ($windowNumber = 0; $windowNumber < $totalNumberOfWindowsInDownstreamSequence; $windowNumber++){
158                $windowStartIndex = $windowNumber * $windowSize;
159                print OUTPUT1 (substr($downstreamFlankingSequence, $windowStartIndex, $windowSize) . "\t");
160        }
161       
162        print OUTPUT1 ("\n");
163}
164
165#compute the total number of windows on both upstream and downstream sequences flanking the indel
166$totalWindowsNumberInBothFlankingSequences = $totalNumberOfWindowsInUpstreamSequence + $totalNumberOfWindowsInDownstreamSequence;
167
168#add an additional cell to store the name of the motif and another one for the indel itself
169$totalWindowsNumberInMotifCountersTwoDimArray = $totalWindowsNumberInBothFlankingSequences + 1 + 1;
170
171#initialize the two dimensional array $motifCountersTwoDimArray. the first column will be initialized with motif names
172for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){
173       
174        for ($windowNumber = 0; $windowNumber < $totalWindowsNumberInMotifCountersTwoDimArray; $windowNumber++){
175               
176                if ($windowNumber == 0){
177                        $motifCountersTwoDimArray [$motifNumber] [0] = $motifNamesArray[$motifNumber];
178                }
179                elsif ($windowNumber == $totalNumberOfWindowsInUpstreamSequence + 1){
180                        $motifCountersTwoDimArray [$motifNumber] [$windowNumber] = "indel";
181                }
182                else{
183                        $motifCountersTwoDimArray [$motifNumber] [$windowNumber] = 0;
184                }
185        }
186}
187
188close(OUTPUT1);
189
190#open the file the contains the windows of the upstream and downstream flanking sequences, which is actually the first output file
191open (INPUT3, "<", $indelFlankingSequencesWindowsOutputFile) || die("Could not open file $indelFlankingSequencesWindowsOutputFile \n");   
192
193#store the first output file containing the windows of both upstream and downstream flanking sequences in the array @windowsData
194my @windowsData = <INPUT3>;
195
196#iterated through the lines of the first output file. Each line represents   
197#the windows of the upstream and downstream flanking sequences of an indel
198foreach $upstreamAndDownstreamFlankingSequencesWindows (@windowsData){
199       
200        chomp ($upstreamAndDownstreamFlankingSequencesWindows);
201        $lineCounter2++;
202       
203        #split both upstream and downstream flanking sequences into their windows
204        my @windowsArray = split(/\t/, $upstreamAndDownstreamFlankingSequencesWindows);
205       
206        $totalWindowsNumber = @windowsArray;
207       
208        #iterate through the windows to search for matched motifs and increment their corresponding counters accordingly
209        WINDOWS:
210        for ($windowNumber = 0; $windowNumber < $totalWindowsNumber; $windowNumber++){
211               
212                #get the window
213                $window = $windowsArray[$windowNumber];
214               
215        #if the window is the one that contains the indel, then skip the indel window
216        if ($window eq "indel") { 
217                next WINDOWS;   
218        }
219        else{  #iterated through the motif sequences to check their occurrences in the winodw
220               #and increment their corresponding counters accordingly
221               
222                for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){
223                        #get the motif sequence
224                        $motifSequence = $motifSequencesArray[$motifNumber];
225                       
226                        #if the motif is found in the window, then increment its corresponding counter
227                        if ($window =~ m/$motifSequence/i){
228                                $motifCountersTwoDimArray [$motifNumber] [$windowNumber + 1]++;
229                        } 
230                }
231        }
232        }
233}
234
235#store the motif counters values in the second output file
236for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){
237       
238        for ($windowNumber = 0; $windowNumber <= $totalWindowsNumber; $windowNumber++){
239               
240                print OUTPUT2 $motifCountersTwoDimArray [$motifNumber] [$windowNumber] . "\t";
241                #print ($motifCountersTwoDimArray [$motifNumber] [$windowNumber] . " ");
242        }
243        print OUTPUT2 "\n";
244        #print ("\n");
245}
246               
247#close the input and output files
248close(OUTPUT2);
249close(OUTPUT1);
250close(INPUT3);
251close(INPUT2);
252close(INPUT1);
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。