root/galaxy-central/tools/regVariation/compute_motif_frequencies_for_all_motifs.pl

リビジョン 2, 5.8 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
#!/usr/bin/perl -w

# Compute, for every motif, how many of the windows flanking each indel
# contain that motif, separately for the upstream and downstream sides.
#
# Arguments (all four are required):
#   1. TABULAR motifs file: two tab-separated columns per line, the motif
#      name and the motif sequence. The sequence is used as a
#      case-insensitive regular expression.
#   2. TABULAR windows file: one line per indel, holding the tab-separated
#      windows of the upstream flanking sequence, the indel itself, and the
#      windows of the downstream flanking sequence.
#   3. Integer number of windows to consider on each side of the indel
#      (values of 0 or less are treated as 1).
#   4. TABULAR output file: three tab-separated columns per line, the motif
#      name, the number of considered upstream windows containing the motif,
#      and the number of considered downstream windows containing the motif.
#
# The output is grouped by motif, one line per indel, so the total number of
# output lines = number of motifs x number of indels.

use strict;
use warnings;

# Count how many windows with index in [$first, $last] match $regex.
# The range is clamped to the bounds of the array: a negative subscript
# would otherwise wrap around to the end of the array (counting downstream
# windows as upstream ones) and a subscript past the end would yield undef.
sub count_matching_windows {
    my ($windows, $regex, $first, $last) = @_;

    $first = 0             if $first < 0;
    $last  = $#{$windows}  if $last > $#{$windows};

    my $count = 0;
    for my $index ($first .. $last) {
        $count++ if $windows->[$index] =~ $regex;
    }
    return $count;
}

# check to make sure having correct files
my $usage = "usage: compute_motif_frequencies_for_all_motifs.pl [TABULAR.in] [TABULAR.in] [windowSize] [TABULAR.out] \n";
die $usage unless @ARGV == 4;

# get the input arguments
my ($motifs_file, $windows_file, $considered_windows, $output_file) = @ARGV;

# the number of considered windows must be an integer; if it is 0 or
# negative, make it equal to 1
die "windowSize must be an integer, got '$considered_windows' \n" . $usage
    unless $considered_windows =~ /\A[+-]?\d+\z/;
$considered_windows = 1 if $considered_windows <= 0;

# read the motifs: each usable line gives a name and a precompiled pattern
open(my $motifs_fh, "<", $motifs_file)
    or die("Could not open file $motifs_file: $! \n");

my @motifs = ();
while (my $line = <$motifs_fh>) {
    # strip the line ending, tolerating CRLF files
    $line =~ s/\r?\n\z//;

    my ($name, $sequence) = split(/\t/, $line);

    # skip blank lines and lines without a motif sequence: an empty pattern
    # would not mean "match nothing" to the regex engine
    next unless defined $sequence && length $sequence;

    push @motifs, { name => $name, regex => qr/$sequence/i };
}
close($motifs_fh);

# read the windows of both upstream and downstream flanking sequences,
# one line per indel
open(my $windows_fh, "<", $windows_file)
    or die("Could not open file $windows_file: $! \n");

my @window_lines = <$windows_fh>;
s/\r?\n\z// for @window_lines;
close($windows_fh);

# the position of the indel column is derived from the first line only and
# is taken to be the middle column of that line
my $indel_index = 0;
if (@window_lines) {
    my @first_line_windows = split(/\t/, $window_lines[0]);
    $indel_index = int((scalar(@first_line_windows) - 1) / 2);
}

open(my $output_fh, ">", $output_file)
    or die("Could not open file $output_file: $! \n");

# iterate through the motifs and, for each one, through the indels
for my $motif (@motifs) {
    for my $window_line (@window_lines) {

        # split both upstream and downstream flanking sequences into windows
        my @windows = split(/\t/, $window_line);

        # the considered windows immediately upstream of the indel
        my $upstream_count = count_matching_windows(
            \@windows, $motif->{regex},
            $indel_index - $considered_windows, $indel_index - 1
        );

        # the considered windows immediately downstream of the indel
        my $downstream_count = count_matching_windows(
            \@windows, $motif->{regex},
            $indel_index + 1, $indel_index + $considered_windows
        );

        # store the result into the output file
        print {$output_fh} $motif->{name} . "\t" . $upstream_count . "\t" . $downstream_count . "\n";
    }
}

# buffered write errors only surface when the handle is closed
close($output_fh)
    or die("Could not close file $output_file: $! \n");
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。