1 | #!/usr/bin/perl -w |
---|
2 | |
---|
3 | use warnings; |
---|
4 | use IO::Handle; |
---|
5 | use POSIX qw(floor ceil); |
---|
6 | |
---|
7 | # example: perl execute_dwt_var_perClass.pl hg18_NCNR_10bp_3flanks_deletionHotspot_data_del.txt deletionHotspot 3flanks del |
---|
8 | |
---|
9 | $usage = "execute_dwt_var_perClass.pl [TABULAR.in] [TABULAR.out] [TABULAR.out] [PDF.out] \n"; |
---|
10 | die $usage unless @ARGV == 4; |
---|
11 | |
---|
12 | #get the input arguments |
---|
13 | my $inputFile = $ARGV[0]; |
---|
14 | my $firstOutputFile = $ARGV[1]; |
---|
15 | my $secondOutputFile = $ARGV[2]; |
---|
16 | my $thirdOutputFile = $ARGV[3]; |
---|
17 | |
---|
18 | open (INPUT, "<", $inputFile) || die("Could not open file $inputFile \n"); |
---|
19 | open (OUTPUT1, ">", $firstOutputFile) || die("Could not open file $firstOutputFile \n"); |
---|
20 | open (OUTPUT2, ">", $secondOutputFile) || die("Could not open file $secondOutputFile \n"); |
---|
21 | open (OUTPUT3, ">", $thirdOutputFile) || die("Could not open file $thirdOutputFile \n"); |
---|
22 | open (ERROR, ">", "error.txt") or die ("Could not open file error.txt \n"); |
---|
23 | |
---|
24 | #save all error messages into the error file $errorFile using the error file handle ERROR |
---|
25 | STDERR -> fdopen( \*ERROR, "w" ) or die ("Could not direct errors to the error file error.txt \n"); |
---|
26 | |
---|
27 | # choosing meaningful names for the output files |
---|
28 | $max_dwt = $firstOutputFile; |
---|
29 | $pvalue = $secondOutputFile; |
---|
30 | $pdf = $thirdOutputFile; |
---|
31 | |
---|
32 | # count the number of columns in the input file |
---|
33 | while($buffer = <INPUT>){ |
---|
34 | #if ($buffer =~ m/interval/){ |
---|
35 | chomp($buffer); |
---|
36 | $buffer =~ s/^#\s*//; |
---|
37 | @contrl = split(/\t/, $buffer); |
---|
38 | last; |
---|
39 | #} |
---|
40 | } |
---|
41 | print "The number of columns in the input file is: " . (@contrl) . "\n"; |
---|
42 | print "\n"; |
---|
43 | |
---|
44 | # count the number of motifs in the input file |
---|
45 | $count = 0; |
---|
46 | for ($i = 0; $i < @contrl; $i++){ |
---|
47 | $count++; |
---|
48 | print "# $contrl[$i]\n"; |
---|
49 | } |
---|
50 | print "The number of motifs in the input file is: $count \n"; |
---|
51 | |
---|
52 | # check if the number of motifs is not a multiple of 12, and round up is so |
---|
53 | $count2 = ($count/12); |
---|
54 | if ($count2 =~ m/(\D)/){ |
---|
55 | print "the number of motifs is not a multiple of 12 \n"; |
---|
56 | $count2 = ceil($count2); |
---|
57 | } |
---|
58 | else { |
---|
59 | print "the number of motifs is a multiple of 12 \n"; |
---|
60 | } |
---|
61 | print "There will be $count2 subfiles\n\n"; |
---|
62 | |
---|
63 | # split infile into subfiles only 12 motif per file for R plotting |
---|
64 | for ($x = 1; $x <= $count2; $x++){ |
---|
65 | $a = (($x - 1) * 12 + 1); |
---|
66 | $b = $x * 12; |
---|
67 | |
---|
68 | if ($x < $count2){ |
---|
69 | print "# data.short $x <- data_test[, +c($a:$b)]; \n"; |
---|
70 | } |
---|
71 | else{ |
---|
72 | print "# data.short $x <- data_test[, +c($a:ncol(data_test)]; \n"; |
---|
73 | } |
---|
74 | } |
---|
75 | |
---|
76 | print "\n"; |
---|
77 | print "There are 4 output files: \n"; |
---|
78 | print "The first output file is a pdf file\n"; |
---|
79 | print "The second output file is a max_dwt file\n"; |
---|
80 | print "The third output file is a pvalues file\n"; |
---|
81 | print "The fourth output file is a test_final_pvalues file\n"; |
---|
82 | |
---|
83 | # write R script |
---|
84 | $r_script = "get_dwt_varPermut_getMax.r"; |
---|
85 | print "The R file name is: $r_script \n"; |
---|
86 | |
---|
87 | open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \n\n"; |
---|
88 | |
---|
89 | print Rcmd " |
---|
90 | ###################################################################### |
---|
91 | # plot power spectra, i.e. wavelet variance by class |
---|
92 | # add code to create null bands by permuting the original data series |
---|
93 | # get class of maximum significant variance per feature |
---|
94 | # generate plots and table matrix of variance including p-values |
---|
95 | ###################################################################### |
---|
96 | library(\"Rwave\"); |
---|
97 | library(\"wavethresh\"); |
---|
98 | library(\"waveslim\"); |
---|
99 | |
---|
100 | options(echo = FALSE) |
---|
101 | |
---|
102 | # normalize data |
---|
103 | norm <- function(data){ |
---|
104 | v <- (data-mean(data))/sd(data); |
---|
105 | if(sum(is.na(v)) >= 1){ |
---|
106 | v<-data; |
---|
107 | } |
---|
108 | return(v); |
---|
109 | } |
---|
110 | |
---|
111 | dwt_var_permut_getMax <- function(data, names, filter = 4, bc = \"symmetric\", method = \"kendall\", wf = \"haar\", boundary = \"reflection\") { |
---|
112 | max_var = NULL; |
---|
113 | matrix = NULL; |
---|
114 | title = NULL; |
---|
115 | final_pvalue = NULL; |
---|
116 | short.levels = NULL; |
---|
117 | scale = NULL; |
---|
118 | |
---|
119 | print(names); |
---|
120 | |
---|
121 | par(mfcol = c(length(names), length(names)), mar = c(0, 0, 0, 0), oma = c(4, 3, 3, 2), xaxt = \"s\", cex = 1, las = 1); |
---|
122 | |
---|
123 | short.levels <- wd(data[, 1], filter.number = filter, bc = bc)\$nlevels; |
---|
124 | |
---|
125 | title <- c(\"motif\"); |
---|
126 | for (i in 1:short.levels){ |
---|
127 | title <- c(title, paste(i, \"var\", sep = \"_\"), paste(i, \"pval\", sep = \"_\"), paste(i, \"test\", sep = \"_\")); |
---|
128 | } |
---|
129 | print(title); |
---|
130 | |
---|
131 | # normalize the raw data |
---|
132 | data<-apply(data,2,norm); |
---|
133 | |
---|
134 | for(i in 1:length(names)){ |
---|
135 | for(j in 1:length(names)){ |
---|
136 | temp = NULL; |
---|
137 | results = NULL; |
---|
138 | wave1.dwt = NULL; |
---|
139 | out = NULL; |
---|
140 | |
---|
141 | out <- vector(length = length(title)); |
---|
142 | temp <- vector(length = short.levels); |
---|
143 | |
---|
144 | if(i < j) { |
---|
145 | plot(temp, type = \"n\", axes = FALSE, xlab = NA, ylab = NA); |
---|
146 | box(col = \"grey\"); |
---|
147 | grid(ny = 0, nx = NULL); |
---|
148 | } else { |
---|
149 | if (i > j){ |
---|
150 | plot(temp, type = \"n\", axes = FALSE, xlab = NA, ylab = NA); |
---|
151 | box(col = \"grey\"); |
---|
152 | grid(ny = 0, nx = NULL); |
---|
153 | } else { |
---|
154 | |
---|
155 | wave1.dwt <- dwt(data[, i], wf = wf, short.levels, boundary = boundary); |
---|
156 | |
---|
157 | temp_row = (short.levels + 1 ) * -1; |
---|
158 | temp_col = 1; |
---|
159 | temp <- wave.variance(wave1.dwt)[temp_row, temp_col]; |
---|
160 | |
---|
161 | #permutations code : |
---|
162 | feature1 = NULL; |
---|
163 | null = NULL; |
---|
164 | var_25 = NULL; |
---|
165 | var_975 = NULL; |
---|
166 | med = NULL; |
---|
167 | |
---|
168 | feature1 = data[, i]; |
---|
169 | for (k in 1:1000) { |
---|
170 | nk_1 = NULL; |
---|
171 | null.levels = NULL; |
---|
172 | var = NULL; |
---|
173 | null_wave1 = NULL; |
---|
174 | |
---|
175 | nk_1 = sample(feature1, length(feature1), replace = FALSE); |
---|
176 | null.levels <- wd(nk_1, filter.number = filter, bc = bc)\$nlevels; |
---|
177 | var <- vector(length = length(null.levels)); |
---|
178 | null_wave1 <- dwt(nk_1, wf = wf, short.levels, boundary = boundary); |
---|
179 | var<- wave.variance(null_wave1)[-8, 1]; |
---|
180 | null= rbind(null, var); |
---|
181 | } |
---|
182 | null <- apply(null, 2, sort, na.last = TRUE); |
---|
183 | var_25 <- null[25, ]; |
---|
184 | var_975 <- null[975, ]; |
---|
185 | med <- (apply(null, 2, median, na.rm = TRUE)); |
---|
186 | |
---|
187 | # plot |
---|
188 | results <- cbind(temp, var_25, var_975); |
---|
189 | matplot(results, type = \"b\", pch = \"*\", lty = 1, col = c(1, 2, 2), axes = F); |
---|
190 | |
---|
191 | # get pvalues by comparison to null distribution |
---|
192 | out <- (names[i]); |
---|
193 | for (m in 1:length(temp)){ |
---|
194 | print(paste(\"scale\", m, sep = \" \")); |
---|
195 | print(paste(\"var\", temp[m], sep = \" \")); |
---|
196 | print(paste(\"med\", med[m], sep = \" \")); |
---|
197 | pv = tail = NULL; |
---|
198 | out <- c(out, format(temp[m], digits = 3)); |
---|
199 | if (temp[m] >= med[m]){ |
---|
200 | # R tail test |
---|
201 | print(\"R\"); |
---|
202 | tail <- \"R\"; |
---|
203 | pv <- (length(which(null[, m] >= temp[m])))/(length(na.exclude(null[, m]))); |
---|
204 | |
---|
205 | } else { |
---|
206 | if (temp[m] < med[m]){ |
---|
207 | # L tail test |
---|
208 | print(\"L\"); |
---|
209 | tail <- \"L\"; |
---|
210 | pv <- (length(which(null[, m] <= temp[m])))/(length(na.exclude(null[, m]))); |
---|
211 | } |
---|
212 | } |
---|
213 | out <- c(out, pv); |
---|
214 | print(pv); |
---|
215 | out <- c(out, tail); |
---|
216 | } |
---|
217 | final_pvalue <-rbind(final_pvalue, out); |
---|
218 | |
---|
219 | |
---|
220 | # get variances outside null bands by comparing temp to null |
---|
221 | ## temp stores variance for each scale, and null stores permuted variances for null bands |
---|
222 | for (n in 1:length(temp)){ |
---|
223 | if (temp[n] <= var_975[n]){ |
---|
224 | temp[n] <- NA; |
---|
225 | } else { |
---|
226 | temp[n] <- temp[n]; |
---|
227 | } |
---|
228 | } |
---|
229 | matrix <- rbind(matrix, temp) |
---|
230 | } |
---|
231 | } |
---|
232 | # labels |
---|
233 | if (i == 1){ |
---|
234 | mtext(names[j], side = 2, line = 0.5, las = 3, cex = 0.25); |
---|
235 | } |
---|
236 | if (j == 1){ |
---|
237 | mtext(names[i], side = 3, line = 0.5, cex = 0.25); |
---|
238 | } |
---|
239 | if (j == length(names)){ |
---|
240 | axis(1, at = (1:short.levels), las = 3, cex.axis = 0.5); |
---|
241 | } |
---|
242 | } |
---|
243 | } |
---|
244 | colnames(final_pvalue) <- title; |
---|
245 | #write.table(final_pvalue, file = \"test_final_pvalue.txt\", sep = \"\\t\", quote = FALSE, row.names = FALSE, append = TRUE); |
---|
246 | |
---|
247 | # get maximum variance larger than expectation by comparison to null bands |
---|
248 | varnames <- vector(); |
---|
249 | for(i in 1:length(names)){ |
---|
250 | name1 = paste(names[i], \"var\", sep = \"_\") |
---|
251 | varnames <- c(varnames, name1) |
---|
252 | } |
---|
253 | rownames(matrix) <- varnames; |
---|
254 | colnames(matrix) <- (1:short.levels); |
---|
255 | max_var <- names; |
---|
256 | scale <- vector(length = length(names)); |
---|
257 | for (x in 1:nrow(matrix)){ |
---|
258 | if (length(which.max(matrix[x, ])) == 0){ |
---|
259 | scale[x] <- NA; |
---|
260 | } |
---|
261 | else{ |
---|
262 | scale[x] <- colnames(matrix)[which.max(matrix[x, ])]; |
---|
263 | } |
---|
264 | } |
---|
265 | max_var <- cbind(max_var, scale); |
---|
266 | write.table(max_var, file = \"$max_dwt\", sep = \"\\t\", quote = FALSE, row.names = FALSE, append = TRUE); |
---|
267 | return(final_pvalue); |
---|
268 | }\n"; |
---|
269 | |
---|
270 | print Rcmd " |
---|
271 | # execute |
---|
272 | # read in data |
---|
273 | |
---|
274 | data_test = NULL; |
---|
275 | data_test <- read.delim(\"$inputFile\"); |
---|
276 | |
---|
277 | pdf(file = \"$pdf\", width = 11, height = 8); |
---|
278 | |
---|
279 | # loop to read and execute on all $count2 subfiles |
---|
280 | final = NULL; |
---|
281 | for (x in 1:$count2){ |
---|
282 | sub = NULL; |
---|
283 | sub_names = NULL; |
---|
284 | a = NULL; |
---|
285 | b = NULL; |
---|
286 | |
---|
287 | a = ((x - 1) * 12 + 1); |
---|
288 | b = x * 12; |
---|
289 | |
---|
290 | if (x < $count2){ |
---|
291 | sub <- data_test[, +c(a:b)]; |
---|
292 | sub_names <- colnames(data_test)[a:b]; |
---|
293 | final <- rbind(final, dwt_var_permut_getMax(sub, sub_names)); |
---|
294 | } |
---|
295 | else{ |
---|
296 | sub <- data_test[, +c(a:ncol(data_test))]; |
---|
297 | sub_names <- colnames(data_test)[a:ncol(data_test)]; |
---|
298 | final <- rbind(final, dwt_var_permut_getMax(sub, sub_names)); |
---|
299 | |
---|
300 | } |
---|
301 | } |
---|
302 | |
---|
303 | dev.off(); |
---|
304 | |
---|
305 | write.table(final, file = \"$pvalue\", sep = \"\\t\", quote = FALSE, row.names = FALSE); |
---|
306 | |
---|
307 | #eof\n"; |
---|
308 | |
---|
309 | close Rcmd; |
---|
310 | |
---|
311 | system("echo \"wavelet ANOVA started on \`hostname\` at \`date\`\"\n"); |
---|
312 | system("R --no-restore --no-save --no-readline < $r_script > $r_script.out"); |
---|
313 | system("echo \"wavelet ANOVA ended on \`hostname\` at \`date\`\"\n"); |
---|
314 | |
---|
315 | #close the input and output and error files |
---|
316 | close(ERROR); |
---|
317 | close(OUTPUT3); |
---|
318 | close(OUTPUT2); |
---|
319 | close(OUTPUT1); |
---|
320 | close(INPUT); |
---|