[2] | 1 | #!/usr/bin/perl -w |
---|
| 2 | |
---|
| 3 | use warnings; |
---|
| 4 | use IO::Handle; |
---|
| 5 | use POSIX qw(floor ceil); |
---|
| 6 | |
---|
| 7 | # example: perl execute_dwt_var_perClass.pl hg18_NCNR_10bp_3flanks_deletionHotspot_data_del.txt deletionHotspot 3flanks del |
---|
| 8 | |
---|
| 9 | $usage = "execute_dwt_var_perClass.pl [TABULAR.in] [TABULAR.out] [TABULAR.out] [PDF.out] \n"; |
---|
| 10 | die $usage unless @ARGV == 4; |
---|
| 11 | |
---|
| 12 | #get the input arguments |
---|
| 13 | my $inputFile = $ARGV[0]; |
---|
| 14 | my $firstOutputFile = $ARGV[1]; |
---|
| 15 | my $secondOutputFile = $ARGV[2]; |
---|
| 16 | my $thirdOutputFile = $ARGV[3]; |
---|
| 17 | |
---|
| 18 | open (INPUT, "<", $inputFile) || die("Could not open file $inputFile \n"); |
---|
| 19 | open (OUTPUT1, ">", $firstOutputFile) || die("Could not open file $firstOutputFile \n"); |
---|
| 20 | open (OUTPUT2, ">", $secondOutputFile) || die("Could not open file $secondOutputFile \n"); |
---|
| 21 | open (OUTPUT3, ">", $thirdOutputFile) || die("Could not open file $thirdOutputFile \n"); |
---|
| 22 | open (ERROR, ">", "error.txt") or die ("Could not open file error.txt \n"); |
---|
| 23 | |
---|
| 24 | #save all error messages into the error file $errorFile using the error file handle ERROR |
---|
| 25 | STDERR -> fdopen( \*ERROR, "w" ) or die ("Could not direct errors to the error file error.txt \n"); |
---|
| 26 | |
---|
| 27 | # choosing meaningful names for the output files |
---|
| 28 | $max_dwt = $firstOutputFile; |
---|
| 29 | $pvalue = $secondOutputFile; |
---|
| 30 | $pdf = $thirdOutputFile; |
---|
| 31 | |
---|
| 32 | # count the number of columns in the input file |
---|
| 33 | while($buffer = <INPUT>){ |
---|
| 34 | #if ($buffer =~ m/interval/){ |
---|
| 35 | chomp($buffer); |
---|
| 36 | $buffer =~ s/^#\s*//; |
---|
| 37 | @contrl = split(/\t/, $buffer); |
---|
| 38 | last; |
---|
| 39 | #} |
---|
| 40 | } |
---|
| 41 | print "The number of columns in the input file is: " . (@contrl) . "\n"; |
---|
| 42 | print "\n"; |
---|
| 43 | |
---|
| 44 | # count the number of motifs in the input file |
---|
| 45 | $count = 0; |
---|
| 46 | for ($i = 0; $i < @contrl; $i++){ |
---|
| 47 | $count++; |
---|
| 48 | print "# $contrl[$i]\n"; |
---|
| 49 | } |
---|
| 50 | print "The number of motifs in the input file is: $count \n"; |
---|
| 51 | |
---|
| 52 | # check if the number of motifs is not a multiple of 12, and round up is so |
---|
| 53 | $count2 = ($count/12); |
---|
| 54 | if ($count2 =~ m/(\D)/){ |
---|
| 55 | print "the number of motifs is not a multiple of 12 \n"; |
---|
| 56 | $count2 = ceil($count2); |
---|
| 57 | } |
---|
| 58 | else { |
---|
| 59 | print "the number of motifs is a multiple of 12 \n"; |
---|
| 60 | } |
---|
| 61 | print "There will be $count2 subfiles\n\n"; |
---|
| 62 | |
---|
| 63 | # split infile into subfiles only 12 motif per file for R plotting |
---|
| 64 | for ($x = 1; $x <= $count2; $x++){ |
---|
| 65 | $a = (($x - 1) * 12 + 1); |
---|
| 66 | $b = $x * 12; |
---|
| 67 | |
---|
| 68 | if ($x < $count2){ |
---|
| 69 | print "# data.short $x <- data_test[, +c($a:$b)]; \n"; |
---|
| 70 | } |
---|
| 71 | else{ |
---|
| 72 | print "# data.short $x <- data_test[, +c($a:ncol(data_test)]; \n"; |
---|
| 73 | } |
---|
| 74 | } |
---|
| 75 | |
---|
| 76 | print "\n"; |
---|
| 77 | print "There are 4 output files: \n"; |
---|
| 78 | print "The first output file is a pdf file\n"; |
---|
| 79 | print "The second output file is a max_dwt file\n"; |
---|
| 80 | print "The third output file is a pvalues file\n"; |
---|
| 81 | print "The fourth output file is a test_final_pvalues file\n"; |
---|
| 82 | |
---|
| 83 | # write R script |
---|
| 84 | $r_script = "get_dwt_varPermut_getMax.r"; |
---|
| 85 | print "The R file name is: $r_script \n"; |
---|
| 86 | |
---|
| 87 | open(Rcmd, ">", "$r_script") or die "Cannot open $r_script \n\n"; |
---|
| 88 | |
---|
| 89 | print Rcmd " |
---|
| 90 | ###################################################################### |
---|
| 91 | # plot power spectra, i.e. wavelet variance by class |
---|
| 92 | # add code to create null bands by permuting the original data series |
---|
| 93 | # get class of maximum significant variance per feature |
---|
| 94 | # generate plots and table matrix of variance including p-values |
---|
| 95 | ###################################################################### |
---|
| 96 | library(\"Rwave\"); |
---|
| 97 | library(\"wavethresh\"); |
---|
| 98 | library(\"waveslim\"); |
---|
| 99 | |
---|
| 100 | options(echo = FALSE) |
---|
| 101 | |
---|
| 102 | # normalize data |
---|
| 103 | norm <- function(data){ |
---|
| 104 | v <- (data-mean(data))/sd(data); |
---|
| 105 | if(sum(is.na(v)) >= 1){ |
---|
| 106 | v<-data; |
---|
| 107 | } |
---|
| 108 | return(v); |
---|
| 109 | } |
---|
| 110 | |
---|
| 111 | dwt_var_permut_getMax <- function(data, names, filter = 4, bc = \"symmetric\", method = \"kendall\", wf = \"haar\", boundary = \"reflection\") { |
---|
| 112 | max_var = NULL; |
---|
| 113 | matrix = NULL; |
---|
| 114 | title = NULL; |
---|
| 115 | final_pvalue = NULL; |
---|
| 116 | short.levels = NULL; |
---|
| 117 | scale = NULL; |
---|
| 118 | |
---|
| 119 | print(names); |
---|
| 120 | |
---|
| 121 | par(mfcol = c(length(names), length(names)), mar = c(0, 0, 0, 0), oma = c(4, 3, 3, 2), xaxt = \"s\", cex = 1, las = 1); |
---|
| 122 | |
---|
| 123 | short.levels <- wd(data[, 1], filter.number = filter, bc = bc)\$nlevels; |
---|
| 124 | |
---|
| 125 | title <- c(\"motif\"); |
---|
| 126 | for (i in 1:short.levels){ |
---|
| 127 | title <- c(title, paste(i, \"var\", sep = \"_\"), paste(i, \"pval\", sep = \"_\"), paste(i, \"test\", sep = \"_\")); |
---|
| 128 | } |
---|
| 129 | print(title); |
---|
| 130 | |
---|
| 131 | # normalize the raw data |
---|
| 132 | data<-apply(data,2,norm); |
---|
| 133 | |
---|
| 134 | for(i in 1:length(names)){ |
---|
| 135 | for(j in 1:length(names)){ |
---|
| 136 | temp = NULL; |
---|
| 137 | results = NULL; |
---|
| 138 | wave1.dwt = NULL; |
---|
| 139 | out = NULL; |
---|
| 140 | |
---|
| 141 | out <- vector(length = length(title)); |
---|
| 142 | temp <- vector(length = short.levels); |
---|
| 143 | |
---|
| 144 | if(i < j) { |
---|
| 145 | plot(temp, type = \"n\", axes = FALSE, xlab = NA, ylab = NA); |
---|
| 146 | box(col = \"grey\"); |
---|
| 147 | grid(ny = 0, nx = NULL); |
---|
| 148 | } else { |
---|
| 149 | if (i > j){ |
---|
| 150 | plot(temp, type = \"n\", axes = FALSE, xlab = NA, ylab = NA); |
---|
| 151 | box(col = \"grey\"); |
---|
| 152 | grid(ny = 0, nx = NULL); |
---|
| 153 | } else { |
---|
| 154 | |
---|
| 155 | wave1.dwt <- dwt(data[, i], wf = wf, short.levels, boundary = boundary); |
---|
| 156 | |
---|
| 157 | temp_row = (short.levels + 1 ) * -1; |
---|
| 158 | temp_col = 1; |
---|
| 159 | temp <- wave.variance(wave1.dwt)[temp_row, temp_col]; |
---|
| 160 | |
---|
| 161 | #permutations code : |
---|
| 162 | feature1 = NULL; |
---|
| 163 | null = NULL; |
---|
| 164 | var_25 = NULL; |
---|
| 165 | var_975 = NULL; |
---|
| 166 | med = NULL; |
---|
| 167 | |
---|
| 168 | feature1 = data[, i]; |
---|
| 169 | for (k in 1:1000) { |
---|
| 170 | nk_1 = NULL; |
---|
| 171 | null.levels = NULL; |
---|
| 172 | var = NULL; |
---|
| 173 | null_wave1 = NULL; |
---|
| 174 | |
---|
| 175 | nk_1 = sample(feature1, length(feature1), replace = FALSE); |
---|
| 176 | null.levels <- wd(nk_1, filter.number = filter, bc = bc)\$nlevels; |
---|
| 177 | var <- vector(length = length(null.levels)); |
---|
| 178 | null_wave1 <- dwt(nk_1, wf = wf, short.levels, boundary = boundary); |
---|
| 179 | var<- wave.variance(null_wave1)[-8, 1]; |
---|
| 180 | null= rbind(null, var); |
---|
| 181 | } |
---|
| 182 | null <- apply(null, 2, sort, na.last = TRUE); |
---|
| 183 | var_25 <- null[25, ]; |
---|
| 184 | var_975 <- null[975, ]; |
---|
| 185 | med <- (apply(null, 2, median, na.rm = TRUE)); |
---|
| 186 | |
---|
| 187 | # plot |
---|
| 188 | results <- cbind(temp, var_25, var_975); |
---|
| 189 | matplot(results, type = \"b\", pch = \"*\", lty = 1, col = c(1, 2, 2), axes = F); |
---|
| 190 | |
---|
| 191 | # get pvalues by comparison to null distribution |
---|
| 192 | out <- (names[i]); |
---|
| 193 | for (m in 1:length(temp)){ |
---|
| 194 | print(paste(\"scale\", m, sep = \" \")); |
---|
| 195 | print(paste(\"var\", temp[m], sep = \" \")); |
---|
| 196 | print(paste(\"med\", med[m], sep = \" \")); |
---|
| 197 | pv = tail = NULL; |
---|
| 198 | out <- c(out, format(temp[m], digits = 3)); |
---|
| 199 | if (temp[m] >= med[m]){ |
---|
| 200 | # R tail test |
---|
| 201 | print(\"R\"); |
---|
| 202 | tail <- \"R\"; |
---|
| 203 | pv <- (length(which(null[, m] >= temp[m])))/(length(na.exclude(null[, m]))); |
---|
| 204 | |
---|
| 205 | } else { |
---|
| 206 | if (temp[m] < med[m]){ |
---|
| 207 | # L tail test |
---|
| 208 | print(\"L\"); |
---|
| 209 | tail <- \"L\"; |
---|
| 210 | pv <- (length(which(null[, m] <= temp[m])))/(length(na.exclude(null[, m]))); |
---|
| 211 | } |
---|
| 212 | } |
---|
| 213 | out <- c(out, pv); |
---|
| 214 | print(pv); |
---|
| 215 | out <- c(out, tail); |
---|
| 216 | } |
---|
| 217 | final_pvalue <-rbind(final_pvalue, out); |
---|
| 218 | |
---|
| 219 | |
---|
| 220 | # get variances outside null bands by comparing temp to null |
---|
| 221 | ## temp stores variance for each scale, and null stores permuted variances for null bands |
---|
| 222 | for (n in 1:length(temp)){ |
---|
| 223 | if (temp[n] <= var_975[n]){ |
---|
| 224 | temp[n] <- NA; |
---|
| 225 | } else { |
---|
| 226 | temp[n] <- temp[n]; |
---|
| 227 | } |
---|
| 228 | } |
---|
| 229 | matrix <- rbind(matrix, temp) |
---|
| 230 | } |
---|
| 231 | } |
---|
| 232 | # labels |
---|
| 233 | if (i == 1){ |
---|
| 234 | mtext(names[j], side = 2, line = 0.5, las = 3, cex = 0.25); |
---|
| 235 | } |
---|
| 236 | if (j == 1){ |
---|
| 237 | mtext(names[i], side = 3, line = 0.5, cex = 0.25); |
---|
| 238 | } |
---|
| 239 | if (j == length(names)){ |
---|
| 240 | axis(1, at = (1:short.levels), las = 3, cex.axis = 0.5); |
---|
| 241 | } |
---|
| 242 | } |
---|
| 243 | } |
---|
| 244 | colnames(final_pvalue) <- title; |
---|
| 245 | #write.table(final_pvalue, file = \"test_final_pvalue.txt\", sep = \"\\t\", quote = FALSE, row.names = FALSE, append = TRUE); |
---|
| 246 | |
---|
| 247 | # get maximum variance larger than expectation by comparison to null bands |
---|
| 248 | varnames <- vector(); |
---|
| 249 | for(i in 1:length(names)){ |
---|
| 250 | name1 = paste(names[i], \"var\", sep = \"_\") |
---|
| 251 | varnames <- c(varnames, name1) |
---|
| 252 | } |
---|
| 253 | rownames(matrix) <- varnames; |
---|
| 254 | colnames(matrix) <- (1:short.levels); |
---|
| 255 | max_var <- names; |
---|
| 256 | scale <- vector(length = length(names)); |
---|
| 257 | for (x in 1:nrow(matrix)){ |
---|
| 258 | if (length(which.max(matrix[x, ])) == 0){ |
---|
| 259 | scale[x] <- NA; |
---|
| 260 | } |
---|
| 261 | else{ |
---|
| 262 | scale[x] <- colnames(matrix)[which.max(matrix[x, ])]; |
---|
| 263 | } |
---|
| 264 | } |
---|
| 265 | max_var <- cbind(max_var, scale); |
---|
| 266 | write.table(max_var, file = \"$max_dwt\", sep = \"\\t\", quote = FALSE, row.names = FALSE, append = TRUE); |
---|
| 267 | return(final_pvalue); |
---|
| 268 | }\n"; |
---|
| 269 | |
---|
| 270 | print Rcmd " |
---|
| 271 | # execute |
---|
| 272 | # read in data |
---|
| 273 | |
---|
| 274 | data_test = NULL; |
---|
| 275 | data_test <- read.delim(\"$inputFile\"); |
---|
| 276 | |
---|
| 277 | pdf(file = \"$pdf\", width = 11, height = 8); |
---|
| 278 | |
---|
| 279 | # loop to read and execute on all $count2 subfiles |
---|
| 280 | final = NULL; |
---|
| 281 | for (x in 1:$count2){ |
---|
| 282 | sub = NULL; |
---|
| 283 | sub_names = NULL; |
---|
| 284 | a = NULL; |
---|
| 285 | b = NULL; |
---|
| 286 | |
---|
| 287 | a = ((x - 1) * 12 + 1); |
---|
| 288 | b = x * 12; |
---|
| 289 | |
---|
| 290 | if (x < $count2){ |
---|
| 291 | sub <- data_test[, +c(a:b)]; |
---|
| 292 | sub_names <- colnames(data_test)[a:b]; |
---|
| 293 | final <- rbind(final, dwt_var_permut_getMax(sub, sub_names)); |
---|
| 294 | } |
---|
| 295 | else{ |
---|
| 296 | sub <- data_test[, +c(a:ncol(data_test))]; |
---|
| 297 | sub_names <- colnames(data_test)[a:ncol(data_test)]; |
---|
| 298 | final <- rbind(final, dwt_var_permut_getMax(sub, sub_names)); |
---|
| 299 | |
---|
| 300 | } |
---|
| 301 | } |
---|
| 302 | |
---|
| 303 | dev.off(); |
---|
| 304 | |
---|
| 305 | write.table(final, file = \"$pvalue\", sep = \"\\t\", quote = FALSE, row.names = FALSE); |
---|
| 306 | |
---|
| 307 | #eof\n"; |
---|
| 308 | |
---|
| 309 | close Rcmd; |
---|
| 310 | |
---|
| 311 | system("echo \"wavelet ANOVA started on \`hostname\` at \`date\`\"\n"); |
---|
| 312 | system("R --no-restore --no-save --no-readline < $r_script > $r_script.out"); |
---|
| 313 | system("echo \"wavelet ANOVA ended on \`hostname\` at \`date\`\"\n"); |
---|
| 314 | |
---|
| 315 | #close the input and output and error files |
---|
| 316 | close(ERROR); |
---|
| 317 | close(OUTPUT3); |
---|
| 318 | close(OUTPUT2); |
---|
| 319 | close(OUTPUT1); |
---|
| 320 | close(INPUT); |
---|