[2] | 1 | #!/usr/bin/env bash |
---|
| 2 | |
---|
| 3 | input_file="$1" |
---|
| 4 | output_file="$2" |
---|
| 5 | chromInfo_file="$3" |
---|
| 6 | chrom="$4" |
---|
| 7 | score_col="$5" |
---|
| 8 | hilbert_curve_level="$6" |
---|
| 9 | summarization_mode="$7" |
---|
| 10 | chrom_col="$8" |
---|
| 11 | start_col="$9" |
---|
| 12 | end_col="${10}" |
---|
| 13 | strand_col="${11}" |
---|
| 14 | |
---|
| 15 | ## use first sequence if chrom filed is empty |
---|
| 16 | if [ -z "$chrom" ]; then |
---|
| 17 | chrom=$( head -n 1 "$input_file" | cut -f$chrom_col ) |
---|
| 18 | fi |
---|
| 19 | |
---|
| 20 | ## get sequence length |
---|
| 21 | if [ ! -r "$chromInfo_file" ]; then |
---|
| 22 | echo "Unable to read chromInfo_file $chromInfo_file" 1>&2 |
---|
| 23 | exit 1 |
---|
| 24 | fi |
---|
| 25 | |
---|
| 26 | chrom_len=$( awk '$1 == chrom {print $2}' chrom=$chrom $chromInfo_file ) |
---|
| 27 | |
---|
| 28 | ## error if we can't find the chrom_len |
---|
| 29 | if [ -z "$chrom_len" ]; then |
---|
| 30 | echo "Can't find length for sequence \"$chrom\" in chromInfo_file $chromInfo_file" 1>&2 |
---|
| 31 | exit 1 |
---|
| 32 | fi |
---|
| 33 | |
---|
| 34 | ## make sure chrom_len is positive |
---|
| 35 | if [ $chrom_len -le 0 ]; then |
---|
| 36 | echo "sequence \"$chrom\" length $chrom_len <= 0" 1>&2 |
---|
| 37 | exit 1 |
---|
| 38 | fi |
---|
| 39 | |
---|
| 40 | ## modify R script depending on the inclusion of a score column, strand information |
---|
| 41 | input_cols="\$${start_col}, \$${end_col}" |
---|
| 42 | col_types='beg=0, end=0, strand=""' |
---|
| 43 | |
---|
| 44 | # if strand_col == 0 (strandCol metadata is not set), assume everything's on the plus strand |
---|
| 45 | if [ $strand_col -ne 0 ]; then |
---|
| 46 | input_cols="${input_cols}, \$${strand_col}" |
---|
| 47 | else |
---|
| 48 | input_cols="${input_cols}, \\\"+\\\"" |
---|
| 49 | fi |
---|
| 50 | |
---|
| 51 | # set plot value (either from data or use a constant value) |
---|
| 52 | if [ $score_col -eq -1 ]; then |
---|
| 53 | value=1 |
---|
| 54 | else |
---|
| 55 | input_cols="${input_cols}, \$${score_col}" |
---|
| 56 | col_types="${col_types}, score=0" |
---|
| 57 | value='chunk$score[i]' |
---|
| 58 | fi |
---|
| 59 | |
---|
| 60 | R --vanilla &> /dev/null <<endOfCode |
---|
| 61 | library(HilbertVis); |
---|
| 62 | |
---|
| 63 | chrom_len <- ${chrom_len}; |
---|
| 64 | chunk_size <- 1000; |
---|
| 65 | interval_count <- 0; |
---|
| 66 | invalid_strand <- 0; |
---|
| 67 | |
---|
| 68 | awk_cmd <- paste( |
---|
| 69 | "awk 'BEGIN{FS=\"\t\";OFS=\"\t\"}", |
---|
| 70 | "\$${chrom_col} == \"${chrom}\"", |
---|
| 71 | "{print ${input_cols}}' ${input_file}" |
---|
| 72 | ); |
---|
| 73 | |
---|
| 74 | col_types <- list(${col_types}); |
---|
| 75 | vec <- vector(mode="numeric", length=chrom_len); |
---|
| 76 | conn <- pipe(description=awk_cmd, open="r"); |
---|
| 77 | |
---|
| 78 | repeat { |
---|
| 79 | chunk <- scan(file=conn, what=col_types, sep="\t", nlines=chunk_size, quiet=TRUE); |
---|
| 80 | |
---|
| 81 | if ((rows <- length(chunk\$beg)) == 0) |
---|
| 82 | break; |
---|
| 83 | |
---|
| 84 | interval_count <- interval_count + rows; |
---|
| 85 | |
---|
| 86 | for (i in 1:rows) { |
---|
| 87 | if (chunk\$strand[i] == '+') { |
---|
| 88 | start <- chunk\$beg[i] + 1; |
---|
| 89 | stop <- chunk\$end[i]; |
---|
| 90 | } else if (chunk\$strand[i] == '-') { |
---|
| 91 | start <- chrom_len - chunk\$end[i] - 1; |
---|
| 92 | stop <- chrom_len - chunk\$beg[i]; |
---|
| 93 | } else { |
---|
| 94 | invalid_strand <- invalid_strand + 1; |
---|
| 95 | interval_count <- interval_count - 1; |
---|
| 96 | next; |
---|
| 97 | } |
---|
| 98 | vec[start:stop] <- ${value}; |
---|
| 99 | } |
---|
| 100 | } |
---|
| 101 | |
---|
| 102 | close(conn); |
---|
| 103 | |
---|
| 104 | hMat <- hilbertImage(vec, level=$hilbert_curve_level, mode="$summarization_mode"); |
---|
| 105 | pdf(file="$output_file", onefile=TRUE, width=8, height=10.5, paper="letter"); |
---|
| 106 | showHilbertImage(hMat); |
---|
| 107 | dev.off(); |
---|
| 108 | endOfCode |
---|
| 109 | |
---|