1 | #!/usr/bin/env bash |
---|
2 | |
---|
3 | input_file="$1" |
---|
4 | output_file="$2" |
---|
5 | chromInfo_file="$3" |
---|
6 | chrom="$4" |
---|
7 | score_col="$5" |
---|
8 | hilbert_curve_level="$6" |
---|
9 | summarization_mode="$7" |
---|
10 | chrom_col="$8" |
---|
11 | start_col="$9" |
---|
12 | end_col="${10}" |
---|
13 | strand_col="${11}" |
---|
14 | |
---|
15 | ## use first sequence if chrom filed is empty |
---|
16 | if [ -z "$chrom" ]; then |
---|
17 | chrom=$( head -n 1 "$input_file" | cut -f$chrom_col ) |
---|
18 | fi |
---|
19 | |
---|
20 | ## get sequence length |
---|
21 | if [ ! -r "$chromInfo_file" ]; then |
---|
22 | echo "Unable to read chromInfo_file $chromInfo_file" 1>&2 |
---|
23 | exit 1 |
---|
24 | fi |
---|
25 | |
---|
26 | chrom_len=$( awk '$1 == chrom {print $2}' chrom=$chrom $chromInfo_file ) |
---|
27 | |
---|
28 | ## error if we can't find the chrom_len |
---|
29 | if [ -z "$chrom_len" ]; then |
---|
30 | echo "Can't find length for sequence \"$chrom\" in chromInfo_file $chromInfo_file" 1>&2 |
---|
31 | exit 1 |
---|
32 | fi |
---|
33 | |
---|
34 | ## make sure chrom_len is positive |
---|
35 | if [ $chrom_len -le 0 ]; then |
---|
36 | echo "sequence \"$chrom\" length $chrom_len <= 0" 1>&2 |
---|
37 | exit 1 |
---|
38 | fi |
---|
39 | |
---|
40 | ## modify R script depending on the inclusion of a score column, strand information |
---|
41 | input_cols="\$${start_col}, \$${end_col}" |
---|
42 | col_types='beg=0, end=0, strand=""' |
---|
43 | |
---|
44 | # if strand_col == 0 (strandCol metadata is not set), assume everything's on the plus strand |
---|
45 | if [ $strand_col -ne 0 ]; then |
---|
46 | input_cols="${input_cols}, \$${strand_col}" |
---|
47 | else |
---|
48 | input_cols="${input_cols}, \\\"+\\\"" |
---|
49 | fi |
---|
50 | |
---|
51 | # set plot value (either from data or use a constant value) |
---|
52 | if [ $score_col -eq -1 ]; then |
---|
53 | value=1 |
---|
54 | else |
---|
55 | input_cols="${input_cols}, \$${score_col}" |
---|
56 | col_types="${col_types}, score=0" |
---|
57 | value='chunk$score[i]' |
---|
58 | fi |
---|
59 | |
---|
60 | R --vanilla &> /dev/null <<endOfCode |
---|
61 | library(HilbertVis); |
---|
62 | |
---|
63 | chrom_len <- ${chrom_len}; |
---|
64 | chunk_size <- 1000; |
---|
65 | interval_count <- 0; |
---|
66 | invalid_strand <- 0; |
---|
67 | |
---|
68 | awk_cmd <- paste( |
---|
69 | "awk 'BEGIN{FS=\"\t\";OFS=\"\t\"}", |
---|
70 | "\$${chrom_col} == \"${chrom}\"", |
---|
71 | "{print ${input_cols}}' ${input_file}" |
---|
72 | ); |
---|
73 | |
---|
74 | col_types <- list(${col_types}); |
---|
75 | vec <- vector(mode="numeric", length=chrom_len); |
---|
76 | conn <- pipe(description=awk_cmd, open="r"); |
---|
77 | |
---|
78 | repeat { |
---|
79 | chunk <- scan(file=conn, what=col_types, sep="\t", nlines=chunk_size, quiet=TRUE); |
---|
80 | |
---|
81 | if ((rows <- length(chunk\$beg)) == 0) |
---|
82 | break; |
---|
83 | |
---|
84 | interval_count <- interval_count + rows; |
---|
85 | |
---|
86 | for (i in 1:rows) { |
---|
87 | if (chunk\$strand[i] == '+') { |
---|
88 | start <- chunk\$beg[i] + 1; |
---|
89 | stop <- chunk\$end[i]; |
---|
90 | } else if (chunk\$strand[i] == '-') { |
---|
91 | start <- chrom_len - chunk\$end[i] - 1; |
---|
92 | stop <- chrom_len - chunk\$beg[i]; |
---|
93 | } else { |
---|
94 | invalid_strand <- invalid_strand + 1; |
---|
95 | interval_count <- interval_count - 1; |
---|
96 | next; |
---|
97 | } |
---|
98 | vec[start:stop] <- ${value}; |
---|
99 | } |
---|
100 | } |
---|
101 | |
---|
102 | close(conn); |
---|
103 | |
---|
104 | hMat <- hilbertImage(vec, level=$hilbert_curve_level, mode="$summarization_mode"); |
---|
105 | pdf(file="$output_file", onefile=TRUE, width=8, height=10.5, paper="letter"); |
---|
106 | showHilbertImage(hMat); |
---|
107 | dev.off(); |
---|
108 | endOfCode |
---|
109 | |
---|