1 | #!/usr/bin/env python |
---|
2 | """ |
---|
3 | Cluster regions of intervals. |
---|
4 | |
---|
5 | usage: %prog in_file out_file |
---|
6 | -1, --cols1=N,N,N,N: Columns for start, end, strand in file |
---|
7 | -d, --distance=N: Maximum distance between clustered intervals |
---|
8 | -v, --overlap=N: Minimum overlap require (negative distance) |
---|
9 | -m, --minregions=N: Minimum regions per cluster |
---|
10 | -o, --output=N: 1)merged 2)filtered 3)clustered 4) minimum 5) maximum |
---|
11 | """ |
---|
12 | from galaxy import eggs |
---|
13 | import pkg_resources |
---|
14 | pkg_resources.require( "bx-python" ) |
---|
15 | import sys, traceback, fileinput |
---|
16 | from warnings import warn |
---|
17 | from bx.intervals import * |
---|
18 | from bx.intervals.io import * |
---|
19 | from bx.intervals.operations.find_clusters import * |
---|
20 | from bx.cookbook import doc_optparse |
---|
21 | from galaxy.tools.util.galaxyops import * |
---|
22 | |
---|
23 | assert sys.version_info[:2] >= ( 2, 4 ) |
---|
24 | |
---|
25 | def main(): |
---|
26 | distance = 0 |
---|
27 | minregions = 2 |
---|
28 | output = 1 |
---|
29 | upstream_pad = 0 |
---|
30 | downstream_pad = 0 |
---|
31 | |
---|
32 | options, args = doc_optparse.parse( __doc__ ) |
---|
33 | try: |
---|
34 | chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) |
---|
35 | if options.distance: distance = int( options.distance ) |
---|
36 | if options.overlap: distance = -1 * int( options.overlap ) |
---|
37 | if options.output: output = int( options.output ) |
---|
38 | if options.minregions: minregions = int( options.minregions ) |
---|
39 | in_fname, out_fname = args |
---|
40 | except: |
---|
41 | doc_optparse.exception() |
---|
42 | |
---|
43 | g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ), |
---|
44 | chrom_col=chr_col_1, |
---|
45 | start_col=start_col_1, |
---|
46 | end_col=end_col_1, |
---|
47 | strand_col=strand_col_1, |
---|
48 | fix_strand=True ) |
---|
49 | |
---|
50 | # Get the cluster tree |
---|
51 | try: |
---|
52 | clusters, extra = find_clusters( g1, mincols=distance, minregions=minregions) |
---|
53 | except ParseError, exc: |
---|
54 | fail( "Invalid file format: %s" % str( exc ) ) |
---|
55 | |
---|
56 | f1 = open( in_fname, "r" ) |
---|
57 | out_file = open( out_fname, "w" ) |
---|
58 | |
---|
59 | # If "merge" |
---|
60 | if output == 1: |
---|
61 | fields = ["." for x in range(max(g1.chrom_col, g1.start_col, g1.end_col)+1)] |
---|
62 | for chrom, tree in clusters.items(): |
---|
63 | for start, end, lines in tree.getregions(): |
---|
64 | fields[g1.chrom_col] = chrom |
---|
65 | fields[g1.start_col] = str(start) |
---|
66 | fields[g1.end_col] = str(end) |
---|
67 | out_file.write( "%s\n" % "\t".join( fields ) ) |
---|
68 | |
---|
69 | # If "filtered" we preserve order of file and comments, etc. |
---|
70 | if output == 2: |
---|
71 | linenums = dict() |
---|
72 | for chrom, tree in clusters.items(): |
---|
73 | for linenum in tree.getlines(): |
---|
74 | linenums[linenum] = 0 |
---|
75 | linenum = -1 |
---|
76 | f1.seek(0) |
---|
77 | for line in f1.readlines(): |
---|
78 | linenum += 1 |
---|
79 | if linenum in linenums or linenum in extra: |
---|
80 | out_file.write( "%s\n" % line.rstrip( "\n\r" ) ) |
---|
81 | |
---|
82 | # If "clustered" we output original intervals, but near each other (i.e. clustered) |
---|
83 | if output == 3: |
---|
84 | linenums = list() |
---|
85 | f1.seek(0) |
---|
86 | fileLines = f1.readlines() |
---|
87 | for chrom, tree in clusters.items(): |
---|
88 | for linenum in tree.getlines(): |
---|
89 | out_file.write( "%s\n" % fileLines[linenum].rstrip( "\n\r" ) ) |
---|
90 | |
---|
91 | # If "minimum" we output the smallest interval in each cluster |
---|
92 | if output == 4 or output == 5: |
---|
93 | linenums = list() |
---|
94 | f1.seek(0) |
---|
95 | fileLines = f1.readlines() |
---|
96 | for chrom, tree in clusters.items(): |
---|
97 | regions = tree.getregions() |
---|
98 | for start, end, lines in tree.getregions(): |
---|
99 | outsize = -1 |
---|
100 | outinterval = None |
---|
101 | for line in lines: |
---|
102 | # three nested for loops? |
---|
103 | # should only execute this code once per line |
---|
104 | fileline = fileLines[line].rstrip("\n\r") |
---|
105 | try: |
---|
106 | cluster_interval = GenomicInterval( g1, fileline.split("\t"), |
---|
107 | g1.chrom_col, |
---|
108 | g1.start_col, |
---|
109 | g1.end_col, |
---|
110 | g1.strand_col, |
---|
111 | g1.default_strand, |
---|
112 | g1.fix_strand ) |
---|
113 | except Exception, exc: |
---|
114 | print >> sys.stderr, str( exc ) |
---|
115 | f1.close() |
---|
116 | sys.exit() |
---|
117 | interval_size = cluster_interval.end - cluster_interval.start |
---|
118 | if outsize == -1 or \ |
---|
119 | ( outsize > interval_size and output == 4 ) or \ |
---|
120 | ( outsize < interval_size and output == 5 ) : |
---|
121 | outinterval = cluster_interval |
---|
122 | outsize = interval_size |
---|
123 | out_file.write( "%s\n" % outinterval ) |
---|
124 | |
---|
125 | f1.close() |
---|
126 | out_file.close() |
---|
127 | |
---|
128 | if g1.skipped > 0: |
---|
129 | print skipped( g1, filedesc="" ) |
---|
130 | |
---|
131 | if __name__ == "__main__": |
---|
132 | main() |
---|