[2] | 1 | #!/usr/bin/env python |
---|
| 2 | """ |
---|
| 3 | Cluster regions of intervals. |
---|
| 4 | |
---|
| 5 | usage: %prog in_file out_file |
---|
| 6 | -1, --cols1=N,N,N,N: Columns for start, end, strand in file |
---|
| 7 | -d, --distance=N: Maximum distance between clustered intervals |
---|
| 8 | -v, --overlap=N: Minimum overlap require (negative distance) |
---|
| 9 | -m, --minregions=N: Minimum regions per cluster |
---|
| 10 | -o, --output=N: 1)merged 2)filtered 3)clustered 4) minimum 5) maximum |
---|
| 11 | """ |
---|
| 12 | from galaxy import eggs |
---|
| 13 | import pkg_resources |
---|
| 14 | pkg_resources.require( "bx-python" ) |
---|
| 15 | import sys, traceback, fileinput |
---|
| 16 | from warnings import warn |
---|
| 17 | from bx.intervals import * |
---|
| 18 | from bx.intervals.io import * |
---|
| 19 | from bx.intervals.operations.find_clusters import * |
---|
| 20 | from bx.cookbook import doc_optparse |
---|
| 21 | from galaxy.tools.util.galaxyops import * |
---|
| 22 | |
---|
| 23 | assert sys.version_info[:2] >= ( 2, 4 ) |
---|
| 24 | |
---|
| 25 | def main(): |
---|
| 26 | distance = 0 |
---|
| 27 | minregions = 2 |
---|
| 28 | output = 1 |
---|
| 29 | upstream_pad = 0 |
---|
| 30 | downstream_pad = 0 |
---|
| 31 | |
---|
| 32 | options, args = doc_optparse.parse( __doc__ ) |
---|
| 33 | try: |
---|
| 34 | chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) |
---|
| 35 | if options.distance: distance = int( options.distance ) |
---|
| 36 | if options.overlap: distance = -1 * int( options.overlap ) |
---|
| 37 | if options.output: output = int( options.output ) |
---|
| 38 | if options.minregions: minregions = int( options.minregions ) |
---|
| 39 | in_fname, out_fname = args |
---|
| 40 | except: |
---|
| 41 | doc_optparse.exception() |
---|
| 42 | |
---|
| 43 | g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ), |
---|
| 44 | chrom_col=chr_col_1, |
---|
| 45 | start_col=start_col_1, |
---|
| 46 | end_col=end_col_1, |
---|
| 47 | strand_col=strand_col_1, |
---|
| 48 | fix_strand=True ) |
---|
| 49 | |
---|
| 50 | # Get the cluster tree |
---|
| 51 | try: |
---|
| 52 | clusters, extra = find_clusters( g1, mincols=distance, minregions=minregions) |
---|
| 53 | except ParseError, exc: |
---|
| 54 | fail( "Invalid file format: %s" % str( exc ) ) |
---|
| 55 | |
---|
| 56 | f1 = open( in_fname, "r" ) |
---|
| 57 | out_file = open( out_fname, "w" ) |
---|
| 58 | |
---|
| 59 | # If "merge" |
---|
| 60 | if output == 1: |
---|
| 61 | fields = ["." for x in range(max(g1.chrom_col, g1.start_col, g1.end_col)+1)] |
---|
| 62 | for chrom, tree in clusters.items(): |
---|
| 63 | for start, end, lines in tree.getregions(): |
---|
| 64 | fields[g1.chrom_col] = chrom |
---|
| 65 | fields[g1.start_col] = str(start) |
---|
| 66 | fields[g1.end_col] = str(end) |
---|
| 67 | out_file.write( "%s\n" % "\t".join( fields ) ) |
---|
| 68 | |
---|
| 69 | # If "filtered" we preserve order of file and comments, etc. |
---|
| 70 | if output == 2: |
---|
| 71 | linenums = dict() |
---|
| 72 | for chrom, tree in clusters.items(): |
---|
| 73 | for linenum in tree.getlines(): |
---|
| 74 | linenums[linenum] = 0 |
---|
| 75 | linenum = -1 |
---|
| 76 | f1.seek(0) |
---|
| 77 | for line in f1.readlines(): |
---|
| 78 | linenum += 1 |
---|
| 79 | if linenum in linenums or linenum in extra: |
---|
| 80 | out_file.write( "%s\n" % line.rstrip( "\n\r" ) ) |
---|
| 81 | |
---|
| 82 | # If "clustered" we output original intervals, but near each other (i.e. clustered) |
---|
| 83 | if output == 3: |
---|
| 84 | linenums = list() |
---|
| 85 | f1.seek(0) |
---|
| 86 | fileLines = f1.readlines() |
---|
| 87 | for chrom, tree in clusters.items(): |
---|
| 88 | for linenum in tree.getlines(): |
---|
| 89 | out_file.write( "%s\n" % fileLines[linenum].rstrip( "\n\r" ) ) |
---|
| 90 | |
---|
| 91 | # If "minimum" we output the smallest interval in each cluster |
---|
| 92 | if output == 4 or output == 5: |
---|
| 93 | linenums = list() |
---|
| 94 | f1.seek(0) |
---|
| 95 | fileLines = f1.readlines() |
---|
| 96 | for chrom, tree in clusters.items(): |
---|
| 97 | regions = tree.getregions() |
---|
| 98 | for start, end, lines in tree.getregions(): |
---|
| 99 | outsize = -1 |
---|
| 100 | outinterval = None |
---|
| 101 | for line in lines: |
---|
| 102 | # three nested for loops? |
---|
| 103 | # should only execute this code once per line |
---|
| 104 | fileline = fileLines[line].rstrip("\n\r") |
---|
| 105 | try: |
---|
| 106 | cluster_interval = GenomicInterval( g1, fileline.split("\t"), |
---|
| 107 | g1.chrom_col, |
---|
| 108 | g1.start_col, |
---|
| 109 | g1.end_col, |
---|
| 110 | g1.strand_col, |
---|
| 111 | g1.default_strand, |
---|
| 112 | g1.fix_strand ) |
---|
| 113 | except Exception, exc: |
---|
| 114 | print >> sys.stderr, str( exc ) |
---|
| 115 | f1.close() |
---|
| 116 | sys.exit() |
---|
| 117 | interval_size = cluster_interval.end - cluster_interval.start |
---|
| 118 | if outsize == -1 or \ |
---|
| 119 | ( outsize > interval_size and output == 4 ) or \ |
---|
| 120 | ( outsize < interval_size and output == 5 ) : |
---|
| 121 | outinterval = cluster_interval |
---|
| 122 | outsize = interval_size |
---|
| 123 | out_file.write( "%s\n" % outinterval ) |
---|
| 124 | |
---|
| 125 | f1.close() |
---|
| 126 | out_file.close() |
---|
| 127 | |
---|
| 128 | if g1.skipped > 0: |
---|
| 129 | print skipped( g1, filedesc="" ) |
---|
| 130 | |
---|
| 131 | if __name__ == "__main__": |
---|
| 132 | main() |
---|