| 1 | #!/usr/bin/env python |
|---|
| 2 | #By: Guruprasad Ananda |
|---|
| 3 | """ |
|---|
| 4 | Fetch closest up/downstream interval from features corresponding to every interval in primary |
|---|
| 5 | |
|---|
| 6 | usage: %prog primary_file features_file out_file direction |
|---|
| 7 | -1, --cols1=N,N,N,N: Columns for start, end, strand in first file |
|---|
| 8 | -2, --cols2=N,N,N,N: Columns for start, end, strand in second file |
|---|
| 9 | -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval |
|---|
| 10 | -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval |
|---|
| 11 | """ |
|---|
| 12 | from galaxy import eggs |
|---|
| 13 | import pkg_resources |
|---|
| 14 | pkg_resources.require( "bx-python" ) |
|---|
| 15 | import sys, traceback, fileinput |
|---|
| 16 | from warnings import warn |
|---|
| 17 | from bx.cookbook import doc_optparse |
|---|
| 18 | from galaxy.tools.util.galaxyops import * |
|---|
| 19 | from bx.intervals.io import * |
|---|
| 20 | from bx.intervals.operations import quicksect |
|---|
| 21 | from galaxy.tools.util.gff_util import * |
|---|
| 22 | |
|---|
| 23 | assert sys.version_info[:2] >= ( 2, 4 ) |
|---|
| 24 | |
|---|
| 25 | def get_closest_feature (node, direction, threshold_up, threshold_down, report_func_up, report_func_down): |
|---|
| 26 | #direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases |
|---|
| 27 | #threhold_Up is equal to the interval start for +ve strand, and interval end for -ve strand |
|---|
| 28 | #threhold_down is equal to the interval end for +ve strand, and interval start for -ve strand |
|---|
| 29 | if direction == 1: |
|---|
| 30 | if node.maxend < threshold_up: |
|---|
| 31 | if node.end == node.maxend: |
|---|
| 32 | report_func_up(node) |
|---|
| 33 | elif node.right and node.left: |
|---|
| 34 | if node.right.maxend == node.maxend: |
|---|
| 35 | get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) |
|---|
| 36 | elif node.left.maxend == node.maxend: |
|---|
| 37 | get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) |
|---|
| 38 | elif node.right and node.right.maxend == node.maxend: |
|---|
| 39 | get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) |
|---|
| 40 | elif node.left and node.left.maxend == node.maxend: |
|---|
| 41 | get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) |
|---|
| 42 | elif node.minend < threshold_up: |
|---|
| 43 | if node.end < threshold_up: |
|---|
| 44 | report_func_up(node) |
|---|
| 45 | if node.left and node.right: |
|---|
| 46 | if node.right.minend < threshold_up: |
|---|
| 47 | get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) |
|---|
| 48 | if node.left.minend < threshold_up: |
|---|
| 49 | get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) |
|---|
| 50 | elif node.left: |
|---|
| 51 | if node.left.minend < threshold_up: |
|---|
| 52 | get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) |
|---|
| 53 | elif node.right: |
|---|
| 54 | if node.right.minend < threshold_up: |
|---|
| 55 | get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) |
|---|
| 56 | elif direction == 0: |
|---|
| 57 | if node.start > threshold_down: |
|---|
| 58 | report_func_down(node) |
|---|
| 59 | if node.left: |
|---|
| 60 | get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down) |
|---|
| 61 | else: |
|---|
| 62 | if node.right: |
|---|
| 63 | get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down) |
|---|
| 64 | |
|---|
| 65 | def proximal_region_finder(readers, region, comments=True): |
|---|
| 66 | """ |
|---|
| 67 | Returns an iterator that yields elements of the form [ <original_interval>, <closest_feature> ]. |
|---|
| 68 | Intervals are GenomicInterval objects. |
|---|
| 69 | """ |
|---|
| 70 | primary = readers[0] |
|---|
| 71 | features = readers[1] |
|---|
| 72 | either = False |
|---|
| 73 | if region == 'Upstream': |
|---|
| 74 | up, down = True, False |
|---|
| 75 | elif region == 'Downstream': |
|---|
| 76 | up, down = False, True |
|---|
| 77 | else: |
|---|
| 78 | up, down = True, True |
|---|
| 79 | if region == 'Either': |
|---|
| 80 | either = True |
|---|
| 81 | |
|---|
| 82 | # Read features into memory: |
|---|
| 83 | rightTree = quicksect.IntervalTree() |
|---|
| 84 | for item in features: |
|---|
| 85 | if type( item ) is GenomicInterval: |
|---|
| 86 | rightTree.insert( item, features.linenum, item ) |
|---|
| 87 | |
|---|
| 88 | for interval in primary: |
|---|
| 89 | if type( interval ) is Header: |
|---|
| 90 | yield interval |
|---|
| 91 | if type( interval ) is Comment and comments: |
|---|
| 92 | yield interval |
|---|
| 93 | elif type( interval ) == GenomicInterval: |
|---|
| 94 | chrom = interval.chrom |
|---|
| 95 | start = int(interval.start) |
|---|
| 96 | end = int(interval.end) |
|---|
| 97 | strand = interval.strand |
|---|
| 98 | if chrom not in rightTree.chroms: |
|---|
| 99 | continue |
|---|
| 100 | else: |
|---|
| 101 | root = rightTree.chroms[chrom] #root node for the chrom tree |
|---|
| 102 | result_up = [] |
|---|
| 103 | result_down = [] |
|---|
| 104 | if (strand == '+' and up) or (strand == '-' and down): |
|---|
| 105 | #upstream +ve strand and downstream -ve strand cases |
|---|
| 106 | get_closest_feature (root, 1, start, None, lambda node: result_up.append( node ), None) |
|---|
| 107 | |
|---|
| 108 | if (strand == '+' and down) or (strand == '-' and up): |
|---|
| 109 | #downstream +ve strand and upstream -ve strand case |
|---|
| 110 | get_closest_feature (root, 0, None, end-1, None, lambda node: result_down.append( node )) |
|---|
| 111 | |
|---|
| 112 | if result_up: |
|---|
| 113 | if len(result_up) > 1: #The results_up list has a list of intervals upstream to the given interval. |
|---|
| 114 | ends = [] |
|---|
| 115 | for n in result_up: |
|---|
| 116 | ends.append(n.end) |
|---|
| 117 | res_ind = ends.index(max(ends)) #fetch the index of the closest interval i.e. the interval with the max end from the results_up list |
|---|
| 118 | else: |
|---|
| 119 | res_ind = 0 |
|---|
| 120 | if not(either): |
|---|
| 121 | yield [ interval, result_up[res_ind].other ] |
|---|
| 122 | |
|---|
| 123 | if result_down: |
|---|
| 124 | if not(either): |
|---|
| 125 | #The last element of result_down will be the closest element to the given interval |
|---|
| 126 | yield [ interval, result_down[-1].other ] |
|---|
| 127 | |
|---|
| 128 | if either and (result_up or result_down): |
|---|
| 129 | iter_val = [] |
|---|
| 130 | if result_up and result_down: |
|---|
| 131 | if abs(start - int(result_up[res_ind].end)) <= abs(end - int(result_down[-1].start)): |
|---|
| 132 | iter_val = [ interval, result_up[res_ind].other ] |
|---|
| 133 | else: |
|---|
| 134 | #The last element of result_down will be the closest element to the given interval |
|---|
| 135 | iter_val = [ interval, result_down[-1].other ] |
|---|
| 136 | elif result_up: |
|---|
| 137 | iter_val = [ interval, result_up[res_ind].other ] |
|---|
| 138 | elif result_down: |
|---|
| 139 | #The last element of result_down will be the closest element to the given interval |
|---|
| 140 | iter_val = [ interval, result_down[-1].other ] |
|---|
| 141 | yield iter_val |
|---|
| 142 | |
|---|
| 143 | def main(): |
|---|
| 144 | options, args = doc_optparse.parse( __doc__ ) |
|---|
| 145 | try: |
|---|
| 146 | chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 ) |
|---|
| 147 | chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 ) |
|---|
| 148 | in1_gff_format = bool( options.gff1 ) |
|---|
| 149 | in2_gff_format = bool( options.gff2 ) |
|---|
| 150 | in_fname, in2_fname, out_fname, direction = args |
|---|
| 151 | except: |
|---|
| 152 | doc_optparse.exception() |
|---|
| 153 | |
|---|
| 154 | # Set readers to handle either GFF or default format. |
|---|
| 155 | if in1_gff_format: |
|---|
| 156 | in1_reader_wrapper = GFFReaderWrapper |
|---|
| 157 | else: |
|---|
| 158 | in1_reader_wrapper = NiceReaderWrapper |
|---|
| 159 | if in2_gff_format: |
|---|
| 160 | in2_reader_wrapper = GFFReaderWrapper |
|---|
| 161 | else: |
|---|
| 162 | in2_reader_wrapper = NiceReaderWrapper |
|---|
| 163 | |
|---|
| 164 | g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ), |
|---|
| 165 | chrom_col=chr_col_1, |
|---|
| 166 | start_col=start_col_1, |
|---|
| 167 | end_col=end_col_1, |
|---|
| 168 | strand_col=strand_col_1, |
|---|
| 169 | fix_strand=True ) |
|---|
| 170 | g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ), |
|---|
| 171 | chrom_col=chr_col_2, |
|---|
| 172 | start_col=start_col_2, |
|---|
| 173 | end_col=end_col_2, |
|---|
| 174 | strand_col=strand_col_2, |
|---|
| 175 | fix_strand=True ) |
|---|
| 176 | |
|---|
| 177 | # Find flanking features. |
|---|
| 178 | out_file = open( out_fname, "w" ) |
|---|
| 179 | try: |
|---|
| 180 | for result in proximal_region_finder([g1,g2], direction): |
|---|
| 181 | if type( result ) is list: |
|---|
| 182 | line, closest_feature = result |
|---|
| 183 | # Need to join outputs differently depending on file types. |
|---|
| 184 | if in1_gff_format: |
|---|
| 185 | # Output is GFF with added attribute 'closest feature.' |
|---|
| 186 | |
|---|
| 187 | # Invervals are in BED coordinates; need to convert to GFF. |
|---|
| 188 | line = convert_bed_coords_to_gff( line ) |
|---|
| 189 | closest_feature = convert_bed_coords_to_gff( closest_feature ) |
|---|
| 190 | |
|---|
| 191 | # Replace double quotes with single quotes in closest feature's attributes. |
|---|
| 192 | out_file.write( "%s closest_feature \"%s\" \n" % |
|---|
| 193 | ( "\t".join( line.fields ), \ |
|---|
| 194 | "\t".join( closest_feature.fields ).replace( "\"", "\\\"" ) |
|---|
| 195 | ) ) |
|---|
| 196 | else: |
|---|
| 197 | # Output is BED + closest feature fields. |
|---|
| 198 | output_line_fields = [] |
|---|
| 199 | output_line_fields.extend( line.fields ) |
|---|
| 200 | output_line_fields.extend( closest_feature.fields ) |
|---|
| 201 | out_file.write( "%s\n" % ( "\t".join( output_line_fields ) ) ) |
|---|
| 202 | else: |
|---|
| 203 | out_file.write( "%s\n" % result ) |
|---|
| 204 | except ParseError, exc: |
|---|
| 205 | fail( "Invalid file format: %s" % str( exc ) ) |
|---|
| 206 | |
|---|
| 207 | print "Direction: %s" %(direction) |
|---|
| 208 | if g1.skipped > 0: |
|---|
| 209 | print skipped( g1, filedesc=" of 1st dataset" ) |
|---|
| 210 | if g2.skipped > 0: |
|---|
| 211 | print skipped( g2, filedesc=" of 2nd dataset" ) |
|---|
| 212 | |
|---|
| 213 | if __name__ == "__main__": |
|---|
| 214 | main() |
|---|