root/galaxy-central/tools/new_operations/flanking_features.py @ 2

リビジョン 2, 10.2 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python
2#By: Guruprasad Ananda
3"""
4Fetch closest up/downstream interval from features corresponding to every interval in primary
5
6usage: %prog primary_file features_file out_file direction
7    -1, --cols1=N,N,N,N: Columns for start, end, strand in first file
8    -2, --cols2=N,N,N,N: Columns for start, end, strand in second file
9    -G, --gff1: input 1 is GFF format, meaning start and end coordinates are 1-based, closed interval
10    -H, --gff2: input 2 is GFF format, meaning start and end coordinates are 1-based, closed interval
11"""
12from galaxy import eggs
13import pkg_resources
14pkg_resources.require( "bx-python" )
15import sys, traceback, fileinput
16from warnings import warn
17from bx.cookbook import doc_optparse
18from galaxy.tools.util.galaxyops import *
19from bx.intervals.io import *
20from bx.intervals.operations import quicksect
21from galaxy.tools.util.gff_util import *
22
23assert sys.version_info[:2] >= ( 2, 4 )
24
25def get_closest_feature (node, direction, threshold_up, threshold_down, report_func_up, report_func_down):
26    #direction=1 for +ve strand upstream and -ve strand downstream cases; and it is 0 for +ve strand downstream and -ve strand upstream cases
27    #threhold_Up is equal to the interval start for +ve strand, and interval end for -ve strand
28    #threhold_down is equal to the interval end for +ve strand, and interval start for -ve strand
29    if direction == 1:
30        if node.maxend < threshold_up:
31            if node.end == node.maxend:
32                report_func_up(node)
33            elif node.right and node.left:
34                if node.right.maxend == node.maxend:
35                    get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
36                elif node.left.maxend == node.maxend:
37                    get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
38            elif node.right and node.right.maxend == node.maxend:
39                get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
40            elif node.left and node.left.maxend == node.maxend:
41                get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
42        elif node.minend < threshold_up:
43            if node.end < threshold_up:
44                report_func_up(node)
45            if node.left and node.right:
46                if node.right.minend < threshold_up:
47                    get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
48                if node.left.minend < threshold_up:
49                    get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
50            elif node.left:
51                if node.left.minend < threshold_up:
52                    get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
53            elif node.right:
54                if node.right.minend < threshold_up:
55                    get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
56    elif direction == 0:
57        if node.start > threshold_down:
58            report_func_down(node)
59            if node.left:
60                get_closest_feature(node.left, direction, threshold_up, threshold_down, report_func_up, report_func_down)
61        else:
62            if node.right:
63                get_closest_feature(node.right, direction, threshold_up, threshold_down, report_func_up, report_func_down)
64
65def proximal_region_finder(readers, region, comments=True):
66    """
67    Returns an iterator that yields elements of the form [ <original_interval>, <closest_feature> ].
68    Intervals are GenomicInterval objects.
69    """
70    primary = readers[0]
71    features = readers[1]
72    either = False
73    if region == 'Upstream':
74        up, down = True, False
75    elif region == 'Downstream':
76        up, down = False, True
77    else:
78        up, down = True, True
79        if region == 'Either':
80            either = True
81       
82    # Read features into memory:
83    rightTree = quicksect.IntervalTree()
84    for item in features:
85        if type( item ) is GenomicInterval:
86            rightTree.insert( item, features.linenum, item )
87           
88    for interval in primary:
89        if type( interval ) is Header:
90            yield interval
91        if type( interval ) is Comment and comments:
92            yield interval
93        elif type( interval ) == GenomicInterval:
94            chrom = interval.chrom
95            start = int(interval.start)
96            end = int(interval.end)
97            strand = interval.strand
98            if chrom not in rightTree.chroms:
99                continue
100            else:
101                root = rightTree.chroms[chrom]    #root node for the chrom tree
102                result_up = []
103                result_down = []
104                if (strand == '+' and up) or (strand == '-' and down):
105                    #upstream +ve strand and downstream -ve strand cases
106                    get_closest_feature (root, 1, start, None, lambda node: result_up.append( node ), None)
107                   
108                if (strand == '+' and down) or (strand == '-' and up):
109                    #downstream +ve strand and upstream -ve strand case
110                    get_closest_feature (root, 0, None, end-1, None, lambda node: result_down.append( node ))
111               
112                if result_up:
113                    if len(result_up) > 1: #The results_up list has a list of intervals upstream to the given interval.
114                        ends = []
115                        for n in result_up:
116                            ends.append(n.end)
117                        res_ind = ends.index(max(ends)) #fetch the index of the closest interval i.e. the interval with the max end from the results_up list
118                    else:
119                        res_ind = 0
120                    if not(either):
121                        yield [ interval, result_up[res_ind].other ]
122               
123                if result_down:   
124                    if not(either):
125                        #The last element of result_down will be the closest element to the given interval
126                        yield [ interval, result_down[-1].other ]
127               
128                if either and (result_up or result_down):
129                    iter_val = []
130                    if result_up and result_down:
131                        if abs(start - int(result_up[res_ind].end)) <= abs(end - int(result_down[-1].start)):
132                            iter_val = [ interval, result_up[res_ind].other ]
133                        else:
134                            #The last element of result_down will be the closest element to the given interval
135                            iter_val = [ interval, result_down[-1].other ]
136                    elif result_up:
137                        iter_val = [ interval, result_up[res_ind].other ]
138                    elif result_down:
139                        #The last element of result_down will be the closest element to the given interval
140                        iter_val = [ interval, result_down[-1].other ]
141                    yield iter_val
142                       
143def main():
144    options, args = doc_optparse.parse( __doc__ )
145    try:
146        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
147        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
148        in1_gff_format = bool( options.gff1 )
149        in2_gff_format = bool( options.gff2 )
150        in_fname, in2_fname, out_fname, direction = args
151    except:
152        doc_optparse.exception()
153       
154    # Set readers to handle either GFF or default format.
155    if in1_gff_format:
156        in1_reader_wrapper = GFFReaderWrapper
157    else:
158        in1_reader_wrapper = NiceReaderWrapper
159    if in2_gff_format:
160        in2_reader_wrapper = GFFReaderWrapper
161    else:
162        in2_reader_wrapper = NiceReaderWrapper
163
164    g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ),
165                            chrom_col=chr_col_1,
166                            start_col=start_col_1,
167                            end_col=end_col_1,
168                            strand_col=strand_col_1,
169                            fix_strand=True )
170    g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ),
171                            chrom_col=chr_col_2,
172                            start_col=start_col_2,
173                            end_col=end_col_2,
174                            strand_col=strand_col_2,
175                            fix_strand=True )
176
177    # Find flanking features.
178    out_file = open( out_fname, "w" )
179    try:
180        for result in proximal_region_finder([g1,g2], direction):
181            if type( result ) is list:
182                line, closest_feature = result
183                # Need to join outputs differently depending on file types.
184                if in1_gff_format:
185                    # Output is GFF with added attribute 'closest feature.'
186
187                    # Invervals are in BED coordinates; need to convert to GFF.
188                    line = convert_bed_coords_to_gff( line )
189                    closest_feature = convert_bed_coords_to_gff( closest_feature )
190                   
191                    # Replace double quotes with single quotes in closest feature's attributes.
192                    out_file.write( "%s closest_feature \"%s\" \n" %
193                                    ( "\t".join( line.fields ), \
194                                      "\t".join( closest_feature.fields ).replace( "\"", "\\\"" )
195                                     ) )
196                else:
197                    # Output is BED + closest feature fields.
198                    output_line_fields = []
199                    output_line_fields.extend( line.fields )
200                    output_line_fields.extend( closest_feature.fields )
201                    out_file.write( "%s\n" % ( "\t".join( output_line_fields ) ) )
202            else:
203                out_file.write( "%s\n" % result )
204    except ParseError, exc:
205        fail( "Invalid file format: %s" % str( exc ) )
206
207    print "Direction: %s" %(direction)
208    if g1.skipped > 0:
209        print skipped( g1, filedesc=" of 1st dataset" )
210    if g2.skipped > 0:
211        print skipped( g2, filedesc=" of 2nd dataset" )
212
213if __name__ == "__main__":
214    main()
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。