root/galaxy-central/tools/filters/gff/gff_filter_by_attribute.py @ 2

リビジョン 2, 4.5 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python
2# This tool takes a gff file as input and creates filters on attributes based on certain properties.
3# The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
4# TODO: much of this code is copied from the Filter1 tool (filtering.py in tools/stats/). The commonalities should be
5# abstracted and leveraged in each filtering tool.
6
7from __future__ import division
8import sys, re, os.path
9
# Older py compatibility: fall back to sets.Set when the set builtin is missing.
# Only NameError signals a missing builtin; any other error should surface.
try:
    set()
except NameError:
    from sets import Set as set

# Refuse to run on interpreters older than Python 2.4.
assert sys.version_info[:2] >= ( 2, 4 )
17
def get_operands( filter_condition ):
    """
    Return the set of operand tokens found in filter_condition.

    Each operator or keyword listed below is replaced by a single space, one
    after another in list order, and the result is split on single spaces.
    Consecutive spaces produce empty tokens, so the returned set may contain
    the empty string.
    """
    # Order matters: longer operators come before the shorter operators they
    # contain (e.g. '**' before '*', '<=' before '<').
    separators = (
        '+', '-', '**', '*', '//', '/', '%', '<<', '>>', '&', '|', '^', '~',
        '<=', '<', '>=', '>', '==', '!=', '<>',
        ' and ', ' or ', ' not ', ' is ', ' is not ', ' in ', ' not in ',
    )
    stripped = filter_condition
    for separator in separators:
        # replace() returns the text unchanged when the separator is absent.
        stripped = stripped.replace( separator, ' ' )
    return set( stripped.split( ' ' ) )
26
def stop_err( msg ):
    """
    Write msg to standard error and terminate with exit status 1.

    msg is written as given; no newline is appended.
    Raises SystemExit (via sys.exit).
    """
    sys.stderr.write( msg )
    # A bare sys.exit() would report status 0 (success) after an error.
    sys.exit( 1 )
30
# Positional command-line arguments.
in_fname = sys.argv[1]
out_fname = sys.argv[2]
attribute_type = sys.argv[3]
attribute_name = sys.argv[4]
cond_text = sys.argv[5]

# Unescape if input has been escaped
mapped_str = {
    '__lt__': '<',
    '__le__': '<=',
    '__eq__': '==',
    '__ne__': '!=',
    '__gt__': '>',
    '__ge__': '>=',
    '__sq__': '\'',
    '__dq__': '"',
}
for key, value in mapped_str.items():
    cond_text = cond_text.replace( key, value )

# Condition text is 'attribute meets condition.'
cond_text = attribute_name + cond_text

# Attempt to determine if the condition includes executable stuff and, if so, exit
# NOTE(review): cond_text, attribute_name and attribute_type are interpolated
# into source text that this script runs with exec. This check only rejects
# operands equal to a name returned by dir() at this point (the module-level
# names defined so far); attribute_type is not checked at all. Treat these
# arguments as untrusted input and confirm the caller restricts them.
secured = dir()
operands = get_operands(cond_text)
for operand in operands:
    try:
        # Integer literals are always acceptable operands.
        int( operand )
    except ValueError:
        if operand in secured:
            stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )

# Set up assignment.
assignment = "%s = attributes.get('%s', None)" % ( attribute_name, attribute_name )

# Set up type casting based on attribute type.
type_cast = "%s = %s(%s)" % ( attribute_name, attribute_type, attribute_name)
69
# Stats
skipped_lines = 0
first_invalid_line = 0
invalid_line = None
lines_kept = 0
total_lines = 0
out = open( out_fname, 'wt' )

# Read and filter input file, skipping invalid lines.
# The text below is a source template: the four placeholders receive the
# attribute assignment, the attribute name (used as a truth test), the type
# cast statement and the full condition text. Backslashes are doubled so the
# escape sequences survive into the generated source.
code = '''
for i, line in enumerate( open( in_fname ) ):
    total_lines += 1
    line = line.rstrip( '\\r\\n' )
    if not line or line.startswith( '#' ):
        skipped_lines += 1
        # Compare against None so a blank first skipped line is still remembered.
        if invalid_line is None:
            first_invalid_line = i + 1
            invalid_line = line
        continue
    try:
        # GTF format: chrom source, name, chromStart, chromEnd, score, strand, frame, attributes.
        # Attributes format: name1 "value1" ; name2 "value2" ; ...
        elems = line.split( '\\t' )
        attributes_list = elems[8].split(";")
        attributes = {}
        for name_value_pair in attributes_list:
            # Split at the first run of whitespace only, so that quoted values
            # containing spaces are kept whole.
            pair = name_value_pair.strip().split( None, 1 )
            if not pair:
                continue
            name = pair[0]
            # Need to strip double quote from values. A name without a value
            # raises IndexError here, which marks the whole line as invalid.
            value = pair[1].strip( ' "' )
            attributes[name] = value
        %s
        if %s:
            %s
            if %s:
                lines_kept += 1
                out.write( line + '\\n' )
    except Exception:
        skipped_lines += 1
        if invalid_line is None:
            first_invalid_line = i + 1
            invalid_line = line
''' % ( assignment, attribute_name, type_cast, cond_text )
117
118
# Execute the generated filter loop. Errors raised for individual records are
# handled inside the generated code; anything that reaches this handler
# aborts the run.
valid_filter = True
try:
    exec( code )
except Exception:
    e = sys.exc_info()[1]
    out.close()
    failure_text = str( e )
    if not failure_text.startswith( 'invalid syntax' ):
        stop_err( failure_text )
    else:
        valid_filter = False
        stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % cond_text )

# Summary report on standard output.
if valid_filter:
    out.close()
    valid_lines = total_lines - skipped_lines
    print( 'Filtering with %s, ' % ( cond_text ) )
    if valid_lines <= 0:
        print( 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % cond_text )
    else:
        print( 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines ) )
    if skipped_lines > 0:
        print( 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line ) )
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。