root/galaxy-central/tools/stats/filtering.py

リビジョン 2, 4.1 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python
2# This tool takes a tab-delimited text file as input and creates filters on columns based on certain properties.
3# The tool will skip over invalid lines within the file, informing the user about the number of lines skipped.
4
5from __future__ import division
6import sys, re, os.path
7from galaxy import eggs
8
# Older py compatibility: fall back to the sets module when the interpreter
# has no builtin set type.  Only the missing-name case is handled; any other
# error is left to propagate instead of being silently swallowed.
try:
    set()
except NameError:
    from sets import Set as set

# Refuse to run on interpreters older than Python 2.4.
assert sys.version_info[:2] >= ( 2, 4 )
16
def get_operands( filter_condition ):
    """Return the set of operand tokens found in a filter condition.

    Every operator or keyword listed below is replaced by a single space and
    the remaining text is split on single spaces.  Because the split is on one
    space at a time, runs of consecutive spaces contribute an empty string to
    the returned set.
    """
    # Replacement happens in list order, so e.g. '**' is removed before '*'
    # and '<=' before '<'; multi-character operators are stripped as a whole
    # rather than leaving a stray character behind.
    operator_tokens = (
        '+', '-', '**', '*', '//', '/', '%', '<<', '>>', '&', '|', '^', '~',
        '<=', '<', '>=', '>', '==', '!=', '<>',
        ' and ', ' or ', ' not ', ' is ', ' is not ', ' in ', ' not in ',
    )
    stripped = filter_condition
    for token in operator_tokens:
        # str.replace leaves the text untouched when the token is absent.
        stripped = stripped.replace( token, ' ' )
    return set( stripped.split( ' ' ) )
25
def stop_err( msg ):
    """Write msg to standard error and terminate the script.

    The message is written as-is (no newline is appended).  SystemExit is
    raised without an exit status, so the process exits with status 0.
    """
    sys.stderr.write( msg )
    raise SystemExit()
29
# Script body.
# Command line: <input file> <output file> <condition> <column count> <column types>
# where <column types> is a comma-separated list of callable names (one per
# column) used to cast each field before the condition is evaluated.
in_fname = sys.argv[1]
out_fname = sys.argv[2]
cond_text = sys.argv[3]
try:
    in_columns = int( sys.argv[4] )
    # Explicit check (not an assert, which is stripped under -O): the column
    # types argument must not be empty.
    if not sys.argv[5]:
        raise ValueError( "empty column types" )
    in_column_types = sys.argv[5].split( ',' )
except ( IndexError, ValueError ):
    # Missing arguments or a non-numeric column count.
    stop_err( "Data does not appear to be tabular.  This tool can only be used with tab-delimited data." )

# Unescape if input has been escaped
mapped_str = {
    '__lt__': '<',
    '__le__': '<=',
    '__eq__': '==',
    '__ne__': '!=',
    '__gt__': '>',
    '__ge__': '>=',
    '__sq__': '\'',
    '__dq__': '"',
}
# NOTE: the loop variable names are deliberately left as 'key' and 'value';
# they are module-level names and therefore part of the dir() snapshot below.
for key, value in mapped_str.items():
    cond_text = cond_text.replace( key, value )

# Attempt to determine if the condition includes executable stuff and, if so, exit.
# dir() lists every module-level name defined up to this point, so adding or
# renaming anything above this line changes which operands are rejected.
# NOTE(review): this is a weak safeguard -- builtin names are not returned by
# dir(), and operands glued to characters that get_operands does not strip
# (such as parentheses) are not matched.  The condition is later passed to
# exec, so it must only ever come from a trusted source.
secured = dir()
operands = get_operands(cond_text)
for operand in operands:
    try:
        int( operand )
    except ValueError:
        # Not an integer literal: reject it if it names something in this module.
        if operand in secured:
            stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )

# Prepare the column variable names and wrappers for column data types
cols, type_casts = [], []
for col in range( 1, in_columns + 1 ):
    col_name = "c%d" % col
    cols.append( col_name )
    # NOTE(review): col_type comes straight from the command line and is
    # interpolated into the generated code without validation.
    col_type = in_column_types[ col - 1 ]
    type_cast = "%s(%s)" % ( col_type, col_name )
    type_casts.append( type_cast )

col_str = ', '.join( cols )    # 'c1, c2, c3, c4'
type_cast_str = ', '.join( type_casts )  # 'str(c1), int(c2), int(c3), str(c4)'
# The trailing comma forces tuple unpacking even for a single column; without
# it a one-column file would bind c1 to the whole list returned by split().
assign = "%s, = line.split( '\\t' )" % col_str
wrap = "%s = %s" % ( col_str, type_cast_str )
skipped_lines = 0
first_invalid_line = 0
invalid_line = None
lines_kept = 0
total_lines = 0
out = open( out_fname, 'wt' )

# Read and filter input file, skipping invalid lines.
# The loop is generated as source text so that the column assignment, the type
# casts and the user's condition can be spliced in, then run with exec at
# module level (the counters above are updated in this namespace).
# 'invalid_line is None' is used instead of a truth test so that a blank line,
# recorded as '', still counts as the first invalid line.
code = '''
in_file = open( in_fname )
try:
    for i, line in enumerate( in_file ):
        total_lines += 1
        line = line.rstrip( '\\r\\n' )
        if not line or line.startswith( '#' ):
            skipped_lines += 1
            if invalid_line is None:
                first_invalid_line = i + 1
                invalid_line = line
            continue
        try:
            %s
            %s
            if %s:
                lines_kept += 1
                out.write( line + '\\n' )
        except Exception:
            skipped_lines += 1
            if invalid_line is None:
                first_invalid_line = i + 1
                invalid_line = line
finally:
    in_file.close()
''' % ( assign, wrap, cond_text )

try:
    exec( code )
except SyntaxError:
    # The generated source failed to compile; the only variable parts are the
    # column setup and the user's condition.
    out.close()
    stop_err( 'Filter condition "%s" likely invalid. See tool tips, syntax and examples.' % cond_text )
except Exception:
    # sys.exc_info() is used instead of 'except ... as' so that this parses on
    # every interpreter from Python 2.4 onwards.
    out.close()
    stop_err( str( sys.exc_info()[1] ) )

# Only reached when the filter ran to completion (stop_err exits otherwise).
out.close()
valid_lines = total_lines - skipped_lines
print( 'Filtering with %s, ' % cond_text )
if valid_lines > 0:
    print( 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines ) )
else:
    print( 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % cond_text )
if skipped_lines > 0:
    print( 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line ) )
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。