root/galaxy-central/tools/stats/column_maker.py

リビジョン 2, 4.4 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python
2# This tool takes a tab-delimited textfile as input and creates another column in the file which is the result of
3# a computation performed on every row in the original file.  The tool will skip over invalid lines within the file,
4# informing the user about the number of lines skipped. 
5import sys, re, os.path
6from galaxy import eggs
7from galaxy.tools import validation
8from galaxy.datatypes import metadata
9from math import log,exp,sqrt,ceil,floor
10
# Refuse to run on interpreters older than Python 2.4.
assert sys.version_info[:2] >= ( 2, 4 )
12
def stop_err( msg ):
    """Write msg to stderr and terminate the script with a failing status.

    msg is written verbatim; no newline is appended.
    Raises SystemExit with code 1.  A bare sys.exit() would exit with
    status 0, which reports success to anything inspecting the exit code.
    """
    sys.stderr.write( msg )
    sys.exit( 1 )
16
# Positional command-line arguments:
#   1: input file path          2: output file path
#   3: expression to compute    4: rounding flag (compared with 'yes' / 'no' below)
#   5: number of columns        6: comma-separated column types
inp_file = sys.argv[1]
out_file = sys.argv[2]
expr = sys.argv[3]
round_result = sys.argv[4]
try:
    in_columns = int( sys.argv[5] )
except ( IndexError, ValueError ):
    # The argument is absent, or present but not an integer.
    stop_err( "Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it.  This tool can only be used with tab-delimited data." )
if in_columns < 2:
    # To be considered tabular, data must fulfill requirements of the sniff.is_column_based() method.
    stop_err( "Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it.  This tool can only be used with tab-delimited data." )
try:
    in_column_types = sys.argv[6].split( ',' )
except IndexError:
    # Only a missing argument can fail here; splitting a string never raises.
    stop_err( "Missing or invalid 'column_types' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it.  This tool can only be used with tab-delimited data." )
# The declared column count must match the number of declared types.
if len( in_column_types ) != in_columns:
    stop_err( "The 'columns' metadata setting does not conform to the 'column_types' metadata setting, click the pencil icon in the history item and select the Auto-detect option to correct it.  This tool can only be used with tab-delimited data." )
34   
# Restore the comparison operators and quote characters that arrive in the
# expression as '__xx__' placeholder tokens.
mapped_str = dict( [
    ( '__lt__', '<' ),
    ( '__le__', '<=' ),
    ( '__eq__', '==' ),
    ( '__ne__', '!=' ),
    ( '__gt__', '>' ),
    ( '__ge__', '>=' ),
    ( '__sq__', '\'' ),
    ( '__dq__', '"' ),
] )
for key in mapped_str:
    value = mapped_str[ key ]
    expr = expr.replace( key, value )
48
# Build one variable name per input column (c1, c2, ...) together with the
# call expression that casts it to the column's declared type.
cols = [ "c%d" % position for position in range( 1, in_columns + 1 ) ]
type_casts = []
for position, col_name in enumerate( cols ):
    declared_type = in_column_types[ position ].strip()
    # When the result is not rounded, integer columns are cast to float instead.
    if declared_type == 'int' and round_result == 'no':
        declared_type = 'float'
    type_casts.append( "%s(%s)" % ( declared_type, col_name ) )

# Source fragments that get spliced into the generated per-line code.
col_str = ', '.join( cols )    # 'c1, c2, c3, c4'
type_cast_str = ', '.join( type_casts )  # 'str(c1), int(c2), int(c3), str(c4)'
assign = "%s = line.split( '\\t' )" % col_str
wrap = "%s = %s" % ( col_str, type_cast_str )

# Counters and bookkeeping that the generated code updates while it runs.
skipped_lines = lines_kept = total_lines = 0
first_invalid_line = 0
invalid_line = None
out = open( out_file, 'wt' )
70
# Read input file, skipping invalid lines, and perform computation that will result in a new column.
# The template is filled with the column assignment, the type casts and the
# expression, and is executed as Python source further down.
# NOTE(security): expr is taken from the command line and executed as code;
# it must never be fed untrusted input without sandboxing.
#
# Blank lines and lines starting with '#' are counted as skipped.  Any error
# raised while splitting, casting, evaluating or writing a line also counts
# that line as skipped.  The first skipped line is remembered by testing
# 'invalid_line is None' rather than truthiness, so that an empty first
# skipped line ('') is not overwritten by a later one.
code = '''
for i, line in enumerate( open( inp_file ) ):
    total_lines += 1
    line = line.rstrip( '\\r\\n' )
    if not line or line.startswith( '#' ):
        skipped_lines += 1
        if invalid_line is None:
            first_invalid_line = i + 1
            invalid_line = line
        continue
    try:
        %s
        %s
        new_val = %s
        if round_result == "yes":
            new_val = int( round( new_val ) )
        new_line = line + '\\t' + str( new_val )
        out.write( new_line + '\\n' )
        lines_kept += 1
    except Exception:
        skipped_lines += 1
        if invalid_line is None:
            first_invalid_line = i + 1
            invalid_line = line
''' % ( assign, wrap, expr )
97
# Execute the generated loop at module level so that it reads and updates the
# counters defined above.
valid_expr = True
try:
    exec( code )
except SyntaxError:
    # The expression did not compile.  Catching the exception type covers every
    # syntax-error message, not only those starting with 'invalid syntax'.
    out.close()
    valid_expr = False
    stop_err( 'Expression "%s" likely invalid. See tool tips, syntax and examples.' % expr )
except Exception:
    # Any other failure outside the per-line handler (for example opening the
    # input file): report its message and stop.
    out.close()
    stop_err( str( sys.exc_info()[1] ) )

if valid_expr:
    out.close()
    valid_lines = total_lines - skipped_lines
    print( 'Creating column %d with expression %s' % ( in_columns + 1, expr ) )
    if valid_lines > 0:
        # Report the kept share of ALL lines read.  Dividing by valid_lines
        # (which always equals lines_kept) would always print 100.00%.
        print( 'kept %4.2f%% of %d lines.' % ( 100.0 * lines_kept / total_lines, total_lines ) )
    else:
        print( 'Possible invalid expression "%s" or non-existent column referenced. See tool tips, syntax and examples.' % expr )
    if skipped_lines > 0:
        print( 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line ) )
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。