1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | import sys, re, tempfile |
---|
4 | from rpy import * |
---|
# Older py compatibility: fall back to sets.Set when the interpreter has no
# builtin set. Only NameError (the failed name lookup) is caught, so
# unrelated exceptions such as KeyboardInterrupt are not swallowed.
try:
    set()
except NameError:
    from sets import Set as set

# Refuse to run on interpreters older than Python 2.4.
assert sys.version_info[:2] >= ( 2, 4 )
---|
12 | |
---|
def stop_err( msg ):
    """Write `msg` to stderr and terminate the script with a failing status.

    msg -- text written to stderr exactly as given (no newline is appended).

    Raises SystemExit with exit code 1. A bare sys.exit() would exit with
    status 0, so a caller inspecting only the exit code would see a failed
    run as a success.
    """
    sys.stderr.write( msg )
    sys.exit( 1 )
---|
16 | |
---|
def S3_METHODS( all="key" ):
    """Return the R function names and operator tokens this tool accepts.

    all -- selector; only the value "key" (the default) is recognised.

    Returns a dict with two lists of strings, 'Math' (function names) and
    'Ops' (operator / punctuation tokens), when `all` equals "key".
    Returns None for any other value.
    """
    group_math = [ "abs", "sign", "sqrt", "floor", "ceiling", "trunc", "round", "signif",
                   "exp", "log", "cos", "sin", "tan", "acos", "asin", "atan", "cosh", "sinh", "tanh",
                   "acosh", "asinh", "atanh", "lgamma", "gamma", "gammaCody", "digamma", "trigamma",
                   "cumsum", "cumprod", "cummax", "cummin", "c" ]
    group_ops = [ "+", "-", "*", "/", "^", "%%", "%/%", "&", "|", "!", "==", "!=", "<", "<=", ">=", ">", "(", ")", "~", "," ]
    # Compare by value: `is` only succeeds when both strings happen to be the
    # same interned object, which is not guaranteed for strings built at runtime.
    if all == "key":
        return { 'Math' : group_math, 'Ops' : group_ops }
    return None
---|
25 | |
---|
def _validate_expression( expression ):
    """Abort via stop_err() unless `expression` uses only whitelisted tokens."""
    allowed = S3_METHODS()
    math_allowed = allowed[ 'Math' ]
    ops_allowed = allowed[ 'Ops' ]

    # Every alphabetic run must be an allowed function name ("c" covers the
    # letter of column references such as c1).
    for word in re.findall( '[a-zA-Z]+', expression ):
        if word not in math_allowed:
            stop_err( "Invalid expression '%s': term '%s' is not recognized or allowed" %( expression, word ) )

    # Every run of remaining characters must be an allowed operator token.
    # NOTE(review): the class excludes only lowercase letters, so an uppercase
    # letter is treated as an operator character (this rejects "gammaCody"),
    # and adjacent operator characters are matched as one token (so "))" is
    # rejected). Left unchanged on purpose: this whitelist is the guard for
    # text that is later evaluated by R.
    symbols = set()
    for symbol in re.findall( r'[^a-z0-9\s]+', expression ):
        if symbol not in ops_allowed:
            stop_err( "Invalid expression '%s': operator '%s' is not recognized or allowed" % ( expression, symbol ) )
        symbols.add( symbol )
    if len( symbols ) == 1 and ',' in symbols:
        # User may have entered a comma-separated list r_data_frame columns
        stop_err( "Invalid columns '%s': this tool requires a single column or expression" % expression )


def _column_indexes( expression ):
    """Return the distinct zero-based column indexes referenced as cN, in order of first use."""
    cols = []
    for ref in re.findall( 'c[0-9]+', expression ):
        index = int( ref[1:] ) - 1
        if index < 0:
            # c0 would become index -1 and silently read the last field.
            stop_err( "Invalid column '%s': column numbers start at 1" % ref )
        if index not in cols:
            cols.append( index )
    return cols


def _write_r_input( datafile, tmp_file, cols ):
    """Copy the numeric rows of the selected columns from `datafile` into `tmp_file`.

    Writes a header row (c<N> names) followed by one tab-separated row per
    usable input line, then flushes `tmp_file`. Blank lines and lines starting
    with '#' are ignored; a line is skipped when any selected field is missing
    or does not parse as a float.

    Returns ( valid_rows, skipped_lines, first_invalid_line ), where
    first_invalid_line is the 1-based line number of the first skipped line,
    or 0 when nothing was skipped.
    """
    tmp_file.write( "%s\n" % "\t".join( [ "c%d" % ( col + 1 ) for col in cols ] ) )
    valid_rows = 0
    skipped_lines = 0
    first_invalid_line = 0
    infile = open( datafile )
    # try/finally rather than `with`: the module still admits Python 2.4.
    try:
        for i, line in enumerate( infile ):
            line = line.rstrip( '\r\n' )
            if not line or line.startswith( '#' ):
                continue
            fields = line.split( '\t' )
            try:
                for col in cols:
                    float( fields[ col ] )
            except ( ValueError, IndexError ):
                skipped_lines += 1
                if not first_invalid_line:
                    first_invalid_line = i + 1
                continue
            tmp_file.write( "%s\n" % "\t".join( [ fields[ col ] for col in cols ] ) )
            valid_rows += 1
    finally:
        infile.close()
    tmp_file.flush()
    return valid_rows, skipped_lines, first_invalid_line


def main():
    """Compute summary statistics of a column expression over a tabular file.

    Command line: input_file output_file expression. The expression may use
    column references c1, c2, ... plus the functions and operators returned
    by S3_METHODS(). The referenced columns are copied to a temporary file,
    loaded into R, the expression is evaluated by R, and sum, mean, stdev and
    the five quantiles of the result are written to output_file as a '#'
    header line and one tab-separated data line.

    Exits through stop_err() on a bad command line, a rejected expression,
    input without any usable row, or an R evaluation error.
    """
    if len( sys.argv ) < 4:
        stop_err( 'Usage: python gsummary.py input_file output_file expression' )
    datafile, outfile_name, expression = sys.argv[1:4]

    # Check for invalid expressions
    _validate_expression( expression )
    # Find all column references in the expression
    cols = _column_indexes( expression )

    tmp_file = tempfile.NamedTemporaryFile( 'w+b' )
    try:
        valid_rows, skipped_lines, first_invalid_line = _write_r_input( datafile, tmp_file, cols )
        # Count written rows instead of comparing skipped lines with the total
        # line count: comment and blank lines are neither skipped nor valid,
        # so that comparison misses inputs containing them (and empty files).
        if not valid_rows:
            stop_err( "Invalid column or column data values invalid for computation. See tool tips and syntax for data requirements." )

        # summary function and return labels
        summary_func = r( "function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), mean=mean( as.numeric( x ), na.rm=T ), stdev=sd( as.numeric( x ), na.rm=T ), quantile( as.numeric( x ), na.rm=TRUE ) ) }" )
        headings = [ 'sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%' ]

        set_default_mode( NO_CONVERSION )
        r_data_frame = r.read_table( tmp_file.name, header=True, sep="\t" )

        outfile = open( outfile_name, 'w' )
        try:
            # Expose each referenced column to R under its cN name.
            for col in re.findall( 'c[0-9]+', expression ):
                r.assign( col, r[ "$" ]( r_data_frame, col ) )
            # SECURITY: the user-supplied expression is evaluated by R here;
            # the whitelist check in _validate_expression() is the only guard.
            try:
                summary = summary_func( r( expression ) )
            except RException:
                # sys.exc_info() keeps this valid on every Python version.
                stop_err( "Computation resulted in the following error: %s" % str( sys.exc_info()[1] ) )
            summary = summary.as_py( BASIC_CONVERSION )
            outfile.write( "#%s\n" % "\t".join( headings ) )
            outfile.write( "%s\n" % "\t".join( [ "%g" % ( summary[ k ] ) for k in headings ] ) )
        finally:
            outfile.close()
    finally:
        # Closing the NamedTemporaryFile also removes it.
        tmp_file.close()

    if skipped_lines:
        sys.stdout.write( "Skipped %d invalid lines beginning with line #%d. See tool tips for data requirements.\n" % ( skipped_lines, first_invalid_line ) )
---|
113 | |
---|
# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
---|