[2] | 1 | #!/usr/bin/env python |
---|
| 2 | |
---|
| 3 | import sys, re, tempfile |
---|
| 4 | from rpy import * |
---|
# Older py compatibility: interpreters without a builtin set() fall back to
# the sets module. Only the missing-name case is handled, so unrelated
# failures (including KeyboardInterrupt) are no longer silently swallowed.
try:
    set()
except NameError:
    from sets import Set as set

# Refuse to run on interpreters older than Python 2.4.
assert sys.version_info[:2] >= ( 2, 4 )
---|
| 12 | |
---|
def stop_err( msg ):
    """Write msg to stderr and terminate the process with a failure status.

    msg is written exactly as given (no newline is appended). The process
    exits with status 1 so that callers inspecting the exit code see the
    failure; a bare sys.exit() would have reported success.

    Raises SystemExit.
    """
    sys.stderr.write( msg )
    sys.exit( 1 )
---|
| 16 | |
---|
def S3_METHODS( all="key" ):
    """Return the whitelist of R terms and operators permitted in an expression.

    all -- selector; only the value "key" is recognized.

    Returns a dict with two lists: 'Math' (allowed function names, plus "c"
    so that column references such as c1 pass the term check) and 'Ops'
    (allowed operator / punctuation tokens). Returns None for any other
    selector value.
    """
    Group_Math = [ "abs", "sign", "sqrt", "floor", "ceiling", "trunc", "round", "signif",
                   "exp", "log", "cos", "sin", "tan", "acos", "asin", "atan", "cosh", "sinh", "tanh",
                   "acosh", "asinh", "atanh", "lgamma", "gamma", "gammaCody", "digamma", "trigamma",
                   "cumsum", "cumprod", "cummax", "cummin", "c" ]
    Group_Ops = [ "+", "-", "*", "/", "^", "%%", "%/%", "&", "|", "!", "==", "!=", "<", "<=", ">=", ">", "(", ")", "~", "," ]
    # Compare by value: an identity test ('is') against a string literal only
    # works when the interpreter happens to intern both strings.
    if all == "key":
        return { 'Math' : Group_Math, 'Ops' : Group_Ops }
    return None
---|
| 25 | |
---|
def _check_expression( expression ):
    """Abort via stop_err() unless every term and operator run in expression is whitelisted."""
    allowed = S3_METHODS()
    math_allowed = allowed[ 'Math' ]
    ops_allowed = allowed[ 'Ops' ]

    for word in re.findall( r'[a-zA-Z]+', expression ):
        if word not in math_allowed:
            stop_err( "Invalid expression '%s': term '%s' is not recognized or allowed" % ( expression, word ) )

    # Each contiguous run of non-alphanumeric, non-space characters is checked
    # as a single token. This is intentionally strict: splitting runs into
    # individual operators would let sequences such as "<-" through.
    symbols = set()
    for symbol in re.findall( r'[^a-z0-9\s]+', expression ):
        if symbol not in ops_allowed:
            stop_err( "Invalid expression '%s': operator '%s' is not recognized or allowed" % ( expression, symbol ) )
        symbols.add( symbol )
    if len( symbols ) == 1 and ',' in symbols:
        # User may have entered a comma-separated list r_data_frame columns
        stop_err( "Invalid columns '%s': this tool requires a single column or expression" % expression )


def _column_indexes( expression ):
    """Return the zero-based field index for each cN reference in expression, in order of appearance."""
    # The pattern guarantees that everything after the leading 'c' is digits,
    # so int() cannot fail here.
    # NOTE: a reference to "c0" yields -1, which indexes the last field of a row.
    return [ int( ref[1:] ) - 1 for ref in re.findall( r'c[0-9]+', expression ) ]


def _write_r_table( datafile, cols, tmp_file ):
    """Copy the referenced columns of datafile into tmp_file as a tab-separated table with a header row.

    Blank lines and lines starting with '#' are ignored. A data line is
    skipped when any referenced column is missing or cannot be parsed by
    float().

    Returns ( valid_lines, skipped_lines, first_invalid_line ), where
    first_invalid_line is the 1-based number of the first skipped line, or 0
    if no line was skipped.
    """
    # Write the R header row to the temporary file
    tmp_file.write( "%s\n" % "\t".join( [ "c%s" % str( col + 1 ) for col in cols ] ) )
    valid_lines = 0
    skipped_lines = 0
    first_invalid_line = 0
    infile = open( datafile )
    try:
        for i, line in enumerate( infile ):
            line = line.rstrip( '\r\n' )
            if line and not line.startswith( '#' ):
                fields = line.split( '\t' )
                try:
                    for col in cols:
                        float( fields[ col ] )
                except ( ValueError, IndexError ):
                    skipped_lines += 1
                    if not first_invalid_line:
                        first_invalid_line = i + 1
                else:
                    # Write the R data row to the temporary file
                    tmp_file.write( "%s\n" % "\t".join( [ fields[ col ] for col in cols ] ) )
                    valid_lines += 1
    finally:
        infile.close()
    tmp_file.flush()
    return valid_lines, skipped_lines, first_invalid_line


def main():
    """Compute summary statistics of an R expression over columns of a tabular file.

    Command line: input_file output_file expression

    The expression is validated against the S3_METHODS() whitelist, the
    columns it references (c1, c2, ...) are copied to a temporary table, and
    the expression is evaluated in R. A header line and one row holding sum,
    mean, stdev and the five quantiles are written to output_file. If any
    input lines were skipped, a notice is printed to stdout.

    Exits through stop_err() on bad usage, a rejected expression, input with
    no usable data rows, or an R evaluation error.
    """
    try:
        datafile = sys.argv[1]
        outfile_name = sys.argv[2]
        expression = sys.argv[3]
    except IndexError:
        stop_err( 'Usage: python gsummary.py input_file ouput_file expression' )

    # Check for invalid expressions
    _check_expression( expression )

    # Find all column references in the expression
    cols = _column_indexes( expression )

    tmp_file = tempfile.NamedTemporaryFile( 'w+b' )
    try:
        valid_lines, skipped_lines, first_invalid_line = _write_r_table( datafile, cols, tmp_file )

        # Count usable rows directly: comparing skipped lines with the total
        # line count misses the all-invalid case whenever the file also holds
        # blank or comment lines.
        if not valid_lines:
            stop_err( "Invalid column or column data values invalid for computation. See tool tips and syntax for data requirements." )

        # summary function and return labels
        summary_func = r( "function( x ) { c( sum=sum( as.numeric( x ), na.rm=T ), mean=mean( as.numeric( x ), na.rm=T ), stdev=sd( as.numeric( x ), na.rm=T ), quantile( as.numeric( x ), na.rm=TRUE ) ) }" )
        headings = [ 'sum', 'mean', 'stdev', '0%', '25%', '50%', '75%', '100%' ]
        headings_str = "\t".join( headings )

        # The conversion mode is switched only after summary_func has been
        # created, exactly as in the original statement order.
        set_default_mode( NO_CONVERSION )
        r_data_frame = r.read_table( tmp_file.name, header=True, sep="\t" )

        outfile = open( outfile_name, 'w' )
        try:
            # Expose every referenced column to R under its cN name.
            for col in re.findall( r'c[0-9]+', expression ):
                r.assign( col, r[ "$" ]( r_data_frame, col ) )
            try:
                summary = summary_func( r( expression ) )
            except RException:
                # sys.exc_info() avoids both the Python-2-only "except X, e"
                # form and the "as" form that needs a newer interpreter.
                stop_err( "Computation resulted in the following error: %s" % str( sys.exc_info()[1] ) )
            summary = summary.as_py( BASIC_CONVERSION )
            outfile.write( "#%s\n" % headings_str )
            outfile.write( "%s\n" % "\t".join( [ "%g" % ( summary[ k ] ) for k in headings ] ) )
        finally:
            outfile.close()
    finally:
        # Closing the NamedTemporaryFile also removes it from disk.
        tmp_file.close()

    if skipped_lines:
        print( "Skipped %d invalid lines beginning with line #%d. See tool tips for data requirements." % ( skipped_lines, first_invalid_line ) )
---|
| 113 | |
---|
# Run the tool only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
---|