[2] | 1 | #!/usr/bin/env python |
---|
| 2 | #Greg Von Kuster |
---|
| 3 | """ |
---|
| 4 | Calculate correlations between numeric columns in a tab delim file. |
---|
| 5 | usage: %prog infile output.txt columns method |
---|
| 6 | """ |
---|
| 7 | |
---|
| 8 | import sys |
---|
| 9 | from rpy import * |
---|
| 10 | |
---|
| 11 | def stop_err(msg): |
---|
| 12 | sys.stderr.write(msg) |
---|
| 13 | sys.exit() |
---|
| 14 | |
---|
| 15 | def main(): |
---|
| 16 | method = sys.argv[4] |
---|
| 17 | assert method in ( "pearson", "kendall", "spearman" ) |
---|
| 18 | |
---|
| 19 | try: |
---|
| 20 | columns = map( int, sys.argv[3].split( ',' ) ) |
---|
| 21 | except: |
---|
| 22 | stop_err( "Problem determining columns, perhaps your query does not contain a column of numerical data." ) |
---|
| 23 | |
---|
| 24 | matrix = [] |
---|
| 25 | skipped_lines = 0 |
---|
| 26 | first_invalid_line = 0 |
---|
| 27 | invalid_value = '' |
---|
| 28 | invalid_column = 0 |
---|
| 29 | |
---|
| 30 | for i, line in enumerate( file( sys.argv[1] ) ): |
---|
| 31 | valid = True |
---|
| 32 | line = line.rstrip('\n\r') |
---|
| 33 | |
---|
| 34 | if line and not line.startswith( '#' ): |
---|
| 35 | # Extract values and convert to floats |
---|
| 36 | row = [] |
---|
| 37 | for column in columns: |
---|
| 38 | column -= 1 |
---|
| 39 | fields = line.split( "\t" ) |
---|
| 40 | if len( fields ) <= column: |
---|
| 41 | valid = False |
---|
| 42 | else: |
---|
| 43 | val = fields[column] |
---|
| 44 | if val.lower() == "na": |
---|
| 45 | row.append( float( "nan" ) ) |
---|
| 46 | else: |
---|
| 47 | try: |
---|
| 48 | row.append( float( fields[column] ) ) |
---|
| 49 | except: |
---|
| 50 | valid = False |
---|
| 51 | skipped_lines += 1 |
---|
| 52 | if not first_invalid_line: |
---|
| 53 | first_invalid_line = i+1 |
---|
| 54 | invalid_value = fields[column] |
---|
| 55 | invalid_column = column+1 |
---|
| 56 | else: |
---|
| 57 | valid = False |
---|
| 58 | skipped_lines += 1 |
---|
| 59 | if not first_invalid_line: |
---|
| 60 | first_invalid_line = i+1 |
---|
| 61 | |
---|
| 62 | if valid: |
---|
| 63 | matrix.append( row ) |
---|
| 64 | |
---|
| 65 | if skipped_lines < i: |
---|
| 66 | try: |
---|
| 67 | out = open( sys.argv[2], "w" ) |
---|
| 68 | except: |
---|
| 69 | stop_err( "Unable to open output file" ) |
---|
| 70 | |
---|
| 71 | # Run correlation |
---|
| 72 | try: |
---|
| 73 | value = r.cor( array( matrix ), use="pairwise.complete.obs", method=method ) |
---|
| 74 | except Exception, exc: |
---|
| 75 | out.close() |
---|
| 76 | stop_err("%s" %str( exc )) |
---|
| 77 | for row in value: |
---|
| 78 | print >> out, "\t".join( map( str, row ) ) |
---|
| 79 | out.close() |
---|
| 80 | |
---|
| 81 | if skipped_lines > 0: |
---|
| 82 | msg = "..Skipped %d lines starting with line #%d. " %( skipped_lines, first_invalid_line ) |
---|
| 83 | if invalid_value and invalid_column > 0: |
---|
| 84 | msg += "Value '%s' in column %d is not numeric." % ( invalid_value, invalid_column ) |
---|
| 85 | print msg |
---|
| 86 | |
---|
| 87 | if __name__ == "__main__": |
---|
| 88 | main() |
---|