[2] | 1 | #!/usr/bin/env python |
---|
| 2 | |
---|
| 3 | from galaxy import eggs |
---|
| 4 | |
---|
| 5 | import sys, string |
---|
| 6 | from rpy import * |
---|
| 7 | import numpy |
---|
| 8 | |
---|
| 9 | def stop_err(msg): |
---|
| 10 | sys.stderr.write(msg) |
---|
| 11 | sys.exit() |
---|
| 12 | |
---|
| 13 | def sscombs(s): |
---|
| 14 | if len(s) == 1: |
---|
| 15 | return [s] |
---|
| 16 | else: |
---|
| 17 | ssc = sscombs(s[1:]) |
---|
| 18 | return [s[0]] + [s[0]+comb for comb in ssc] + ssc |
---|
| 19 | |
---|
| 20 | |
---|
| 21 | infile = sys.argv[1] |
---|
| 22 | y_col = int(sys.argv[2])-1 |
---|
| 23 | x_cols = sys.argv[3].split(',') |
---|
| 24 | outfile = sys.argv[4] |
---|
| 25 | |
---|
| 26 | print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1) |
---|
| 27 | fout = open(outfile,'w') |
---|
| 28 | |
---|
| 29 | for i, line in enumerate( file ( infile )): |
---|
| 30 | line = line.rstrip('\r\n') |
---|
| 31 | if len( line )>0 and not line.startswith( '#' ): |
---|
| 32 | elems = line.split( '\t' ) |
---|
| 33 | break |
---|
| 34 | if i == 30: |
---|
| 35 | break # Hopefully we'll never get here... |
---|
| 36 | |
---|
| 37 | if len( elems )<1: |
---|
| 38 | stop_err( "The data in your input dataset is either missing or not formatted properly." ) |
---|
| 39 | |
---|
| 40 | y_vals = [] |
---|
| 41 | x_vals = [] |
---|
| 42 | |
---|
| 43 | for k,col in enumerate(x_cols): |
---|
| 44 | x_cols[k] = int(col)-1 |
---|
| 45 | x_vals.append([]) |
---|
| 46 | """ |
---|
| 47 | try: |
---|
| 48 | float( elems[x_cols[k]] ) |
---|
| 49 | except: |
---|
| 50 | try: |
---|
| 51 | msg = "This operation cannot be performed on non-numeric column %d containing value '%s'." %( col, elems[x_cols[k]] ) |
---|
| 52 | except: |
---|
| 53 | msg = "This operation cannot be performed on non-numeric data." |
---|
| 54 | stop_err( msg ) |
---|
| 55 | """ |
---|
| 56 | NA = 'NA' |
---|
| 57 | for ind,line in enumerate( file( infile )): |
---|
| 58 | if line and not line.startswith( '#' ): |
---|
| 59 | try: |
---|
| 60 | fields = line.split("\t") |
---|
| 61 | try: |
---|
| 62 | yval = float(fields[y_col]) |
---|
| 63 | except Exception, ey: |
---|
| 64 | yval = r('NA') |
---|
| 65 | #print >>sys.stderr, "ey = %s" %ey |
---|
| 66 | y_vals.append(yval) |
---|
| 67 | for k,col in enumerate(x_cols): |
---|
| 68 | try: |
---|
| 69 | xval = float(fields[col]) |
---|
| 70 | except Exception, ex: |
---|
| 71 | xval = r('NA') |
---|
| 72 | #print >>sys.stderr, "ex = %s" %ex |
---|
| 73 | x_vals[k].append(xval) |
---|
| 74 | except: |
---|
| 75 | pass |
---|
| 76 | |
---|
| 77 | x_vals1 = numpy.asarray(x_vals).transpose() |
---|
| 78 | dat= r.list(x=array(x_vals1), y=y_vals) |
---|
| 79 | |
---|
| 80 | set_default_mode(NO_CONVERSION) |
---|
| 81 | try: |
---|
| 82 | full = r.lm(r("y ~ x"), data= r.na_exclude(dat)) #full model includes all the predictor variables specified by the user |
---|
| 83 | except RException, rex: |
---|
| 84 | stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.") |
---|
| 85 | set_default_mode(BASIC_CONVERSION) |
---|
| 86 | |
---|
| 87 | summary = r.summary(full) |
---|
| 88 | fullr2 = summary.get('r.squared','NA') |
---|
| 89 | |
---|
| 90 | if fullr2 == 'NA': |
---|
| 91 | stop_error("Error in linear regression") |
---|
| 92 | |
---|
| 93 | if len(x_vals) < 10: |
---|
| 94 | s = "" |
---|
| 95 | for ch in range(len(x_vals)): |
---|
| 96 | s += str(ch) |
---|
| 97 | else: |
---|
| 98 | stop_err("This tool only works with less than 10 predictors.") |
---|
| 99 | |
---|
| 100 | print >>fout, "#Model\tR-sq\tRCVE_Terms\tRCVE_Value" |
---|
| 101 | all_combos = sorted(sscombs(s), key=len) |
---|
| 102 | all_combos.reverse() |
---|
| 103 | for j,cols in enumerate(all_combos): |
---|
| 104 | #if len(cols) == len(s): #Same as the full model above |
---|
| 105 | # continue |
---|
| 106 | if len(cols) == 1: |
---|
| 107 | x_vals1 = x_vals[int(cols)] |
---|
| 108 | else: |
---|
| 109 | x_v = [] |
---|
| 110 | for col in cols: |
---|
| 111 | x_v.append(x_vals[int(col)]) |
---|
| 112 | x_vals1 = numpy.asarray(x_v).transpose() |
---|
| 113 | dat= r.list(x=array(x_vals1), y=y_vals) |
---|
| 114 | set_default_mode(NO_CONVERSION) |
---|
| 115 | red = r.lm(r("y ~ x"), data= dat) #Reduced model |
---|
| 116 | set_default_mode(BASIC_CONVERSION) |
---|
| 117 | summary = r.summary(red) |
---|
| 118 | redr2 = summary.get('r.squared','NA') |
---|
| 119 | try: |
---|
| 120 | rcve = (float(fullr2)-float(redr2))/float(fullr2) |
---|
| 121 | except: |
---|
| 122 | rcve = 'NA' |
---|
| 123 | col_str = "" |
---|
| 124 | for col in cols: |
---|
| 125 | col_str = col_str + str(int(x_cols[int(col)]) + 1) + " " |
---|
| 126 | col_str.strip() |
---|
| 127 | rcve_col_str = "" |
---|
| 128 | for col in s: |
---|
| 129 | if col not in cols: |
---|
| 130 | rcve_col_str = rcve_col_str + str(int(x_cols[int(col)]) + 1) + " " |
---|
| 131 | rcve_col_str.strip() |
---|
| 132 | if len(cols) == len(s): #full model |
---|
| 133 | rcve_col_str = "-" |
---|
| 134 | rcve = "-" |
---|
| 135 | try: |
---|
| 136 | redr2 = "%.4f" %(float(redr2)) |
---|
| 137 | except: |
---|
| 138 | pass |
---|
| 139 | try: |
---|
| 140 | rcve = "%.4f" %(float(rcve)) |
---|
| 141 | except: |
---|
| 142 | pass |
---|
| 143 | print >>fout, "%s\t%s\t%s\t%s" %(col_str,redr2,rcve_col_str,rcve) |
---|