1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | from galaxy import eggs |
---|
4 | |
---|
5 | import sys, string |
---|
6 | from rpy import * |
---|
7 | import numpy |
---|
8 | |
---|
9 | def stop_err(msg): |
---|
10 | sys.stderr.write(msg) |
---|
11 | sys.exit() |
---|
12 | |
---|
13 | def sscombs(s): |
---|
14 | if len(s) == 1: |
---|
15 | return [s] |
---|
16 | else: |
---|
17 | ssc = sscombs(s[1:]) |
---|
18 | return [s[0]] + [s[0]+comb for comb in ssc] + ssc |
---|
19 | |
---|
20 | |
---|
21 | infile = sys.argv[1] |
---|
22 | y_col = int(sys.argv[2])-1 |
---|
23 | x_cols = sys.argv[3].split(',') |
---|
24 | outfile = sys.argv[4] |
---|
25 | |
---|
26 | print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1) |
---|
27 | fout = open(outfile,'w') |
---|
28 | |
---|
29 | for i, line in enumerate( file ( infile )): |
---|
30 | line = line.rstrip('\r\n') |
---|
31 | if len( line )>0 and not line.startswith( '#' ): |
---|
32 | elems = line.split( '\t' ) |
---|
33 | break |
---|
34 | if i == 30: |
---|
35 | break # Hopefully we'll never get here... |
---|
36 | |
---|
37 | if len( elems )<1: |
---|
38 | stop_err( "The data in your input dataset is either missing or not formatted properly." ) |
---|
39 | |
---|
40 | y_vals = [] |
---|
41 | x_vals = [] |
---|
42 | |
---|
43 | for k,col in enumerate(x_cols): |
---|
44 | x_cols[k] = int(col)-1 |
---|
45 | x_vals.append([]) |
---|
46 | """ |
---|
47 | try: |
---|
48 | float( elems[x_cols[k]] ) |
---|
49 | except: |
---|
50 | try: |
---|
51 | msg = "This operation cannot be performed on non-numeric column %d containing value '%s'." %( col, elems[x_cols[k]] ) |
---|
52 | except: |
---|
53 | msg = "This operation cannot be performed on non-numeric data." |
---|
54 | stop_err( msg ) |
---|
55 | """ |
---|
56 | NA = 'NA' |
---|
57 | for ind,line in enumerate( file( infile )): |
---|
58 | if line and not line.startswith( '#' ): |
---|
59 | try: |
---|
60 | fields = line.split("\t") |
---|
61 | try: |
---|
62 | yval = float(fields[y_col]) |
---|
63 | except Exception, ey: |
---|
64 | yval = r('NA') |
---|
65 | #print >>sys.stderr, "ey = %s" %ey |
---|
66 | y_vals.append(yval) |
---|
67 | for k,col in enumerate(x_cols): |
---|
68 | try: |
---|
69 | xval = float(fields[col]) |
---|
70 | except Exception, ex: |
---|
71 | xval = r('NA') |
---|
72 | #print >>sys.stderr, "ex = %s" %ex |
---|
73 | x_vals[k].append(xval) |
---|
74 | except: |
---|
75 | pass |
---|
76 | |
---|
77 | x_vals1 = numpy.asarray(x_vals).transpose() |
---|
78 | dat= r.list(x=array(x_vals1), y=y_vals) |
---|
79 | |
---|
80 | set_default_mode(NO_CONVERSION) |
---|
81 | try: |
---|
82 | full = r.lm(r("y ~ x"), data= r.na_exclude(dat)) #full model includes all the predictor variables specified by the user |
---|
83 | except RException, rex: |
---|
84 | stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.") |
---|
85 | set_default_mode(BASIC_CONVERSION) |
---|
86 | |
---|
87 | summary = r.summary(full) |
---|
88 | fullr2 = summary.get('r.squared','NA') |
---|
89 | |
---|
90 | if fullr2 == 'NA': |
---|
91 | stop_error("Error in linear regression") |
---|
92 | |
---|
93 | if len(x_vals) < 10: |
---|
94 | s = "" |
---|
95 | for ch in range(len(x_vals)): |
---|
96 | s += str(ch) |
---|
97 | else: |
---|
98 | stop_err("This tool only works with less than 10 predictors.") |
---|
99 | |
---|
100 | print >>fout, "#Model\tR-sq\tRCVE_Terms\tRCVE_Value" |
---|
101 | all_combos = sorted(sscombs(s), key=len) |
---|
102 | all_combos.reverse() |
---|
103 | for j,cols in enumerate(all_combos): |
---|
104 | #if len(cols) == len(s): #Same as the full model above |
---|
105 | # continue |
---|
106 | if len(cols) == 1: |
---|
107 | x_vals1 = x_vals[int(cols)] |
---|
108 | else: |
---|
109 | x_v = [] |
---|
110 | for col in cols: |
---|
111 | x_v.append(x_vals[int(col)]) |
---|
112 | x_vals1 = numpy.asarray(x_v).transpose() |
---|
113 | dat= r.list(x=array(x_vals1), y=y_vals) |
---|
114 | set_default_mode(NO_CONVERSION) |
---|
115 | red = r.lm(r("y ~ x"), data= dat) #Reduced model |
---|
116 | set_default_mode(BASIC_CONVERSION) |
---|
117 | summary = r.summary(red) |
---|
118 | redr2 = summary.get('r.squared','NA') |
---|
119 | try: |
---|
120 | rcve = (float(fullr2)-float(redr2))/float(fullr2) |
---|
121 | except: |
---|
122 | rcve = 'NA' |
---|
123 | col_str = "" |
---|
124 | for col in cols: |
---|
125 | col_str = col_str + str(int(x_cols[int(col)]) + 1) + " " |
---|
126 | col_str.strip() |
---|
127 | rcve_col_str = "" |
---|
128 | for col in s: |
---|
129 | if col not in cols: |
---|
130 | rcve_col_str = rcve_col_str + str(int(x_cols[int(col)]) + 1) + " " |
---|
131 | rcve_col_str.strip() |
---|
132 | if len(cols) == len(s): #full model |
---|
133 | rcve_col_str = "-" |
---|
134 | rcve = "-" |
---|
135 | try: |
---|
136 | redr2 = "%.4f" %(float(redr2)) |
---|
137 | except: |
---|
138 | pass |
---|
139 | try: |
---|
140 | rcve = "%.4f" %(float(rcve)) |
---|
141 | except: |
---|
142 | pass |
---|
143 | print >>fout, "%s\t%s\t%s\t%s" %(col_str,redr2,rcve_col_str,rcve) |
---|