| 1 | #!/usr/bin/env python |
|---|
| 2 | |
|---|
| 3 | from galaxy import eggs |
|---|
| 4 | import sys, string |
|---|
| 5 | from rpy import * |
|---|
| 6 | import numpy |
|---|
| 7 | |
|---|
| 8 | def stop_err(msg): |
|---|
| 9 | sys.stderr.write(msg) |
|---|
| 10 | sys.exit() |
|---|
| 11 | |
|---|
| 12 | infile = sys.argv[1] |
|---|
| 13 | x_cols = sys.argv[2].split(',') |
|---|
| 14 | y_cols = sys.argv[3].split(',') |
|---|
| 15 | |
|---|
| 16 | x_scale = x_center = "FALSE" |
|---|
| 17 | if sys.argv[4] == 'both': |
|---|
| 18 | x_scale = x_center = "TRUE" |
|---|
| 19 | elif sys.argv[4] == 'center': |
|---|
| 20 | x_center = "TRUE" |
|---|
| 21 | elif sys.argv[4] == 'scale': |
|---|
| 22 | x_scale = "TRUE" |
|---|
| 23 | |
|---|
| 24 | y_scale = y_center = "FALSE" |
|---|
| 25 | if sys.argv[5] == 'both': |
|---|
| 26 | y_scale = y_center = "TRUE" |
|---|
| 27 | elif sys.argv[5] == 'center': |
|---|
| 28 | y_center = "TRUE" |
|---|
| 29 | elif sys.argv[5] == 'scale': |
|---|
| 30 | y_scale = "TRUE" |
|---|
| 31 | |
|---|
| 32 | std_scores = "FALSE" |
|---|
| 33 | if sys.argv[6] == "yes": |
|---|
| 34 | std_scores = "TRUE" |
|---|
| 35 | |
|---|
| 36 | outfile = sys.argv[7] |
|---|
| 37 | outfile2 = sys.argv[8] |
|---|
| 38 | |
|---|
| 39 | fout = open(outfile,'w') |
|---|
| 40 | elems = [] |
|---|
| 41 | for i, line in enumerate( file ( infile )): |
|---|
| 42 | line = line.rstrip('\r\n') |
|---|
| 43 | if len( line )>0 and not line.startswith( '#' ): |
|---|
| 44 | elems = line.split( '\t' ) |
|---|
| 45 | break |
|---|
| 46 | if i == 30: |
|---|
| 47 | break # Hopefully we'll never get here... |
|---|
| 48 | |
|---|
| 49 | if len( elems )<1: |
|---|
| 50 | stop_err( "The data in your input dataset is either missing or not formatted properly." ) |
|---|
| 51 | |
|---|
| 52 | x_vals = [] |
|---|
| 53 | |
|---|
| 54 | for k,col in enumerate(x_cols): |
|---|
| 55 | x_cols[k] = int(col)-1 |
|---|
| 56 | x_vals.append([]) |
|---|
| 57 | |
|---|
| 58 | y_vals = [] |
|---|
| 59 | |
|---|
| 60 | for k,col in enumerate(y_cols): |
|---|
| 61 | y_cols[k] = int(col)-1 |
|---|
| 62 | y_vals.append([]) |
|---|
| 63 | |
|---|
| 64 | skipped = 0 |
|---|
| 65 | for ind,line in enumerate( file( infile )): |
|---|
| 66 | if line and not line.startswith( '#' ): |
|---|
| 67 | try: |
|---|
| 68 | fields = line.strip().split("\t") |
|---|
| 69 | valid_line = True |
|---|
| 70 | for col in x_cols+y_cols: |
|---|
| 71 | try: |
|---|
| 72 | assert float(fields[col]) |
|---|
| 73 | except: |
|---|
| 74 | skipped += 1 |
|---|
| 75 | valid_line = False |
|---|
| 76 | break |
|---|
| 77 | if valid_line: |
|---|
| 78 | for k,col in enumerate(x_cols): |
|---|
| 79 | try: |
|---|
| 80 | xval = float(fields[col]) |
|---|
| 81 | except: |
|---|
| 82 | xval = NaN# |
|---|
| 83 | x_vals[k].append(xval) |
|---|
| 84 | for k,col in enumerate(y_cols): |
|---|
| 85 | try: |
|---|
| 86 | yval = float(fields[col]) |
|---|
| 87 | except: |
|---|
| 88 | yval = NaN# |
|---|
| 89 | y_vals[k].append(yval) |
|---|
| 90 | except: |
|---|
| 91 | skipped += 1 |
|---|
| 92 | |
|---|
| 93 | x_vals1 = numpy.asarray(x_vals).transpose() |
|---|
| 94 | y_vals1 = numpy.asarray(y_vals).transpose() |
|---|
| 95 | |
|---|
| 96 | x_dat= r.list(array(x_vals1)) |
|---|
| 97 | y_dat= r.list(array(y_vals1)) |
|---|
| 98 | |
|---|
| 99 | try: |
|---|
| 100 | r.suppressWarnings(r.library("yacca")) |
|---|
| 101 | except: |
|---|
| 102 | stop_err("Missing R library yacca.") |
|---|
| 103 | |
|---|
| 104 | set_default_mode(NO_CONVERSION) |
|---|
| 105 | try: |
|---|
| 106 | xcolnames = ["c%d" %(el+1) for el in x_cols] |
|---|
| 107 | ycolnames = ["c%d" %(el+1) for el in y_cols] |
|---|
| 108 | cc = r.cca(x=x_dat, y=y_dat, xlab=xcolnames, ylab=ycolnames, xcenter=r(x_center), ycenter=r(y_center), xscale=r(x_scale), yscale=r(y_scale), standardize_scores=r(std_scores)) |
|---|
| 109 | ftest = r.F_test_cca(cc) |
|---|
| 110 | except RException, rex: |
|---|
| 111 | stop_err("Encountered error while performing CCA on the input data: %s" %(rex)) |
|---|
| 112 | |
|---|
| 113 | set_default_mode(BASIC_CONVERSION) |
|---|
| 114 | summary = r.summary(cc) |
|---|
| 115 | |
|---|
| 116 | ncomps = len(summary['corr']) |
|---|
| 117 | comps = summary['corr'].keys() |
|---|
| 118 | corr = summary['corr'].values() |
|---|
| 119 | xlab = summary['xlab'] |
|---|
| 120 | ylab = summary['ylab'] |
|---|
| 121 | |
|---|
| 122 | for i in range(ncomps): |
|---|
| 123 | corr[comps.index('CV %s' %(i+1))] = summary['corr'].values()[i] |
|---|
| 124 | |
|---|
| 125 | ftest=ftest.as_py() |
|---|
| 126 | print >>fout, "#Component\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) |
|---|
| 127 | print >>fout, "#Correlation\t%s" %("\t".join(["%.4g" % el for el in corr])) |
|---|
| 128 | print >>fout, "#F-statistic\t%s" %("\t".join(["%.4g" % el for el in ftest['statistic']])) |
|---|
| 129 | print >>fout, "#p-value\t%s" %("\t".join(["%.4g" % el for el in ftest['p.value']])) |
|---|
| 130 | |
|---|
| 131 | print >>fout, "#X-Coefficients\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) |
|---|
| 132 | for i,val in enumerate(summary['xcoef']): |
|---|
| 133 | print >>fout, "%s\t%s" %(xlab[i], "\t".join(["%.4g" % el for el in val])) |
|---|
| 134 | |
|---|
| 135 | print >>fout, "#Y-Coefficients\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) |
|---|
| 136 | for i,val in enumerate(summary['ycoef']): |
|---|
| 137 | print >>fout, "%s\t%s" %(ylab[i], "\t".join(["%.4g" % el for el in val])) |
|---|
| 138 | |
|---|
| 139 | print >>fout, "#X-Loadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) |
|---|
| 140 | for i,val in enumerate(summary['xstructcorr']): |
|---|
| 141 | print >>fout, "%s\t%s" %(xlab[i], "\t".join(["%.4g" % el for el in val])) |
|---|
| 142 | |
|---|
| 143 | print >>fout, "#Y-Loadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) |
|---|
| 144 | for i,val in enumerate(summary['ystructcorr']): |
|---|
| 145 | print >>fout, "%s\t%s" %(ylab[i], "\t".join(["%.4g" % el for el in val])) |
|---|
| 146 | |
|---|
| 147 | print >>fout, "#X-CrossLoadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) |
|---|
| 148 | for i,val in enumerate(summary['xcrosscorr']): |
|---|
| 149 | print >>fout, "%s\t%s" %(xlab[i], "\t".join(["%.4g" % el for el in val])) |
|---|
| 150 | |
|---|
| 151 | print >>fout, "#Y-CrossLoadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) |
|---|
| 152 | for i,val in enumerate(summary['ycrosscorr']): |
|---|
| 153 | print >>fout, "%s\t%s" %(ylab[i], "\t".join(["%.4g" % el for el in val])) |
|---|
| 154 | |
|---|
| 155 | r.pdf( outfile2, 8, 8 ) |
|---|
| 156 | #r.plot(cc) |
|---|
| 157 | for i in range(ncomps): |
|---|
| 158 | r.helio_plot(cc, cv = i+1, main = r.paste("Explained Variance for CV",i+1), type = "variance") |
|---|
| 159 | r.dev_off() |
|---|