[2] | 1 | #!/usr/bin/env python |
---|
| 2 | |
---|
| 3 | from galaxy import eggs |
---|
| 4 | import sys, string |
---|
| 5 | from rpy import * |
---|
| 6 | import numpy |
---|
| 7 | |
---|
| 8 | def stop_err(msg): |
---|
| 9 | sys.stderr.write(msg) |
---|
| 10 | sys.exit() |
---|
| 11 | |
---|
| 12 | infile = sys.argv[1] |
---|
| 13 | x_cols = sys.argv[2].split(',') |
---|
| 14 | method = sys.argv[3] |
---|
| 15 | outfile = sys.argv[4] |
---|
| 16 | outfile2 = sys.argv[5] |
---|
| 17 | |
---|
| 18 | if method == 'svd': |
---|
| 19 | scale = center = "FALSE" |
---|
| 20 | if sys.argv[6] == 'both': |
---|
| 21 | scale = center = "TRUE" |
---|
| 22 | elif sys.argv[6] == 'center': |
---|
| 23 | center = "TRUE" |
---|
| 24 | elif sys.argv[6] == 'scale': |
---|
| 25 | scale = "TRUE" |
---|
| 26 | |
---|
| 27 | fout = open(outfile,'w') |
---|
| 28 | elems = [] |
---|
| 29 | for i, line in enumerate( file ( infile )): |
---|
| 30 | line = line.rstrip('\r\n') |
---|
| 31 | if len( line )>0 and not line.startswith( '#' ): |
---|
| 32 | elems = line.split( '\t' ) |
---|
| 33 | break |
---|
| 34 | if i == 30: |
---|
| 35 | break # Hopefully we'll never get here... |
---|
| 36 | |
---|
| 37 | if len( elems )<1: |
---|
| 38 | stop_err( "The data in your input dataset is either missing or not formatted properly." ) |
---|
| 39 | |
---|
| 40 | x_vals = [] |
---|
| 41 | |
---|
| 42 | for k,col in enumerate(x_cols): |
---|
| 43 | x_cols[k] = int(col)-1 |
---|
| 44 | x_vals.append([]) |
---|
| 45 | |
---|
| 46 | NA = 'NA' |
---|
| 47 | skipped = 0 |
---|
| 48 | for ind,line in enumerate( file( infile )): |
---|
| 49 | if line and not line.startswith( '#' ): |
---|
| 50 | try: |
---|
| 51 | fields = line.strip().split("\t") |
---|
| 52 | valid_line = True |
---|
| 53 | for k,col in enumerate(x_cols): |
---|
| 54 | try: |
---|
| 55 | xval = float(fields[col]) |
---|
| 56 | except: |
---|
| 57 | skipped += 1 |
---|
| 58 | valid_line = False |
---|
| 59 | break |
---|
| 60 | if valid_line: |
---|
| 61 | for k,col in enumerate(x_cols): |
---|
| 62 | xval = float(fields[col]) |
---|
| 63 | x_vals[k].append(xval) |
---|
| 64 | except: |
---|
| 65 | skipped += 1 |
---|
| 66 | |
---|
| 67 | x_vals1 = numpy.asarray(x_vals).transpose() |
---|
| 68 | dat= r.list(array(x_vals1)) |
---|
| 69 | |
---|
| 70 | set_default_mode(NO_CONVERSION) |
---|
| 71 | try: |
---|
| 72 | if method == "cor": |
---|
| 73 | pc = r.princomp(r.na_exclude(dat), cor = r("TRUE")) |
---|
| 74 | elif method == "cov": |
---|
| 75 | pc = r.princomp(r.na_exclude(dat), cor = r("FALSE")) |
---|
| 76 | elif method=="svd": |
---|
| 77 | pc = r.prcomp(r.na_exclude(dat), center = r(center), scale = r(scale)) |
---|
| 78 | except RException, rex: |
---|
| 79 | stop_err("Encountered error while performing PCA on the input data: %s" %(rex)) |
---|
| 80 | |
---|
| 81 | set_default_mode(BASIC_CONVERSION) |
---|
| 82 | summary = r.summary(pc, loadings="TRUE") |
---|
| 83 | ncomps = len(summary['sdev']) |
---|
| 84 | |
---|
| 85 | if type(summary['sdev']) == type({}): |
---|
| 86 | comps = summary['sdev'].keys() |
---|
| 87 | sd = summary['sdev'].values() |
---|
| 88 | for i in range(ncomps): |
---|
| 89 | sd[comps.index('Comp.%s' %(i+1))] = summary['sdev'].values()[i] |
---|
| 90 | elif type(summary['sdev']) == type([]): |
---|
| 91 | comps=[] |
---|
| 92 | for i in range(ncomps): |
---|
| 93 | comps.append('Comp.%s' %(i+1)) |
---|
| 94 | sd = summary['sdev'] |
---|
| 95 | |
---|
| 96 | print >>fout, "#Component\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) |
---|
| 97 | print >>fout, "#Std. deviation\t%s" %("\t".join(["%.4g" % el for el in sd])) |
---|
| 98 | total_var = 0 |
---|
| 99 | vars = [] |
---|
| 100 | for s in sd: |
---|
| 101 | var = s*s |
---|
| 102 | total_var += var |
---|
| 103 | vars.append(var) |
---|
| 104 | for i,var in enumerate(vars): |
---|
| 105 | vars[i] = vars[i]/total_var |
---|
| 106 | |
---|
| 107 | print >>fout, "#Proportion of variance explained\t%s" %("\t".join(["%.4g" % el for el in vars])) |
---|
| 108 | |
---|
| 109 | print >>fout, "#Loadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) |
---|
| 110 | xcolnames = ["c%d" %(el+1) for el in x_cols] |
---|
| 111 | if 'loadings' in summary: #in case of princomp |
---|
| 112 | loadings = 'loadings' |
---|
| 113 | elif 'rotation' in summary: #in case of prcomp |
---|
| 114 | loadings = 'rotation' |
---|
| 115 | for i,val in enumerate(summary[loadings]): |
---|
| 116 | print >>fout, "%s\t%s" %(xcolnames[i], "\t".join(["%.4g" % el for el in val])) |
---|
| 117 | |
---|
| 118 | print >>fout, "#Scores\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)])) |
---|
| 119 | if 'scores' in summary: #in case of princomp |
---|
| 120 | scores = 'scores' |
---|
| 121 | elif 'x' in summary: #in case of prcomp |
---|
| 122 | scores = 'x' |
---|
| 123 | for obs,sc in enumerate(summary[scores]): |
---|
| 124 | print >>fout, "%s\t%s" %(obs+1, "\t".join(["%.4g" % el for el in sc])) |
---|
| 125 | |
---|
| 126 | r.pdf( outfile2, 8, 8 ) |
---|
| 127 | r.biplot(pc) |
---|
| 128 | r.dev_off() |
---|