root/galaxy-central/tools/multivariate_stats/pca.py

リビジョン 2, 3.7 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python
2
3from galaxy import eggs
4import sys, string
5from rpy import *
6import numpy
7
8def stop_err(msg):
9    sys.stderr.write(msg)
10    sys.exit()
11
12infile = sys.argv[1]
13x_cols = sys.argv[2].split(',')
14method = sys.argv[3]
15outfile = sys.argv[4]
16outfile2 = sys.argv[5]
17
18if method == 'svd':
19    scale = center = "FALSE"
20    if sys.argv[6] == 'both':
21        scale = center = "TRUE"
22    elif sys.argv[6] == 'center':
23        center = "TRUE"
24    elif sys.argv[6] == 'scale':
25        scale = "TRUE"
26   
27fout = open(outfile,'w')
28elems = []
29for i, line in enumerate( file ( infile )):
30    line = line.rstrip('\r\n')
31    if len( line )>0 and not line.startswith( '#' ):
32        elems = line.split( '\t' )
33        break
34    if i == 30:
35        break # Hopefully we'll never get here...
36
37if len( elems )<1:
38    stop_err( "The data in your input dataset is either missing or not formatted properly." )
39
40x_vals = []
41
42for k,col in enumerate(x_cols):
43    x_cols[k] = int(col)-1
44    x_vals.append([])
45
46NA = 'NA'
47skipped = 0
48for ind,line in enumerate( file( infile )):
49    if line and not line.startswith( '#' ):
50        try:
51            fields = line.strip().split("\t")
52            valid_line = True
53            for k,col in enumerate(x_cols):
54                try:
55                    xval = float(fields[col])
56                except:
57                    skipped += 1
58                    valid_line = False
59                    break
60            if valid_line:
61                for k,col in enumerate(x_cols):
62                    xval = float(fields[col])
63                    x_vals[k].append(xval)
64        except:
65            skipped += 1
66
67x_vals1 = numpy.asarray(x_vals).transpose()
68dat= r.list(array(x_vals1))
69
70set_default_mode(NO_CONVERSION)
71try:
72    if method == "cor":
73        pc = r.princomp(r.na_exclude(dat), cor = r("TRUE"))
74    elif method == "cov":
75        pc = r.princomp(r.na_exclude(dat), cor = r("FALSE"))
76    elif method=="svd":
77        pc = r.prcomp(r.na_exclude(dat), center = r(center), scale = r(scale))
78except RException, rex:
79    stop_err("Encountered error while performing PCA on the input data: %s" %(rex))
80
81set_default_mode(BASIC_CONVERSION)
82summary = r.summary(pc, loadings="TRUE")
83ncomps = len(summary['sdev'])
84
85if type(summary['sdev']) == type({}):
86    comps = summary['sdev'].keys()
87    sd = summary['sdev'].values()
88    for i in range(ncomps):
89        sd[comps.index('Comp.%s' %(i+1))] = summary['sdev'].values()[i]
90elif type(summary['sdev']) == type([]):
91    comps=[]
92    for i in range(ncomps):
93        comps.append('Comp.%s' %(i+1))
94        sd = summary['sdev']
95
96print >>fout, "#Component\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
97print >>fout, "#Std. deviation\t%s" %("\t".join(["%.4g" % el for el in sd]))
98total_var = 0
99vars = []
100for s in sd:
101    var = s*s
102    total_var += var
103    vars.append(var)
104for i,var in enumerate(vars):
105    vars[i] = vars[i]/total_var
106       
107print >>fout, "#Proportion of variance explained\t%s" %("\t".join(["%.4g" % el for el in vars]))
108
109print >>fout, "#Loadings\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
110xcolnames = ["c%d" %(el+1) for el in x_cols]
111if 'loadings' in summary: #in case of princomp
112    loadings = 'loadings'
113elif 'rotation' in summary: #in case of prcomp
114    loadings = 'rotation'
115for i,val in enumerate(summary[loadings]):
116    print >>fout, "%s\t%s" %(xcolnames[i], "\t".join(["%.4g" % el for el in val]))
117
118print >>fout, "#Scores\t%s" %("\t".join(["%s" % el for el in range(1,ncomps+1)]))
119if 'scores' in summary: #in case of princomp
120    scores = 'scores'
121elif 'x' in summary: #in case of prcomp
122    scores = 'x'
123for obs,sc in enumerate(summary[scores]):
124    print >>fout, "%s\t%s" %(obs+1, "\t".join(["%.4g" % el for el in sc]))
125
126r.pdf( outfile2, 8, 8 )
127r.biplot(pc)
128r.dev_off()
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。