Context Navigation

rgutils.py

リビジョン 2, 11.2 KB (コミッタ: hatakeyama, 15 年前)
import galaxy-central

行番号
1	# utilities for rgenetics
2	#
3	# copyright 2009 ross lazarus
4	# released under the LGPL
5	#
6
7	import subprocess, os, sys, time, tempfile,string,plinkbinJZ
8
9	galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
10	<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
11	<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
12	<head>
13	<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
14	<meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" />
15	<title></title>
16	<link rel="stylesheet" href="/static/style/base.css" type="text/css" />
17	</head>
18	<body>
19	<div class="document">
20	"""
21	galhtmlattr = """<h3><a href="http://rgenetics.org">Rgenetics</a> tool %s run at %s</h3>"""
22	galhtmlpostfix = """</div></body></html>\n"""
23
24	plinke = 'plink' # changed jan 2010 - all exes must be on path
25	rexe = 'R' # to avoid cluster/platform dependencies
26	smartpca = 'smartpca.perl'
27
28	def timenow():
29	"""return current time as a string
30	"""
31	return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
32
33	def fail( message ):
34	print >> sys.stderr, message
35	return -1
36
37	def whereis(program):
38	for path in os.environ.get('PATH', '').split(':'):
39	if os.path.exists(os.path.join(path, program)) and \
40	not os.path.isdir(os.path.join(path, program)):
41	return os.path.join(path, program)
42	return None
43
44
45	def oRRun(rcmd=[],outdir=None,title='myR',rexe='R'):
46	"""
47	run an r script, lines in rcmd,
48	in a temporary directory
49	move everything, r script and all back to outdir which will be an html file
50
51
52	# test
53	RRun(rcmd=['print("hello cruel world")','q()'],title='test')
54	"""
55	rlog = []
56	print '### rexe = %s' % rexe
57	assert os.path.isfile(rexe)
58	rname = '%s.R' % title
59	stoname = '%s.R.log' % title
60	rfname = rname
61	stofname = stoname
62	if outdir: # want a specific path
63	rfname = os.path.join(outdir,rname)
64	stofname = os.path.join(outdir,stoname)
65	try:
66	os.makedirs(outdir) # might not be there yet...
67	except:
68	pass
69	else:
70	outdir = tempfile.mkdtemp(prefix=title)
71	rfname = os.path.join(outdir,rname)
72	stofname = os.path.join(outdir,stoname)
73	rmoutdir = True
74	f = open(rfname,'w')
75	if type(rcmd) == type([]):
76	f.write('\n'.join(rcmd))
77	else: # string
78	f.write(rcmd)
79	f.write('\n')
80	f.close()
81	sto = file(stofname,'w')
82	vcl = [rexe,"--vanilla --slave", '<', rfname ]
83	x = subprocess.Popen(' '.join(vcl),shell=True,stderr=sto,stdout=sto,cwd=outdir)
84	retval = x.wait()
85	sto.close()
86	rlog = file(stofname,'r').readlines()
87	rlog.insert(0,'## found R at %s' % rexe)
88	if outdir <> None:
89	flist = os.listdir(outdir)
90	else:
91	flist = os.listdir('.')
92	flist.sort
93	flist = [(x,x) for x in flist]
94	for i,x in enumerate(flist):
95	if x == rname:
96	flist[i] = (x,'R script for %s' % title)
97	elif x == stoname:
98	flist[i] = (x,'R log for %s' % title)
99	if False and rmoutdir:
100	os.removedirs(outdir)
101	return rlog,flist # for html layout
102
103
104
105
106	def RRun(rcmd=[],outdir=None,title='myR',tidy=True):
107	"""
108	run an r script, lines in rcmd,
109	in a temporary directory
110	move everything, r script and all back to outdir which will be an html file
111
112
113	# test
114	RRun(rcmd=['print("hello cruel world")','q()'],title='test')
115	echo "a <- c(5, 5); b <- c(0.5, 0.5)" \| cat - RScript.R \| R --slave \ --vanilla
116	suggested by http://tolstoy.newcastle.edu.au/R/devel/05/09/2448.html
117	"""
118	killme = string.punctuation + string.whitespace
119	trantab = string.maketrans(killme,'_'*len(killme))
120	title = title.translate(trantab)
121	rlog = []
122	tempout=False
123	rname = '%s.R' % title
124	stoname = '%s.R.log' % title
125	cwd = os.getcwd()
126	if outdir: # want a specific path
127	try:
128	os.makedirs(outdir) # might not be there yet...
129	except:
130	pass
131	os.chdir(outdir)
132	if type(rcmd) == type([]):
133	script = '\n'.join(rcmd)
134	else: # string
135	script = rcmd
136	sto = file(stoname,'w')
137	rscript = file(rname,'w')
138	rscript.write(script)
139	rscript.write('\n#R script autogenerated by rgenetics/rgutils.py on %s\n' % timenow())
140	rscript.close()
141	vcl = '%s --slave --vanilla < %s' % (rexe,rname)
142	if outdir:
143	x = subprocess.Popen(vcl,shell=True,stderr=sto,stdout=sto,cwd=outdir)
144	else:
145	x = subprocess.Popen(vcl,shell=True,stderr=sto,stdout=sto)
146	retval = x.wait()
147	sto.close()
148	rlog = file(stoname,'r').readlines()
149	if retval <> 0:
150	rlog.insert(0,'Nonzero exit code = %d' % retval) # indicate failure
151	if outdir:
152	flist = os.listdir(outdir)
153	else:
154	flist = os.listdir(os.getcwd())
155	flist.sort
156	flist = [(x,x) for x in flist]
157	for i,x in enumerate(flist):
158	if x == rname:
159	flist[i] = (x,'R script for %s' % title)
160	elif x == stoname:
161	flist[i] = (x,'R log for %s' % title)
162	if outdir:
163	os.chdir(cwd)
164	return rlog,flist # for html layout
165
166	def runPlink(bfn='bar',ofn='foo',logf=None,plinktasks=[],cd='./',vclbase = []):
167	"""run a series of plink tasks and append log results to stdout
168	vcl has a list of parameters for the spawnv
169	common settings can all go in the vclbase list and are added to each plinktask
170	"""
171	# root for all
172	fplog,plog = tempfile.mkstemp()
173	if type(logf) == type(' '): # open otherwise assume is file - ugh I'm in a hurry
174	mylog = file(logf,'a+')
175	else:
176	mylog = logf
177	mylog.write('## Rgenetics: http://rgenetics.org Galaxy Tools rgQC.py Plink runner\n')
178	for task in plinktasks: # each is a list
179	vcl = vclbase + task
180	sto = file(plog,'w')
181	x = subprocess.Popen(' '.join(vcl),shell=True,stdout=sto,stderr=sto,cwd=cd)
182	retval = x.wait()
183	sto.close()
184	try:
185	lplog = file(plog,'r').read()
186	mylog.write(lplog)
187	os.unlink(plog) # no longer needed
188	except:
189	mylog.write('### %s Strange - no std out from plink when running command line\n%s' % (timenow(),' '.join(vcl)))
190
191	def pruneLD(plinktasks=[],cd='./',vclbase = []):
192	"""
193	plink blathers when doing pruning - ignore
194	Linkage disequilibrium based SNP pruning
195	if a million snps in 3 billion base pairs, have mean 3k spacing
196	assume 40-60k of ld in ceu, a window of 120k width is about 40 snps
197	so lots more is perhaps less efficient - each window computational cost is
198	ON^2 unless the code is smart enough to avoid unecessary computation where
199	allele frequencies make it impossible to see ld > the r^2 cutoff threshold
200	So, do a window and move forward 20?
201	The fine Plink docs at http://pngu.mgh.harvard.edu/~purcell/plink/summary.shtml#prune
202	reproduced below
203
204	Sometimes it is useful to generate a pruned subset of SNPs that are in approximate linkage equilibrium with each other. This can be achieved via two commands: --indep which prunes based on the variance inflation factor (VIF), which recursively removes SNPs within a sliding window; second, --indep-pairwise which is similar, except it is based only on pairwise genotypic correlation.
205
206	Hint The output of either of these commands is two lists of SNPs: those that are pruned out and those that are not. A separate command using the --extract or --exclude option is necessary to actually perform the pruning.
207
208	The VIF pruning routine is performed:
209	plink --file data --indep 50 5 2
210
211	will create files
212
213	plink.prune.in
214	plink.prune.out
215
216	Each is a simlpe list of SNP IDs; both these files can subsequently be specified as the argument for
217	a --extract or --exclude command.
218
219	The parameters for --indep are: window size in SNPs (e.g. 50), the number of SNPs to shift the
220	window at each step (e.g. 5), the VIF threshold. The VIF is 1/(1-R^2) where R^2 is the multiple correlation coefficient for a SNP being regressed on all other SNPs simultaneously. That is, this considers the correlations between SNPs but also between linear combinations of SNPs. A VIF of 10 is often taken to represent near collinearity problems in standard multiple regression analyses (i.e. implies R^2 of 0.9). A VIF of 1 would imply that the SNP is completely independent of all other SNPs. Practically, values between 1.5 and 2 should probably be used; particularly in small samples, if this threshold is too low and/or the window size is too large, too many SNPs may be removed.
221
222	The second procedure is performed:
223	plink --file data --indep-pairwise 50 5 0.5
224
225	This generates the same output files as the first version; the only difference is that a
226	simple pairwise threshold is used. The first two parameters (50 and 5) are the same as above (window size and step); the third parameter represents the r^2 threshold. Note: this represents the pairwise SNP-SNP metric now, not the multiple correlation coefficient; also note, this is based on the genotypic correlation, i.e. it does not involve phasing.
227
228	To give a concrete example: the command above that specifies 50 5 0.5 would a) consider a
229	window of 50 SNPs, b) calculate LD between each pair of SNPs in the window, b) remove one of a pair of SNPs if the LD is greater than 0.5, c) shift the window 5 SNPs forward and repeat the procedure.
230
231	To make a new, pruned file, then use something like (in this example, we also convert the
232	standard PED fileset to a binary one):
233	plink --file data --extract plink.prune.in --make-bed --out pruneddata
234	"""
235	fplog,plog = tempfile.mkstemp()
236	alog = []
237	alog.append('## Rgenetics: http://rgenetics.org Galaxy Tools rgQC.py Plink pruneLD runner\n')
238	for task in plinktasks: # each is a list
239	vcl = vclbase + task
240	sto = file(plog,'w')
241	x = subprocess.Popen(' '.join(vcl),shell=True,stdout=sto,stderr=sto,cwd=cd)
242	retval = x.wait()
243	sto.close()
244	try:
245	lplog = file(plog,'r').readlines()
246	lplog = [x for x in lplog if x.find('Pruning SNP') == -1]
247	alog += lplog
248	alog.append('\n')
249	os.unlink(plog) # no longer needed
250	except:
251	alog.append('### %s Strange - no std out from plink when running command line\n%s\n' % (timenow(),' '.join(vcl)))
252	return alog
253
254	def readMap(mapfile=None,allmarkers=False,rsdict={},c=None,spos=None,epos=None):
255	"""abstract out - keeps reappearing
256	"""
257	mfile = open(mapfile, 'r')
258	markers = []
259	snpcols = {}
260	snpIndex = 0 # in case empty or comment lines
261	for rownum,row in enumerate(mfile):
262	line = row.strip()
263	if not line or line[0]=='#': continue
264	chrom, snp, genpos, abspos = line.split()[:4] # just in case more cols
265	try:
266	abspos = int(abspos)
267	except:
268	abspos = 0 # stupid framingham data grumble grumble
269	if allmarkers or rsdict.get(snp,None) or (chrom == c and (spos <= abspos <= epos)):
270	markers.append((chrom,abspos,snp)) # decorate for sort into genomic
271	snpcols[snp] = snpIndex # so we know which col to find genos for this marker
272	snpIndex += 1
273	markers.sort()
274	rslist = [x[2] for x in markers] # drop decoration
275	rsdict = dict(zip(rslist,rslist))
276	mfile.close()
277	return markers,snpcols,rslist,rsdict
278

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/rgenetics/rgutils.py

異なるフォーマットでダウンロード: