Context Navigation

rgLDIndep.py @ 3

リビジョン 2, 8.8 KB (コミッタ: hatakeyama, 15 年前)
import galaxy-central

行番号
1	"""
2	# oct 2009 - must make a map file in case later usage requires it...
3	# galaxy tool xml files can define a galaxy supplied output filename
4	# that must be passed to the tool and used to return output
5	# here, the plink log file is copied to that file and removed
6	# took a while to figure this out!
7	# use exec_before_job to give files sensible names
8	#
9	# ross april 14 2007
10	# plink cleanup script
11	# ross lazarus March 2007 for camp illumina whole genome data
12	# note problems with multiple commands being ignored - eg --freq --missing --mendel
13	# only the first seems to get done...
14	#
15	##Summary statistics versus inclusion criteria
16	##
17	##Feature As summary statistic As inclusion criteria
18	##Missingness per individual --missing --mind N
19	##Missingness per marker --missing --geno N
20	##Allele frequency --freq --maf N
21	##Hardy-Weinberg equilibrium --hardy --hwe N
22	##Mendel error rates --mendel --me N M
23	#
24	# this is rgLDIndep.py - main task is to decrease LD by filtering high LD pairs
25	# remove that function from rgClean.py as it may not be needed.
26
27	"""
28	import sys,shutil,os,subprocess, glob, string, tempfile, time
29	from rgutils import plinke, timenow, galhtmlprefix
30
31	prog = os.path.split(sys.argv[0])[-1]
32	myversion = 'January 4 2010'
33
34
35	def pruneld(plinktasks=[] ,cd='./',vclbase = []):
36	"""
37	plink blathers when doing pruning - ignore
38	Linkage disequilibrium based SNP pruning
39	if a million snps in 3 billion base pairs, have mean 3k spacing
40	assume 40-60k of ld in ceu, a window of 120k width is about 40 snps
41	so lots more is perhaps less efficient - each window computational cost is
42	ON^2 unless the code is smart enough to avoid unecessary computation where
43	allele frequencies make it impossible to see ld > the r^2 cutoff threshold
44	So, do a window and move forward 20?
45	from the plink docs at http://pngu.mgh.harvard.edu/~purcell/plink/summary.shtml#prune
46
47	Sometimes it is useful to generate a pruned subset of SNPs that are in approximate linkage equilibrium with each other. This can be achieved via two commands: --indep which prunes based on the variance inflation factor (VIF), which recursively removes SNPs within a sliding window; second, --indep-pairwise which is similar, except it is based only on pairwise genotypic correlation.
48
49	Hint The output of either of these commands is two lists of SNPs: those that are pruned out and those that are not. A separate command using the --extract or --exclude option is necessary to actually perform the pruning.
50
51	The VIF pruning routine is performed:
52	plink --file data --indep 50 5 2
53
54	will create files
55
56	plink.prune.in
57	plink.prune.out
58
59	Each is a simlpe list of SNP IDs; both these files can subsequently be specified as the argument for
60	a --extract or --exclude command.
61
62	The parameters for --indep are: window size in SNPs (e.g. 50), the number of SNPs to shift the
63	window at each step (e.g. 5), the VIF threshold. The VIF is 1/(1-R^2) where R^2 is the multiple correlation coefficient for a SNP being regressed on all other SNPs simultaneously. That is, this considers the correlations between SNPs but also between linear combinations of SNPs. A VIF of 10 is often taken to represent near collinearity problems in standard multiple regression analyses (i.e. implies R^2 of 0.9). A VIF of 1 would imply that the SNP is completely independent of all other SNPs. Practically, values between 1.5 and 2 should probably be used; particularly in small samples, if this threshold is too low and/or the window size is too large, too many SNPs may be removed.
64
65	The second procedure is performed:
66	plink --file data --indep-pairwise 50 5 0.5
67
68	This generates the same output files as the first version; the only difference is that a
69	simple pairwise threshold is used. The first two parameters (50 and 5) are the same as above (window size and step); the third parameter represents the r^2 threshold. Note: this represents the pairwise SNP-SNP metric now, not the multiple correlation coefficient; also note, this is based on the genotypic correlation, i.e. it does not involve phasing.
70
71	To give a concrete example: the command above that specifies 50 5 0.5 would a) consider a
72	window of 50 SNPs, b) calculate LD between each pair of SNPs in the window, b) remove one of a pair of SNPs if the LD is greater than 0.5, c) shift the window 5 SNPs forward and repeat the procedure.
73
74	To make a new, pruned file, then use something like (in this example, we also convert the
75	standard PED fileset to a binary one):
76	plink --file data --extract plink.prune.in --make-bed --out pruneddata
77	"""
78	logres = ['## Rgenetics %s: http://rgenetics.org Galaxy Tools rgLDIndep.py Plink pruneLD runner\n' % myversion,]
79	for task in plinktasks: # each is a list
80	fplog,plog = tempfile.mkstemp()
81	sto = open(plog,'w') # to catch the blather
82	vcl = vclbase + task
83	s = '## ldindep now executing %s\n' % ' '.join(vcl)
84	print s
85	logres.append(s)
86	x = subprocess.Popen(' '.join(vcl),shell=True,stdout=sto,stderr=sto,cwd=cd)
87	retval = x.wait()
88	sto.close()
89	sto = open(plog,'r') # read
90	try:
91	lplog = sto.readlines()
92	lplog = [x for x in lplog if x.find('Pruning SNP') == -1]
93	logres += lplog
94	logres.append('\n')
95	except:
96	logres.append('### %s Strange - no std out from plink when running command line\n%s' % (timenow(),' '.join(vcl)))
97	sto.close()
98	os.unlink(plog) # no longer needed
99	return logres
100
101
102
103	def clean():
104	"""
105	"""
106	if len(sys.argv) < 14:
107	print >> sys.stdout, '## %s expected 14 params in sys.argv, got %d - %s' % (prog,len(sys.argv),sys.argv)
108	print >> sys.stdout, """this script will filter a linkage format ped
109	and map file containing genotypes. It takes 14 parameters - the plink --f parameter and"
110	a new filename root for the output clean data followed by the mind,geno,hwe,maf, mef and mei"
111	documented in the plink docs plus the file to be returned to Galaxy
112	Called as:
113	<command interpreter="python">
114	rgLDIndep.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind'
115	'$geno' '$hwe' '$maf' '$mef' '$mei' '$out_file1'
116	'$out_file1.extra_files_path' '$window' '$step' '$r2'
117	</command>
118	"""
119	sys.exit(1)
120	plog = ['## Rgenetics: http://rgenetics.org Galaxy Tools rgLDIndep.py started %s\n' % timenow()]
121	inpath = sys.argv[1]
122	inbase = sys.argv[2]
123	killme = string.punctuation + string.whitespace
124	trantab = string.maketrans(killme,'_'*len(killme))
125	title = sys.argv[3].translate(trantab)
126	mind = sys.argv[4]
127	geno = sys.argv[5]
128	hwe = sys.argv[6]
129	maf = sys.argv[7]
130	me1 = sys.argv[8]
131	me2 = sys.argv[9]
132	outfname = sys.argv[10]
133	outfpath = sys.argv[11]
134	winsize = sys.argv[12]
135	step = sys.argv[13]
136	r2 = sys.argv[14]
137	output = os.path.join(outfpath,outfname)
138	outpath = os.path.join(outfpath,title)
139	outprunepath = os.path.join(outfpath,'ldprune_%s' % title)
140	try:
141	os.makedirs(outfpath)
142	except:
143	pass
144	bfile = os.path.join(inpath,inbase)
145	filterout = os.path.join(outpath,'filtered_%s' % inbase)
146	outf = file(outfname,'w')
147	outf.write(galhtmlprefix % prog)
148	ldin = bfile
149	plinktasks = [['--bfile',ldin,'--indep-pairwise %s %s %s' % (winsize,step,r2),'--out',outpath,
150	'--mind',mind,'--geno',geno,'--maf',maf,'--hwe',hwe,'--me',me1,me2,],
151	['--bfile',ldin,'--extract %s.prune.in --make-bed --out %s' % (outpath,outpath)],
152	['--bfile',outpath,'--recode --out',outpath]] # make map file - don't really need ped but...
153	# subset of ld independent markers for eigenstrat and other requirements
154	vclbase = [plinke,'--noweb']
155	prunelog = pruneld(plinktasks=plinktasks,cd=outfpath,vclbase = vclbase)
156	"""This generates the same output files as the first version;
157	the only difference is that a simple pairwise threshold is used.
158	The first two parameters (50 and 5) are the same as above (window size and step);
159	the third parameter represents the r^2 threshold.
160	Note: this represents the pairwise SNP-SNP metric now, not the
161	multiple correlation coefficient; also note, this is based on the
162	genotypic correlation, i.e. it does not involve phasing.
163	"""
164	plog += prunelog
165	flog = '%s.log' % outpath
166	flogf = open(flog,'w')
167	flogf.write(''.join(plog))
168	flogf.write('\n')
169	flogf.close()
170	globme = os.path.join(outfpath,'*')
171	flist = glob.glob(globme)
172	flist.sort()
173	for i, data in enumerate( flist ):
174	outf.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1]))
175	outf.write('</ol></div>\n')
176	outf.write("</div></body></html>")
177	outf.close()
178
179
180	if __name__ == "__main__":
181	clean()
182

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/tools/rgenetics/rgLDIndep.py @ 3

異なるフォーマットでダウンロード: