root/galaxy-central/tools/rgenetics/rgLDIndep.py

リビジョン 2, 8.8 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1"""
2# oct 2009 - must make a map file in case later usage requires it...
3# galaxy tool xml files can define a galaxy supplied output filename
4# that must be passed to the tool and used to return output
5# here, the plink log file is copied to that file and removed
6# took a while to figure this out!
7# use exec_before_job to give files sensible names
8#
9# ross april 14 2007
10# plink cleanup script
11# ross lazarus March 2007 for camp illumina whole genome data
12# note problems with multiple commands being ignored - eg --freq --missing --mendel
13# only the first seems to get done...
14#
15##Summary statistics versus inclusion criteria
16##
17##Feature                         As summary statistic    As inclusion criteria
18##Missingness per individual      --missing               --mind N
19##Missingness per marker          --missing               --geno N       
20##Allele frequency                --freq                  --maf N
21##Hardy-Weinberg equilibrium      --hardy                 --hwe N
22##Mendel error rates              --mendel                --me N M
23#
24# this is rgLDIndep.py - main task is to decrease LD by filtering high LD pairs
25# remove that function from rgClean.py as it may not be needed.
26 
27"""
28import sys,shutil,os,subprocess, glob, string, tempfile, time
29from rgutils import plinke, timenow, galhtmlprefix
30
31prog = os.path.split(sys.argv[0])[-1]
32myversion = 'January 4 2010'
33
34
35def pruneld(plinktasks=[] ,cd='./',vclbase = []):
36    """
37    plink blathers when doing pruning - ignore
38    Linkage disequilibrium based SNP pruning
39    if a million snps in 3 billion base pairs, have mean 3k spacing
40    assume 40-60k of ld in ceu, a window of 120k width is about 40 snps
41    so lots more is perhaps less efficient - each window computational cost is
42    ON^2 unless the code is smart enough to avoid unecessary computation where
43    allele frequencies make it impossible to see ld > the r^2 cutoff threshold
44    So, do a window and move forward 20?
45    from the plink docs at http://pngu.mgh.harvard.edu/~purcell/plink/summary.shtml#prune
46   
47Sometimes it is useful to generate a pruned subset of SNPs that are in approximate linkage equilibrium with each other. This can be achieved via two commands: --indep which prunes based on the variance inflation factor (VIF), which recursively removes SNPs within a sliding window; second, --indep-pairwise which is similar, except it is based only on pairwise genotypic correlation.
48
49Hint The output of either of these commands is two lists of SNPs: those that are pruned out and those that are not. A separate command using the --extract or --exclude option is necessary to actually perform the pruning.
50
51The VIF pruning routine is performed:
52plink --file data --indep 50 5 2
53
54will create files
55
56     plink.prune.in
57     plink.prune.out
58
59Each is a simlpe list of SNP IDs; both these files can subsequently be specified as the argument for
60a --extract or --exclude command.
61
62The parameters for --indep are: window size in SNPs (e.g. 50), the number of SNPs to shift the
63window at each step (e.g. 5), the VIF threshold. The VIF is 1/(1-R^2) where R^2 is the multiple correlation coefficient for a SNP being regressed on all other SNPs simultaneously. That is, this considers the correlations between SNPs but also between linear combinations of SNPs. A VIF of 10 is often taken to represent near collinearity problems in standard multiple regression analyses (i.e. implies R^2 of 0.9). A VIF of 1 would imply that the SNP is completely independent of all other SNPs. Practically, values between 1.5 and 2 should probably be used; particularly in small samples, if this threshold is too low and/or the window size is too large, too many SNPs may be removed.
64
65The second procedure is performed:
66plink --file data --indep-pairwise 50 5 0.5
67
68This generates the same output files as the first version; the only difference is that a
69simple pairwise threshold is used. The first two parameters (50 and 5) are the same as above (window size and step); the third parameter represents the r^2 threshold. Note: this represents the pairwise SNP-SNP metric now, not the multiple correlation coefficient; also note, this is based on the genotypic correlation, i.e. it does not involve phasing.
70
71To give a concrete example: the command above that specifies 50 5 0.5 would a) consider a
72window of 50 SNPs, b) calculate LD between each pair of SNPs in the window, b) remove one of a pair of SNPs if the LD is greater than 0.5, c) shift the window 5 SNPs forward and repeat the procedure.
73
74To make a new, pruned file, then use something like (in this example, we also convert the
75standard PED fileset to a binary one):
76plink --file data --extract plink.prune.in --make-bed --out pruneddata
77    """
78    logres = ['## Rgenetics %s: http://rgenetics.org Galaxy Tools rgLDIndep.py Plink pruneLD runner\n' % myversion,]
79    for task in plinktasks: # each is a list
80        fplog,plog = tempfile.mkstemp()
81        sto = open(plog,'w') # to catch the blather
82        vcl = vclbase + task
83        s = '## ldindep now executing %s\n' % ' '.join(vcl)
84        print s
85        logres.append(s)
86        x = subprocess.Popen(' '.join(vcl),shell=True,stdout=sto,stderr=sto,cwd=cd)
87        retval = x.wait()
88        sto.close()
89        sto = open(plog,'r') # read
90        try:
91            lplog = sto.readlines()
92            lplog = [x for x in lplog if x.find('Pruning SNP') == -1]
93            logres += lplog
94            logres.append('\n')
95        except:
96            logres.append('### %s Strange - no std out from plink when running command line\n%s' % (timenow(),' '.join(vcl)))
97        sto.close()
98        os.unlink(plog) # no longer needed
99    return logres
100
101
102
103def clean():
104    """
105    """
106    if len(sys.argv) < 14:
107        print >> sys.stdout, '## %s expected 14 params in sys.argv, got %d - %s' % (prog,len(sys.argv),sys.argv)
108        print >> sys.stdout, """this script will filter a linkage format ped
109        and map file containing genotypes. It takes 14 parameters - the plink --f parameter and"
110        a new filename root for the output clean data followed by the mind,geno,hwe,maf, mef and mei"
111        documented in the plink docs plus the file to be returned to Galaxy
112        Called as:
113        <command interpreter="python">
114        rgLDIndep.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind'
115        '$geno' '$hwe' '$maf' '$mef' '$mei' '$out_file1'
116        '$out_file1.extra_files_path'  '$window' '$step' '$r2'
117        </command>
118        """
119        sys.exit(1)
120    plog = ['## Rgenetics: http://rgenetics.org Galaxy Tools rgLDIndep.py started %s\n' % timenow()]
121    inpath = sys.argv[1]
122    inbase = sys.argv[2]
123    killme = string.punctuation + string.whitespace
124    trantab = string.maketrans(killme,'_'*len(killme))
125    title = sys.argv[3].translate(trantab)
126    mind = sys.argv[4]
127    geno = sys.argv[5]
128    hwe = sys.argv[6]
129    maf = sys.argv[7]
130    me1 = sys.argv[8]
131    me2 = sys.argv[9]
132    outfname = sys.argv[10]
133    outfpath = sys.argv[11]
134    winsize = sys.argv[12]
135    step = sys.argv[13]
136    r2 = sys.argv[14]
137    output = os.path.join(outfpath,outfname)
138    outpath = os.path.join(outfpath,title)
139    outprunepath = os.path.join(outfpath,'ldprune_%s' % title)
140    try:
141      os.makedirs(outfpath)
142    except:
143      pass
144    bfile = os.path.join(inpath,inbase)
145    filterout = os.path.join(outpath,'filtered_%s' % inbase)
146    outf = file(outfname,'w')
147    outf.write(galhtmlprefix % prog)
148    ldin = bfile
149    plinktasks = [['--bfile',ldin,'--indep-pairwise %s %s %s' % (winsize,step,r2),'--out',outpath,
150    '--mind',mind,'--geno',geno,'--maf',maf,'--hwe',hwe,'--me',me1,me2,],
151    ['--bfile',ldin,'--extract %s.prune.in --make-bed --out %s' % (outpath,outpath)],
152    ['--bfile',outpath,'--recode --out',outpath]] # make map file - don't really need ped but...
153    # subset of ld independent markers for eigenstrat and other requirements
154    vclbase = [plinke,'--noweb']
155    prunelog = pruneld(plinktasks=plinktasks,cd=outfpath,vclbase = vclbase)
156    """This generates the same output files as the first version;
157    the only difference is that a simple pairwise threshold is used.
158    The first two parameters (50 and 5) are the same as above (window size and step);
159    the third parameter represents the r^2 threshold.
160    Note: this represents the pairwise SNP-SNP metric now, not the
161    multiple correlation coefficient; also note, this is based on the
162    genotypic correlation, i.e. it does not involve phasing.
163    """
164    plog += prunelog
165    flog = '%s.log' % outpath
166    flogf = open(flog,'w')
167    flogf.write(''.join(plog))
168    flogf.write('\n')
169    flogf.close()
170    globme = os.path.join(outfpath,'*')
171    flist = glob.glob(globme)
172    flist.sort()
173    for i, data in enumerate( flist ):
174        outf.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1]))
175    outf.write('</ol></div>\n')
176    outf.write("</div></body></html>")
177    outf.close()
178
179
180if __name__ == "__main__":
181    clean()
182
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。