root/galaxy-central/tools/rgenetics/rgClean.py @ 2

リビジョン 2, 5.7 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1"""
2# galaxy tool xml files can define a galaxy supplied output filename
3# that must be passed to the tool and used to return output
4# here, the plink log file is copied to that file and removed
5# took a while to figure this out!
6# use exec_before_job to give files sensible names
7#
8# ross april 14 2007
9# plink cleanup script
10# ross lazarus March 2007 for camp illumina whole genome data
11# note problems with multiple commands being ignored - eg --freq --missing --mendel
12# only the first seems to get done...
13#
14##Summary statistics versus inclusion criteria
15##
16##Feature                         As summary statistic    As inclusion criteria
17##Missingness per individual      --missing               --mind N
18##Missingness per marker          --missing               --geno N       
19##Allele frequency                --freq                  --maf N
20##Hardy-Weinberg equilibrium      --hardy                 --hwe N
21##Mendel error rates              --mendel                --me N M
22#
23# call as plinkClean.py $i $o $mind $geno $hwe $maf $mef $mei $outfile
24# note plinkClean_code.py does some renaming before the job starts
25
26   
27    <command interpreter="python2.4">
28        rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind' '$geno' '$hwe' '$maf'
29        '$mef' '$mei' '$out_file1' '$out_file1.files_path' '$userId'
30 
31 
32"""
33import sys,shutil,os,subprocess, glob, string, tempfile, time
34from rgutils import galhtmlprefix, timenow, plinke
35prog = os.path.split(sys.argv[0])[-1]
36myversion = 'January 4 2010'
37verbose=False
38
39
40def fixoutaff(outpath='',newaff='1'):
41    """ quick way to create test data sets - set all aff to 1 or 2 for
42    some hapmap data and then merge
43    [rerla@beast galaxy]$ head tool-data/rg/library/pbed/affyHM_CEU.fam
44    1341 14 0 0 2 1
45    1341 2 13 14 2 1
46    1341 13 0 0 1 1
47    1340 9 0 0 1 1
48    1340 10 0 0 2 1
49    """
50    nchanged = 0
51    fam = '%s.fam' % outpath
52    famf = open(fam,'r')
53    fl = famf.readlines()
54    famf.close()
55    for i,row in enumerate(fl):
56        lrow = row.split()
57        if lrow[-1] <> newaff:
58            lrow[-1] = newaff
59            fl[i] = ' '.join(lrow)
60            fl[i] += '\n'
61            nchanged += 1
62    fo = open(fam,'w')
63    fo.write(''.join(fl))
64    fo.close()
65    return nchanged
66           
67
68
69def clean():
70    """
71    """
72    if len(sys.argv) < 16:
73        print >> sys.stdout, '## %s expected 12 params in sys.argv, got %d - %s' % (prog,len(sys.argv),sys.argv)
74        print >> sys.stdout, """this script will filter a linkage format ped
75        and map file containing genotypes. It takes 14 parameters - the plink --f parameter and"
76        a new filename root for the output clean data followed by the mind,geno,hwe,maf, mef and mei"
77        documented in the plink docs plus the file to be returned to Galaxy
78        called as:
79        <command interpreter="python">
80        rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind'
81        '$geno' '$hwe' '$maf' '$mef' '$mei' '$out_file1' '$out_file1.files_path'
82        '$relfilter' '$afffilter' '$sexfilter' '$fixaff'
83        </command>
84
85        """
86        sys.exit(1)
87    plog = []
88    inpath = sys.argv[1]
89    inbase = sys.argv[2]
90    killme = string.punctuation + string.whitespace
91    trantab = string.maketrans(killme,'_'*len(killme))
92    title = sys.argv[3].translate(trantab)
93    mind = sys.argv[4]
94    geno = sys.argv[5]
95    hwe = sys.argv[6]
96    maf = sys.argv[7]
97    me1 = sys.argv[8]
98    me2 = sys.argv[9]
99    outfname = sys.argv[10]
100    outfpath = sys.argv[11]
101    relf = sys.argv[12]
102    afff = sys.argv[13]
103    sexf = sys.argv[14]
104    fixaff = sys.argv[15]
105    output = os.path.join(outfpath,outfname)
106    outpath = os.path.join(outfpath,title)
107    outprunepath = os.path.join(outfpath,'ldprune_%s' % title)
108    try:
109      os.makedirs(outfpath)
110    except:
111      pass
112    bfile = os.path.join(inpath,inbase)
113    outf = file(outfname,'w')
114    vcl = [plinke,'--noweb','--bfile',bfile,'--make-bed','--out',
115          outpath,'--set-hh-missing','--mind',mind,
116          '--geno',geno,'--maf',maf,'--hwe',hwe,'--me',me1,me2]
117    # yes - the --me parameter takes 2 values - mendels per snp and per family
118    if relf == 'oo': # plink filters are what they leave...
119        vcl.append('--filter-nonfounders') # leave only offspring
120    elif relf == 'fo':
121        vcl.append('--filter-founders')
122    if afff == 'affonly':
123        vcl.append('--filter-controls')
124    elif relf == 'unaffonly':
125        vcl.append('--filter-cases')
126    if sexf == 'fsex':
127        vcl.append('--filter-females')
128    elif relf == 'msex':
129        vcl.append('--filter-males')       
130    p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath)
131    retval = p.wait()
132    plog.append('%s started, called as %s' % (prog,' '.join(sys.argv)))
133    outf.write(galhtmlprefix % prog)
134    outf.write('<ul>\n')
135    plogf = '%s.log' % os.path.join(outfpath,title)
136    try:
137        plogl = file(plogf,'r').readlines()
138        plog += [x.strip() for x in plogl]
139    except:
140        plog += ['###Cannot open plink log file %s' % plogf,]
141    # if fixaff, want to 'fix' the fam file
142    if fixaff <> '0':
143        nchanged = fixoutaff(outpath=outpath,newaff=fixaff)
144        plog += ['## fixaff was requested  %d subjects affection status changed to %s' % (nchanged,fixaff)]
145    pf = file(plogf,'w')
146    pf.write('\n'.join(plog))
147    pf.close()
148    globme = os.path.join(outfpath,'*')
149    flist = glob.glob(globme)
150    flist.sort()
151    for i, data in enumerate( flist ):
152        outf.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1]))
153    outf.write('</ul>\n')
154    outf.write("</ul></br></div></body></html>")
155    outf.close()
156
157
158if __name__ == "__main__":
159    clean()
160
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。