root/galaxy-central/tools/human_genome_variation/pagetag.py @ 3

リビジョン 2, 9.1 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

  • 属性 svn:executable の設定値 *
行番号 
1#!/usr/bin/env python
2
3"""
4This accepts as input a file of the following format:
5
6    Site   Sample   Allele1   Allele2
7
8for example:
9
10    000834   D001    G       G
11    000834   D002    G       G
12    000834   D003    G       G
13    000834   D004    G       G
14    000834   D005    N       N
15    000834   E001    G       G
16    000834   E002    G       G
17    000834   E003    G       G
18    000834   E004    G       G
19    000834   E005    G       G
20    000963   D001    T       T
21    000963   D002    T       T
22    000963   D003    T       T
23    000963   D004    T       T
24    000963   D005    N       N
25    000963   E001    T       T
26    000963   E002    N       N
27    000963   E003    G       T
28    000963   E004    G       G
29    000963   E005    G       T
30
31and a rsquare threshold and outputs two files:
32
33a) a file of input snps (one on each line). A SNP is identified by the "Site"
34column in the input file
35
36b) a file where each line has the following:
37    SNP     list
38where SNP is one of  the SNPs and the "list" is a comma separated list of SNPs
39that exceed the rsquare threshold with the first SNP.
40"""
41
42from sys import argv, stderr, exit
43from getopt import getopt, GetoptError
44
45__author__ = "Aakrosh Ratan"
46__email__  = "ratan@bx.psu.edu"
47
48# do we want the debug information to be printed?
49debug_flag = False
50
51# denote different combos of alleles in code
52HOMC  = str(1)
53HOMR  = str(2)
54HETE  = str(3)
55OTHER = str(4)
56
57indexcalculator = {(HOMC,HOMC) : 0,
58                   (HOMC,HOMR) : 1,
59                   (HOMC,HETE) : 2,
60                   (HOMR,HOMC) : 3,
61                   (HOMR,HOMR) : 4,
62                   (HOMR,HETE) : 5,
63                   (HETE,HOMC) : 6,
64                   (HETE,HOMR) : 7,
65                   (HETE,HETE) : 8}
66
67def read_inputfile(filename, samples):
68    input = {}
69
70    file = open(filename, "r")
71
72    for line in file:
73        position,sample,allele1,allele2 = line.split()
74
75        # if the user specified a list of samples, then only use those samples
76        if samples != None and sample not in samples: continue
77           
78        if position in input:
79            v = input[position]
80            v[sample] = (allele1,allele2)
81        else:
82            v = {sample : (allele1, allele2)}
83            input[position] = v
84
85    file.close()
86    return input
87
88def annotate_locus(input, minorallelefrequency, snpsfile):
89    locus = {}
90    for k,v in input.items():
91        genotypes = [x for x in v.values()]
92        alleles   = [y for x in genotypes for y in x]
93        alleleset = list(set(alleles))
94        alleleset = list(set(alleles) - set(["N","X"]))
95
96        if len(alleleset) == 2:
97            genotypevec = ""
98            num1 = len([x for x in alleles if x == alleleset[0]])
99            num2 = len([x for x in alleles if x == alleleset[1]])
100 
101            if num1 > num2:
102                major = alleleset[0]   
103                minor = alleleset[1]
104                minorfreq = (num2 * 1.0)/(num1 + num2)
105            else:
106                major = alleleset[1]
107                minor = alleleset[0]
108                minorfreq = (num1 * 1.0)/(num1 + num2)
109
110            if minorfreq < minorallelefrequency: continue
111               
112            for gen in genotypes:
113                if gen == (major,major):
114                    genotypevec += HOMC
115                elif gen == (minor,minor):
116                    genotypevec += HOMR
117                elif gen == (major, minor) or gen == (minor, major):
118                    genotypevec += HETE
119                else: 
120                    genotypevec += OTHER
121
122            locus[k] = genotypevec,minorfreq
123        elif len(alleleset) > 2:
124            print >> snpsfile, k
125    return locus
126
127def calculateLD(loci, rsqthreshold):
128    snps = list(loci)
129    rsquare = {}
130
131    for index,loc1 in enumerate(snps):
132        for loc2 in snps[index + 1:]:
133            matrix = [0]*9
134
135            vec1 = loci[loc1][0]
136            vec2 = loci[loc2][0]
137
138            for gen in zip(vec1,vec2):
139                if gen[0] == OTHER or gen[1] == OTHER: continue
140                matrix[indexcalculator[gen]] += 1
141
142            n   = sum(matrix)
143            x11 = 2*matrix[0] + matrix[2] + matrix[6]
144            x12 = 2*matrix[1] + matrix[2] + matrix[7]
145            x21 = 2*matrix[3] + matrix[6] + matrix[5]
146            x22 = 2*matrix[4] + matrix[6] + matrix[5]
147
148            p   = (x11 + x12 + matrix[8] * 1.0) / (2 * n)
149            q   = (x11 + x21 + matrix[8] * 1.0) / (2 * n)
150     
151            p11    = p * q
152
153            oldp11 = p11
154            range  = 0.0
155            converged         = False
156            convergentcounter = 0
157            if p11 > 0.0:
158                while converged == False and convergentcounter < 100:
159                    if (1.0 - p - q + p11) != 0.0 and oldp11 != 0.0:   
160                        num = matrix[8] * p11 * (1.0 - p - q + p11)
161                        den = p11 * (1.0 - p - q + p11) + (p - p11)*(q - p11)
162                        p11 = (x11 + (num/den))/(2.0*n)
163                        range = p11/oldp11
164                        if range >= 0.9999 and range <= 1.001:
165                            converged = True
166                        oldp11 = p11
167                        convergentcounter += 1
168                    else:
169                        converged = True
170 
171            dvalue = 0.0
172            if converged == True:
173                dvalue = p11 - (p * q)
174   
175            if dvalue != 0.0:
176                rsq = (dvalue**2)/(p*q*(1-p)*(1-q))
177                if rsq >= rsqthreshold:
178                    rsquare["%s %s" % (loc1,loc2)] = rsq
179
180    return rsquare
181
182def main(inputfile, snpsfile, neigborhoodfile, \
183         rsquare, minorallelefrequency, samples):
184    # read the input file
185    input = read_inputfile(inputfile, samples)     
186    print >> stderr, "Read %d locations" % len(input)
187
188    # open the snpsfile to print
189    file = open(snpsfile, "w")
190
191    # annotate the inputs, remove the abnormal loci (which do not have 2 alleles
192    # and add the major and minor allele to each loci
193    loci = annotate_locus(input, minorallelefrequency, file)
194    print >> stderr, "Read %d interesting locations" % len(loci)
195       
196    # print all the interesting loci as candidate snps
197    for k in loci.keys(): print >> file, k
198    file.close()
199    print >> stderr, "Finished creating the snpsfile"
200
201    # calculate the LD values and store it if it exceeds the threshold
202    lds = calculateLD(loci, rsquare)
203    print >> stderr, "Calculated all the LD values"
204
205    # create a list of SNPs   
206    snps   = {}
207    ldvals = {}
208    for k,v in lds.items():
209        s1,s2 = k.split()
210        if s1 in snps: snps[s1].append(s2)
211        else         : snps[s1] = [s2]   
212        if s2 in snps: snps[s2].append(s1)
213        else         : snps[s2] = [s1]   
214
215        if s1 in ldvals: ldvals[s1].append(str(v))
216        else           : ldvals[s1] = [str(v)]
217        if s2 in ldvals: ldvals[s2].append(str(v))
218        else           : ldvals[s2] = [str(v)]
219           
220    # print the snps to the output file
221    file = open(neigborhoodfile, "w")
222
223    for k,v in snps.items():
224        ldv = ldvals[k]
225        if debug_flag == True:
226            print >> file, "%s\t%s\t%s" % (k, ",".join(v), ",".join(ldv))
227        else:           
228            print >> file, "%s\t%s" % (k, ",".join(v))
229
230    file.close()
231 
232
233def read_list(filename):
234    file = open(filename, "r")
235    list = {}   
236
237    for line in file:
238        list[line.strip()] = 1
239
240    file.close()
241    return list
242
243def usage():
244    f = stderr
245    print >> f, "usage:"
246    print >> f, "pagetag [options] input.txt snps.txt neighborhood.txt"
247    print >> f, "where input.txt is the prettybase file"
248    print >> f, "where snps.txt is the first output file with the snps"
249    print >> f, "where neighborhood.txt is the output neighborhood file"
250    print >> f, "where the options are:"
251    print >> f, "-h,--help : print usage and quit"
252    print >> f, "-d,--debug: print debug information"
253    print >> f, "-r,--rsquare: the rsquare threshold (default : 0.64)"
254    print >> f, "-f,--freq : the minimum MAF required (default: 0.0)"
255    print >> f, "-s,--sample : a list of samples to be clustered"   
256
257if __name__ == "__main__":
258    try:
259        opts, args = getopt(argv[1:], "hds:r:f:",\
260                    ["help", "debug", "rsquare=","freq=", "sample="])
261    except GetoptError, err:
262        print str(err)
263        usage()
264        exit(2)
265
266    rsquare = 0.64
267    minorallelefrequency = 0.0
268    samples = None
269
270    for o, a in opts:
271        if o in ("-h", "--help"):
272            usage()
273            exit()
274        elif o in ("-d", "--debug"):
275            debug_flag = True
276        elif o in ("-r", "--rsquare"):
277            rsquare = float(a)
278        elif o in ("-f", "--freq"):
279            minorallelefrequency = float(a)
280        elif o in ("-s", "--sample"):
281            samples = read_list(a)
282        else:
283            assert False, "unhandled option"
284
285    if rsquare < 0.00 or rsquare > 1.00:
286        print >> stderr, "input value of rsquare should be in [0.00, 1.00]"
287        exit(3)
288
289    if minorallelefrequency < 0.0 or minorallelefrequency > 0.5:
290        print >> stderr, "input value of MAF should be (0.00,0.50]"
291        exit(4)
292
293    if len(args) != 3:
294        usage()
295        exit(5)
296
297    main(args[0], args[1], args[2], rsquare, minorallelefrequency, samples)
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。