1 | #!/usr/bin/env python |
---|
2 | |
---|
3 | """ |
---|
4 | This accepts as input a file of the following format: |
---|
5 | |
---|
6 | Site Sample Allele1 Allele2 |
---|
7 | |
---|
8 | for example: |
---|
9 | |
---|
10 | 000834 D001 G G |
---|
11 | 000834 D002 G G |
---|
12 | 000834 D003 G G |
---|
13 | 000834 D004 G G |
---|
14 | 000834 D005 N N |
---|
15 | 000834 E001 G G |
---|
16 | 000834 E002 G G |
---|
17 | 000834 E003 G G |
---|
18 | 000834 E004 G G |
---|
19 | 000834 E005 G G |
---|
20 | 000963 D001 T T |
---|
21 | 000963 D002 T T |
---|
22 | 000963 D003 T T |
---|
23 | 000963 D004 T T |
---|
24 | 000963 D005 N N |
---|
25 | 000963 E001 T T |
---|
26 | 000963 E002 N N |
---|
27 | 000963 E003 G T |
---|
28 | 000963 E004 G G |
---|
29 | 000963 E005 G T |
---|
30 | |
---|
31 | and a rsquare threshold and outputs two files: |
---|
32 | |
---|
33 | a) a file of input snps (one on each line). A SNP is identified by the "Site" |
---|
34 | column in the input file |
---|
35 | |
---|
36 | b) a file where each line has the following: |
---|
37 | SNP list |
---|
38 | where SNP is one of the SNPs and the "list" is a comma separated list of SNPs |
---|
39 | that exceed the rsquare threshold with the first SNP. |
---|
40 | """ |
---|
41 | |
---|
42 | from sys import argv, stderr, exit |
---|
43 | from getopt import getopt, GetoptError |
---|
44 | |
---|
45 | __author__ = "Aakrosh Ratan" |
---|
46 | __email__ = "ratan@bx.psu.edu" |
---|
47 | |
---|
48 | # do we want the debug information to be printed? |
---|
49 | debug_flag = False |
---|
50 | |
---|
51 | # denote different combos of alleles in code |
---|
52 | HOMC = str(1) |
---|
53 | HOMR = str(2) |
---|
54 | HETE = str(3) |
---|
55 | OTHER = str(4) |
---|
56 | |
---|
57 | indexcalculator = {(HOMC,HOMC) : 0, |
---|
58 | (HOMC,HOMR) : 1, |
---|
59 | (HOMC,HETE) : 2, |
---|
60 | (HOMR,HOMC) : 3, |
---|
61 | (HOMR,HOMR) : 4, |
---|
62 | (HOMR,HETE) : 5, |
---|
63 | (HETE,HOMC) : 6, |
---|
64 | (HETE,HOMR) : 7, |
---|
65 | (HETE,HETE) : 8} |
---|
66 | |
---|
67 | def read_inputfile(filename, samples): |
---|
68 | input = {} |
---|
69 | |
---|
70 | file = open(filename, "r") |
---|
71 | |
---|
72 | for line in file: |
---|
73 | position,sample,allele1,allele2 = line.split() |
---|
74 | |
---|
75 | # if the user specified a list of samples, then only use those samples |
---|
76 | if samples != None and sample not in samples: continue |
---|
77 | |
---|
78 | if position in input: |
---|
79 | v = input[position] |
---|
80 | v[sample] = (allele1,allele2) |
---|
81 | else: |
---|
82 | v = {sample : (allele1, allele2)} |
---|
83 | input[position] = v |
---|
84 | |
---|
85 | file.close() |
---|
86 | return input |
---|
87 | |
---|
88 | def annotate_locus(input, minorallelefrequency, snpsfile): |
---|
89 | locus = {} |
---|
90 | for k,v in input.items(): |
---|
91 | genotypes = [x for x in v.values()] |
---|
92 | alleles = [y for x in genotypes for y in x] |
---|
93 | alleleset = list(set(alleles)) |
---|
94 | alleleset = list(set(alleles) - set(["N","X"])) |
---|
95 | |
---|
96 | if len(alleleset) == 2: |
---|
97 | genotypevec = "" |
---|
98 | num1 = len([x for x in alleles if x == alleleset[0]]) |
---|
99 | num2 = len([x for x in alleles if x == alleleset[1]]) |
---|
100 | |
---|
101 | if num1 > num2: |
---|
102 | major = alleleset[0] |
---|
103 | minor = alleleset[1] |
---|
104 | minorfreq = (num2 * 1.0)/(num1 + num2) |
---|
105 | else: |
---|
106 | major = alleleset[1] |
---|
107 | minor = alleleset[0] |
---|
108 | minorfreq = (num1 * 1.0)/(num1 + num2) |
---|
109 | |
---|
110 | if minorfreq < minorallelefrequency: continue |
---|
111 | |
---|
112 | for gen in genotypes: |
---|
113 | if gen == (major,major): |
---|
114 | genotypevec += HOMC |
---|
115 | elif gen == (minor,minor): |
---|
116 | genotypevec += HOMR |
---|
117 | elif gen == (major, minor) or gen == (minor, major): |
---|
118 | genotypevec += HETE |
---|
119 | else: |
---|
120 | genotypevec += OTHER |
---|
121 | |
---|
122 | locus[k] = genotypevec,minorfreq |
---|
123 | elif len(alleleset) > 2: |
---|
124 | print >> snpsfile, k |
---|
125 | return locus |
---|
126 | |
---|
127 | def calculateLD(loci, rsqthreshold): |
---|
128 | snps = list(loci) |
---|
129 | rsquare = {} |
---|
130 | |
---|
131 | for index,loc1 in enumerate(snps): |
---|
132 | for loc2 in snps[index + 1:]: |
---|
133 | matrix = [0]*9 |
---|
134 | |
---|
135 | vec1 = loci[loc1][0] |
---|
136 | vec2 = loci[loc2][0] |
---|
137 | |
---|
138 | for gen in zip(vec1,vec2): |
---|
139 | if gen[0] == OTHER or gen[1] == OTHER: continue |
---|
140 | matrix[indexcalculator[gen]] += 1 |
---|
141 | |
---|
142 | n = sum(matrix) |
---|
143 | x11 = 2*matrix[0] + matrix[2] + matrix[6] |
---|
144 | x12 = 2*matrix[1] + matrix[2] + matrix[7] |
---|
145 | x21 = 2*matrix[3] + matrix[6] + matrix[5] |
---|
146 | x22 = 2*matrix[4] + matrix[6] + matrix[5] |
---|
147 | |
---|
148 | p = (x11 + x12 + matrix[8] * 1.0) / (2 * n) |
---|
149 | q = (x11 + x21 + matrix[8] * 1.0) / (2 * n) |
---|
150 | |
---|
151 | p11 = p * q |
---|
152 | |
---|
153 | oldp11 = p11 |
---|
154 | range = 0.0 |
---|
155 | converged = False |
---|
156 | convergentcounter = 0 |
---|
157 | if p11 > 0.0: |
---|
158 | while converged == False and convergentcounter < 100: |
---|
159 | if (1.0 - p - q + p11) != 0.0 and oldp11 != 0.0: |
---|
160 | num = matrix[8] * p11 * (1.0 - p - q + p11) |
---|
161 | den = p11 * (1.0 - p - q + p11) + (p - p11)*(q - p11) |
---|
162 | p11 = (x11 + (num/den))/(2.0*n) |
---|
163 | range = p11/oldp11 |
---|
164 | if range >= 0.9999 and range <= 1.001: |
---|
165 | converged = True |
---|
166 | oldp11 = p11 |
---|
167 | convergentcounter += 1 |
---|
168 | else: |
---|
169 | converged = True |
---|
170 | |
---|
171 | dvalue = 0.0 |
---|
172 | if converged == True: |
---|
173 | dvalue = p11 - (p * q) |
---|
174 | |
---|
175 | if dvalue != 0.0: |
---|
176 | rsq = (dvalue**2)/(p*q*(1-p)*(1-q)) |
---|
177 | if rsq >= rsqthreshold: |
---|
178 | rsquare["%s %s" % (loc1,loc2)] = rsq |
---|
179 | |
---|
180 | return rsquare |
---|
181 | |
---|
182 | def main(inputfile, snpsfile, neigborhoodfile, \ |
---|
183 | rsquare, minorallelefrequency, samples): |
---|
184 | # read the input file |
---|
185 | input = read_inputfile(inputfile, samples) |
---|
186 | print >> stderr, "Read %d locations" % len(input) |
---|
187 | |
---|
188 | # open the snpsfile to print |
---|
189 | file = open(snpsfile, "w") |
---|
190 | |
---|
191 | # annotate the inputs, remove the abnormal loci (which do not have 2 alleles |
---|
192 | # and add the major and minor allele to each loci |
---|
193 | loci = annotate_locus(input, minorallelefrequency, file) |
---|
194 | print >> stderr, "Read %d interesting locations" % len(loci) |
---|
195 | |
---|
196 | # print all the interesting loci as candidate snps |
---|
197 | for k in loci.keys(): print >> file, k |
---|
198 | file.close() |
---|
199 | print >> stderr, "Finished creating the snpsfile" |
---|
200 | |
---|
201 | # calculate the LD values and store it if it exceeds the threshold |
---|
202 | lds = calculateLD(loci, rsquare) |
---|
203 | print >> stderr, "Calculated all the LD values" |
---|
204 | |
---|
205 | # create a list of SNPs |
---|
206 | snps = {} |
---|
207 | ldvals = {} |
---|
208 | for k,v in lds.items(): |
---|
209 | s1,s2 = k.split() |
---|
210 | if s1 in snps: snps[s1].append(s2) |
---|
211 | else : snps[s1] = [s2] |
---|
212 | if s2 in snps: snps[s2].append(s1) |
---|
213 | else : snps[s2] = [s1] |
---|
214 | |
---|
215 | if s1 in ldvals: ldvals[s1].append(str(v)) |
---|
216 | else : ldvals[s1] = [str(v)] |
---|
217 | if s2 in ldvals: ldvals[s2].append(str(v)) |
---|
218 | else : ldvals[s2] = [str(v)] |
---|
219 | |
---|
220 | # print the snps to the output file |
---|
221 | file = open(neigborhoodfile, "w") |
---|
222 | |
---|
223 | for k,v in snps.items(): |
---|
224 | ldv = ldvals[k] |
---|
225 | if debug_flag == True: |
---|
226 | print >> file, "%s\t%s\t%s" % (k, ",".join(v), ",".join(ldv)) |
---|
227 | else: |
---|
228 | print >> file, "%s\t%s" % (k, ",".join(v)) |
---|
229 | |
---|
230 | file.close() |
---|
231 | |
---|
232 | |
---|
233 | def read_list(filename): |
---|
234 | file = open(filename, "r") |
---|
235 | list = {} |
---|
236 | |
---|
237 | for line in file: |
---|
238 | list[line.strip()] = 1 |
---|
239 | |
---|
240 | file.close() |
---|
241 | return list |
---|
242 | |
---|
243 | def usage(): |
---|
244 | f = stderr |
---|
245 | print >> f, "usage:" |
---|
246 | print >> f, "pagetag [options] input.txt snps.txt neighborhood.txt" |
---|
247 | print >> f, "where input.txt is the prettybase file" |
---|
248 | print >> f, "where snps.txt is the first output file with the snps" |
---|
249 | print >> f, "where neighborhood.txt is the output neighborhood file" |
---|
250 | print >> f, "where the options are:" |
---|
251 | print >> f, "-h,--help : print usage and quit" |
---|
252 | print >> f, "-d,--debug: print debug information" |
---|
253 | print >> f, "-r,--rsquare: the rsquare threshold (default : 0.64)" |
---|
254 | print >> f, "-f,--freq : the minimum MAF required (default: 0.0)" |
---|
255 | print >> f, "-s,--sample : a list of samples to be clustered" |
---|
256 | |
---|
257 | if __name__ == "__main__": |
---|
258 | try: |
---|
259 | opts, args = getopt(argv[1:], "hds:r:f:",\ |
---|
260 | ["help", "debug", "rsquare=","freq=", "sample="]) |
---|
261 | except GetoptError, err: |
---|
262 | print str(err) |
---|
263 | usage() |
---|
264 | exit(2) |
---|
265 | |
---|
266 | rsquare = 0.64 |
---|
267 | minorallelefrequency = 0.0 |
---|
268 | samples = None |
---|
269 | |
---|
270 | for o, a in opts: |
---|
271 | if o in ("-h", "--help"): |
---|
272 | usage() |
---|
273 | exit() |
---|
274 | elif o in ("-d", "--debug"): |
---|
275 | debug_flag = True |
---|
276 | elif o in ("-r", "--rsquare"): |
---|
277 | rsquare = float(a) |
---|
278 | elif o in ("-f", "--freq"): |
---|
279 | minorallelefrequency = float(a) |
---|
280 | elif o in ("-s", "--sample"): |
---|
281 | samples = read_list(a) |
---|
282 | else: |
---|
283 | assert False, "unhandled option" |
---|
284 | |
---|
285 | if rsquare < 0.00 or rsquare > 1.00: |
---|
286 | print >> stderr, "input value of rsquare should be in [0.00, 1.00]" |
---|
287 | exit(3) |
---|
288 | |
---|
289 | if minorallelefrequency < 0.0 or minorallelefrequency > 0.5: |
---|
290 | print >> stderr, "input value of MAF should be (0.00,0.50]" |
---|
291 | exit(4) |
---|
292 | |
---|
293 | if len(args) != 3: |
---|
294 | usage() |
---|
295 | exit(5) |
---|
296 | |
---|
297 | main(args[0], args[1], args[2], rsquare, minorallelefrequency, samples) |
---|