[2] | 1 | #!/usr/bin/env python |
---|
| 2 | |
---|
| 3 | """ |
---|
| 4 | This accepts as input a file of the following format: |
---|
| 5 | |
---|
| 6 | Site Sample Allele1 Allele2 |
---|
| 7 | |
---|
| 8 | for example: |
---|
| 9 | |
---|
| 10 | 000834 D001 G G |
---|
| 11 | 000834 D002 G G |
---|
| 12 | 000834 D003 G G |
---|
| 13 | 000834 D004 G G |
---|
| 14 | 000834 D005 N N |
---|
| 15 | 000834 E001 G G |
---|
| 16 | 000834 E002 G G |
---|
| 17 | 000834 E003 G G |
---|
| 18 | 000834 E004 G G |
---|
| 19 | 000834 E005 G G |
---|
| 20 | 000963 D001 T T |
---|
| 21 | 000963 D002 T T |
---|
| 22 | 000963 D003 T T |
---|
| 23 | 000963 D004 T T |
---|
| 24 | 000963 D005 N N |
---|
| 25 | 000963 E001 T T |
---|
| 26 | 000963 E002 N N |
---|
| 27 | 000963 E003 G T |
---|
| 28 | 000963 E004 G G |
---|
| 29 | 000963 E005 G T |
---|
| 30 | |
---|
| 31 | and a rsquare threshold and outputs two files: |
---|
| 32 | |
---|
| 33 | a) a file of input snps (one on each line). A SNP is identified by the "Site" |
---|
| 34 | column in the input file |
---|
| 35 | |
---|
| 36 | b) a file where each line has the following: |
---|
| 37 | SNP list |
---|
| 38 | where SNP is one of the SNPs and the "list" is a comma separated list of SNPs |
---|
| 39 | that exceed the rsquare threshold with the first SNP. |
---|
| 40 | """ |
---|
| 41 | |
---|
| 42 | from sys import argv, stderr, exit |
---|
| 43 | from getopt import getopt, GetoptError |
---|
| 44 | |
---|
| 45 | __author__ = "Aakrosh Ratan" |
---|
| 46 | __email__ = "ratan@bx.psu.edu" |
---|
| 47 | |
---|
| 48 | # do we want the debug information to be printed? |
---|
| 49 | debug_flag = False |
---|
| 50 | |
---|
| 51 | # denote different combos of alleles in code |
---|
| 52 | HOMC = str(1) |
---|
| 53 | HOMR = str(2) |
---|
| 54 | HETE = str(3) |
---|
| 55 | OTHER = str(4) |
---|
| 56 | |
---|
| 57 | indexcalculator = {(HOMC,HOMC) : 0, |
---|
| 58 | (HOMC,HOMR) : 1, |
---|
| 59 | (HOMC,HETE) : 2, |
---|
| 60 | (HOMR,HOMC) : 3, |
---|
| 61 | (HOMR,HOMR) : 4, |
---|
| 62 | (HOMR,HETE) : 5, |
---|
| 63 | (HETE,HOMC) : 6, |
---|
| 64 | (HETE,HOMR) : 7, |
---|
| 65 | (HETE,HETE) : 8} |
---|
| 66 | |
---|
| 67 | def read_inputfile(filename, samples): |
---|
| 68 | input = {} |
---|
| 69 | |
---|
| 70 | file = open(filename, "r") |
---|
| 71 | |
---|
| 72 | for line in file: |
---|
| 73 | position,sample,allele1,allele2 = line.split() |
---|
| 74 | |
---|
| 75 | # if the user specified a list of samples, then only use those samples |
---|
| 76 | if samples != None and sample not in samples: continue |
---|
| 77 | |
---|
| 78 | if position in input: |
---|
| 79 | v = input[position] |
---|
| 80 | v[sample] = (allele1,allele2) |
---|
| 81 | else: |
---|
| 82 | v = {sample : (allele1, allele2)} |
---|
| 83 | input[position] = v |
---|
| 84 | |
---|
| 85 | file.close() |
---|
| 86 | return input |
---|
| 87 | |
---|
| 88 | def annotate_locus(input, minorallelefrequency, snpsfile): |
---|
| 89 | locus = {} |
---|
| 90 | for k,v in input.items(): |
---|
| 91 | genotypes = [x for x in v.values()] |
---|
| 92 | alleles = [y for x in genotypes for y in x] |
---|
| 93 | alleleset = list(set(alleles)) |
---|
| 94 | alleleset = list(set(alleles) - set(["N","X"])) |
---|
| 95 | |
---|
| 96 | if len(alleleset) == 2: |
---|
| 97 | genotypevec = "" |
---|
| 98 | num1 = len([x for x in alleles if x == alleleset[0]]) |
---|
| 99 | num2 = len([x for x in alleles if x == alleleset[1]]) |
---|
| 100 | |
---|
| 101 | if num1 > num2: |
---|
| 102 | major = alleleset[0] |
---|
| 103 | minor = alleleset[1] |
---|
| 104 | minorfreq = (num2 * 1.0)/(num1 + num2) |
---|
| 105 | else: |
---|
| 106 | major = alleleset[1] |
---|
| 107 | minor = alleleset[0] |
---|
| 108 | minorfreq = (num1 * 1.0)/(num1 + num2) |
---|
| 109 | |
---|
| 110 | if minorfreq < minorallelefrequency: continue |
---|
| 111 | |
---|
| 112 | for gen in genotypes: |
---|
| 113 | if gen == (major,major): |
---|
| 114 | genotypevec += HOMC |
---|
| 115 | elif gen == (minor,minor): |
---|
| 116 | genotypevec += HOMR |
---|
| 117 | elif gen == (major, minor) or gen == (minor, major): |
---|
| 118 | genotypevec += HETE |
---|
| 119 | else: |
---|
| 120 | genotypevec += OTHER |
---|
| 121 | |
---|
| 122 | locus[k] = genotypevec,minorfreq |
---|
| 123 | elif len(alleleset) > 2: |
---|
| 124 | print >> snpsfile, k |
---|
| 125 | return locus |
---|
| 126 | |
---|
| 127 | def calculateLD(loci, rsqthreshold): |
---|
| 128 | snps = list(loci) |
---|
| 129 | rsquare = {} |
---|
| 130 | |
---|
| 131 | for index,loc1 in enumerate(snps): |
---|
| 132 | for loc2 in snps[index + 1:]: |
---|
| 133 | matrix = [0]*9 |
---|
| 134 | |
---|
| 135 | vec1 = loci[loc1][0] |
---|
| 136 | vec2 = loci[loc2][0] |
---|
| 137 | |
---|
| 138 | for gen in zip(vec1,vec2): |
---|
| 139 | if gen[0] == OTHER or gen[1] == OTHER: continue |
---|
| 140 | matrix[indexcalculator[gen]] += 1 |
---|
| 141 | |
---|
| 142 | n = sum(matrix) |
---|
| 143 | x11 = 2*matrix[0] + matrix[2] + matrix[6] |
---|
| 144 | x12 = 2*matrix[1] + matrix[2] + matrix[7] |
---|
| 145 | x21 = 2*matrix[3] + matrix[6] + matrix[5] |
---|
| 146 | x22 = 2*matrix[4] + matrix[6] + matrix[5] |
---|
| 147 | |
---|
| 148 | p = (x11 + x12 + matrix[8] * 1.0) / (2 * n) |
---|
| 149 | q = (x11 + x21 + matrix[8] * 1.0) / (2 * n) |
---|
| 150 | |
---|
| 151 | p11 = p * q |
---|
| 152 | |
---|
| 153 | oldp11 = p11 |
---|
| 154 | range = 0.0 |
---|
| 155 | converged = False |
---|
| 156 | convergentcounter = 0 |
---|
| 157 | if p11 > 0.0: |
---|
| 158 | while converged == False and convergentcounter < 100: |
---|
| 159 | if (1.0 - p - q + p11) != 0.0 and oldp11 != 0.0: |
---|
| 160 | num = matrix[8] * p11 * (1.0 - p - q + p11) |
---|
| 161 | den = p11 * (1.0 - p - q + p11) + (p - p11)*(q - p11) |
---|
| 162 | p11 = (x11 + (num/den))/(2.0*n) |
---|
| 163 | range = p11/oldp11 |
---|
| 164 | if range >= 0.9999 and range <= 1.001: |
---|
| 165 | converged = True |
---|
| 166 | oldp11 = p11 |
---|
| 167 | convergentcounter += 1 |
---|
| 168 | else: |
---|
| 169 | converged = True |
---|
| 170 | |
---|
| 171 | dvalue = 0.0 |
---|
| 172 | if converged == True: |
---|
| 173 | dvalue = p11 - (p * q) |
---|
| 174 | |
---|
| 175 | if dvalue != 0.0: |
---|
| 176 | rsq = (dvalue**2)/(p*q*(1-p)*(1-q)) |
---|
| 177 | if rsq >= rsqthreshold: |
---|
| 178 | rsquare["%s %s" % (loc1,loc2)] = rsq |
---|
| 179 | |
---|
| 180 | return rsquare |
---|
| 181 | |
---|
| 182 | def main(inputfile, snpsfile, neigborhoodfile, \ |
---|
| 183 | rsquare, minorallelefrequency, samples): |
---|
| 184 | # read the input file |
---|
| 185 | input = read_inputfile(inputfile, samples) |
---|
| 186 | print >> stderr, "Read %d locations" % len(input) |
---|
| 187 | |
---|
| 188 | # open the snpsfile to print |
---|
| 189 | file = open(snpsfile, "w") |
---|
| 190 | |
---|
| 191 | # annotate the inputs, remove the abnormal loci (which do not have 2 alleles |
---|
| 192 | # and add the major and minor allele to each loci |
---|
| 193 | loci = annotate_locus(input, minorallelefrequency, file) |
---|
| 194 | print >> stderr, "Read %d interesting locations" % len(loci) |
---|
| 195 | |
---|
| 196 | # print all the interesting loci as candidate snps |
---|
| 197 | for k in loci.keys(): print >> file, k |
---|
| 198 | file.close() |
---|
| 199 | print >> stderr, "Finished creating the snpsfile" |
---|
| 200 | |
---|
| 201 | # calculate the LD values and store it if it exceeds the threshold |
---|
| 202 | lds = calculateLD(loci, rsquare) |
---|
| 203 | print >> stderr, "Calculated all the LD values" |
---|
| 204 | |
---|
| 205 | # create a list of SNPs |
---|
| 206 | snps = {} |
---|
| 207 | ldvals = {} |
---|
| 208 | for k,v in lds.items(): |
---|
| 209 | s1,s2 = k.split() |
---|
| 210 | if s1 in snps: snps[s1].append(s2) |
---|
| 211 | else : snps[s1] = [s2] |
---|
| 212 | if s2 in snps: snps[s2].append(s1) |
---|
| 213 | else : snps[s2] = [s1] |
---|
| 214 | |
---|
| 215 | if s1 in ldvals: ldvals[s1].append(str(v)) |
---|
| 216 | else : ldvals[s1] = [str(v)] |
---|
| 217 | if s2 in ldvals: ldvals[s2].append(str(v)) |
---|
| 218 | else : ldvals[s2] = [str(v)] |
---|
| 219 | |
---|
| 220 | # print the snps to the output file |
---|
| 221 | file = open(neigborhoodfile, "w") |
---|
| 222 | |
---|
| 223 | for k,v in snps.items(): |
---|
| 224 | ldv = ldvals[k] |
---|
| 225 | if debug_flag == True: |
---|
| 226 | print >> file, "%s\t%s\t%s" % (k, ",".join(v), ",".join(ldv)) |
---|
| 227 | else: |
---|
| 228 | print >> file, "%s\t%s" % (k, ",".join(v)) |
---|
| 229 | |
---|
| 230 | file.close() |
---|
| 231 | |
---|
| 232 | |
---|
| 233 | def read_list(filename): |
---|
| 234 | file = open(filename, "r") |
---|
| 235 | list = {} |
---|
| 236 | |
---|
| 237 | for line in file: |
---|
| 238 | list[line.strip()] = 1 |
---|
| 239 | |
---|
| 240 | file.close() |
---|
| 241 | return list |
---|
| 242 | |
---|
| 243 | def usage(): |
---|
| 244 | f = stderr |
---|
| 245 | print >> f, "usage:" |
---|
| 246 | print >> f, "pagetag [options] input.txt snps.txt neighborhood.txt" |
---|
| 247 | print >> f, "where input.txt is the prettybase file" |
---|
| 248 | print >> f, "where snps.txt is the first output file with the snps" |
---|
| 249 | print >> f, "where neighborhood.txt is the output neighborhood file" |
---|
| 250 | print >> f, "where the options are:" |
---|
| 251 | print >> f, "-h,--help : print usage and quit" |
---|
| 252 | print >> f, "-d,--debug: print debug information" |
---|
| 253 | print >> f, "-r,--rsquare: the rsquare threshold (default : 0.64)" |
---|
| 254 | print >> f, "-f,--freq : the minimum MAF required (default: 0.0)" |
---|
| 255 | print >> f, "-s,--sample : a list of samples to be clustered" |
---|
| 256 | |
---|
| 257 | if __name__ == "__main__": |
---|
| 258 | try: |
---|
| 259 | opts, args = getopt(argv[1:], "hds:r:f:",\ |
---|
| 260 | ["help", "debug", "rsquare=","freq=", "sample="]) |
---|
| 261 | except GetoptError, err: |
---|
| 262 | print str(err) |
---|
| 263 | usage() |
---|
| 264 | exit(2) |
---|
| 265 | |
---|
| 266 | rsquare = 0.64 |
---|
| 267 | minorallelefrequency = 0.0 |
---|
| 268 | samples = None |
---|
| 269 | |
---|
| 270 | for o, a in opts: |
---|
| 271 | if o in ("-h", "--help"): |
---|
| 272 | usage() |
---|
| 273 | exit() |
---|
| 274 | elif o in ("-d", "--debug"): |
---|
| 275 | debug_flag = True |
---|
| 276 | elif o in ("-r", "--rsquare"): |
---|
| 277 | rsquare = float(a) |
---|
| 278 | elif o in ("-f", "--freq"): |
---|
| 279 | minorallelefrequency = float(a) |
---|
| 280 | elif o in ("-s", "--sample"): |
---|
| 281 | samples = read_list(a) |
---|
| 282 | else: |
---|
| 283 | assert False, "unhandled option" |
---|
| 284 | |
---|
| 285 | if rsquare < 0.00 or rsquare > 1.00: |
---|
| 286 | print >> stderr, "input value of rsquare should be in [0.00, 1.00]" |
---|
| 287 | exit(3) |
---|
| 288 | |
---|
| 289 | if minorallelefrequency < 0.0 or minorallelefrequency > 0.5: |
---|
| 290 | print >> stderr, "input value of MAF should be (0.00,0.50]" |
---|
| 291 | exit(4) |
---|
| 292 | |
---|
| 293 | if len(args) != 3: |
---|
| 294 | usage() |
---|
| 295 | exit(5) |
---|
| 296 | |
---|
| 297 | main(args[0], args[1], args[2], rsquare, minorallelefrequency, samples) |
---|