[2] | 1 | #!/usr/bin/env python2.4
|
---|
| 2 | """
|
---|
| 3 | """
|
---|
| 4 |
|
---|
| 5 | import optparse,os,subprocess,gzip,struct,time,commands
|
---|
| 6 | from array import array
|
---|
| 7 |
|
---|
| 8 | #from AIMS import util
|
---|
| 9 | #from pga import util as pgautil
|
---|
| 10 |
|
---|
__FILE_ID__ = '$Id: plinkbinJZ.py,v 1.14 2009/07/13 20:16:50 rejpz Exp $'

# When true, the parsers and writers print progress messages
VERBOSE = True

# Allele symbols that are treated as "no call"
MISSING_ALLELES = set(['N', '0', '.', '-', ''])

# Chromosomes 1..22, accepted either as integers or as strings
AUTOSOMES = set(range(1, 23)) | set([str(c) for c in range(1, 23)])

# Header bytes of a *.bed file, spelled as bit strings.  LPed.writebed
# reverses each string before converting it to the byte that is written.
MAGIC_BYTE1 = '00110110'
MAGIC_BYTE2 = '11011000'
FORMAT_SNP_MAJOR_BYTE = '10000000'
FORMAT_IND_MAJOR_BYTE = '00000000'

# The same header bytes after decoding through INT_TO_GCODE (four 2-bit
# codes per byte); Bed.parse compares the decoded header against these.
MAGIC1 = (0, 3, 1, 2)
MAGIC2 = (3, 1, 2, 0)
FORMAT_SNP_MAJOR = (2, 0, 0, 0)
FORMAT_IND_MAJOR = (0, 0, 0, 0)

# Number of header bytes that precede the genotype data
HEADER_LENGTH = 3

# Integer genotype codes ("gcodes") held in the genotype arrays
HOM0 = 3
HOM1 = 0
MISS = 2
HET = 1

# Intermediate genotype tuples produced while reading a *.ped file
HOM0_GENO = (0, 0)
HOM1_GENO = (1, 1)
HET_GENO = (0, 1)
MISS_GENO = (-9, -9)

# Genotype tuple -> gcode
GENO_TO_GCODE = dict([
    (HOM0_GENO, HOM0),
    (HET_GENO, HET),
    (HOM1_GENO, HOM1),
    (MISS_GENO, MISS),
])

# Non-numeric chromosome labels and the numeric codes they are stored as
CHROM_REPLACE = dict([
    ('X', '23'),
    ('Y', '24'),
    ('XY', '25'),
    ('MT', '26'),
    ('M', '26'),
])
|
---|
| 52 |
|
---|
| 53 | MAP_LINE_EXCEPTION_TEXT = """
|
---|
| 54 | One or more lines in the *.map file has only three fields.
|
---|
| 55 | The line was:
|
---|
| 56 |
|
---|
| 57 | %s
|
---|
| 58 |
|
---|
| 59 | If you are running rgGRR through EPMP, this is usually a
|
---|
| 60 | sign that you are using an old version of the map file.
|
---|
| 61 | You can correct the problem by re-running Subject QC. If
|
---|
| 62 | you have already tried this, please contact the developers,
|
---|
| 63 | or file a bug.
|
---|
| 64 | """
|
---|
| 65 |
|
---|
# Byte value (0..255) -> array of the four gcodes packed in that byte.
# The code at position p comes from bits 2p and 2p+1 of the byte (position 0
# is the two low-order bits); the two bits are swapped to give the gcode,
# i.e. bit pair 0 -> 0, 1 -> 2, 2 -> 1, 3 -> 3.
_BITPAIR_TO_GCODE = (0, 2, 1, 3)

INT_TO_GCODE = dict([
    (byte, array('i', [_BITPAIR_TO_GCODE[(byte >> shift) & 3]
                       for shift in (0, 2, 4, 6)]))
    for byte in range(256)
])

# Inverse table: tuple of four gcodes -> byte value
GCODE_TO_INT = dict([(tuple(quartet), byte)
                     for (byte, quartet) in INT_TO_GCODE.items()])
|
---|
| 134 |
|
---|
| 135 | ### Exceptions
|
---|
class DuplicateMarkerInMapFile(Exception):
    """ Raised when the same marker name occurs twice in a map file
    """
|
---|
class MapLineTooShort(Exception):
    """ Raised when a map file line has fewer than four fields
    """
|
---|
class ThirdAllele(Exception):
    """ Raised when a marker shows an allele beyond the two already seen
    """
|
---|
class PedError(Exception):
    """ Raised when a genotype in a ped file cannot be coded
    """
|
---|
class BadMagic(Exception):
    """ Raised when either of the first two (MAGIC) bytes of a bed file
        differs from the expected value
    """
|
---|
class BedError(Exception):
    """ Raised when a bed file cannot be parsed
    """
|
---|
class UnknownGenocode(Exception):
    """ Raised for a 2-bit genotype code that cannot be interpreted
        (it is unclear whether this can actually happen)
    """
|
---|
class UnknownGeno(Exception):
    """ Raised when a genotype tuple has no entry in GENO_TO_GCODE
    """
|
---|
| 153 |
|
---|
| 154 | ### Utility functions
|
---|
| 155 |
|
---|
def timenow():
    """ Return the current local time as a 'dd/mm/YYYY HH:MM:SS' string
    """
    now = time.localtime(time.time())
    return time.strftime('%d/%m/%Y %H:%M:%S', now)
|
---|
| 160 |
|
---|
def ceiling(n, k):
    ''' Round n up to a multiple of k: return the least multiple of k that
        is greater than or equal to n (n itself when it already is one)
    '''
    remainder = n % k
    if remainder:
        return n + k - remainder
    return n
|
---|
| 169 |
|
---|
def nbytes(n):
    ''' Return the number of bytes required for n subjects

        Each subject takes 2 bits, so one byte holds four subjects and a
        partly filled final byte still counts as a whole byte.  Floor
        division keeps the result an integer even where "/" is true
        division; callers use it for list sizes and file offsets.
    '''
    # ceil(n / 4) in pure integer arithmetic
    return (n + 3) // 4
|
---|
| 174 |
|
---|
| 175 | ### Primary module functionality
|
---|
class LPed:
    """ The uber-class for processing the Linkage-format *.ped/*.map files

        Ties together a Map (markers) and a Ped (subjects and genotypes)
        that share the path prefix `base`.  After parse() the data can be
        queried, or written out as a binary *.bed/*.bim/*.fam set.
    """
    def __init__(self, base):
        """ `base` is the path prefix shared by '<base>.ped' and
            '<base>.map'.  Nothing is read until parse() is called.
        """
        self.base = base
        self._ped = Ped('%s.ped' % (self.base))
        self._map = Map('%s.map' % (self.base))

        self._markers = {}
        self._ordered_markers = []
        self._marker_allele_lookup = {}
        self._autosomal_indices = set()

        self._subjects = {}
        self._ordered_subjects = []

        self._genotypes = []

    def parse(self):
        """ Parse the map file, then the ped file, and keep references to
            the parsed structures on this object
        """
        if VERBOSE: print('plinkbinJZ: Analysis started: %s' % (timenow()))
        self._map.parse()
        self._markers = self._map._markers
        self._ordered_markers = self._map._ordered_markers
        self._autosomal_indices = self._map._autosomal_indices

        self._ped.parse(self._ordered_markers)
        self._subjects = self._ped._subjects
        self._ordered_subjects = self._ped._ordered_subjects
        self._genotypes = self._ped._genotypes
        self._marker_allele_lookup = self._ped._marker_allele_lookup

        ### Adjust self._markers based on the allele information
        ### we got from parsing the ped file
        for m, name in enumerate(self._ordered_markers):
            a1, a2 = self._marker_allele_lookup[m][HET]
            self._markers[name][-2] = a1
            self._markers[name][-1] = a2
        if VERBOSE: print('plinkbinJZ: Analysis finished: %s' % (timenow()))

    def getSubjectInfo(self, fid, oiid):
        """ Return the subject tuple stored under the key (fid, oiid).
            Raises KeyError for an unknown subject.
        """
        # Subjects live in self._subjects; no _subject_info attribute is
        # ever created on this class
        return self._subjects[(fid, oiid)]

    def getSubjectInfoByLine(self, line):
        """ Return the subject tuple of the line-th subject read (0-based)
        """
        return self._subjects[self._ordered_subjects[line]]

    def getGenotypesByIndices(self, s, mlist, format):
        """ needed for grr if lped - deprecated but..

            Return the genotypes of subject index `s` for the marker
            indices in `mlist`.  The result is ordered by ascending marker
            index (not by the order of `mlist`), duplicates in `mlist` are
            collapsed and indices without a genotype row are ignored.

            format == 'raw': array of integer gcodes
            format == 'ref': array with HOM0 -> 3, HET -> 2, HOM1 -> 1 and
                             any other gcode -> 0
            otherwise:       list of allele pairs looked up per marker
        """
        wanted = set(mlist)
        selected = []          # marker index behind each raw_array entry
        raw_array = array('i')
        for m, row in enumerate(self._genotypes):
            if m in wanted:
                selected.append(m)
                raw_array.append(row[s])

        if format == 'raw':
            return raw_array
        elif format == 'ref':
            nref = {HOM0: 3, HET: 2, HOM1: 1}
            return array('i', [nref.get(gcode, 0) for gcode in raw_array])
        else:
            # Look alleles up by the real marker index, not by the
            # position of the genotype inside raw_array
            return [self._marker_allele_lookup[m][gcode]
                    for m, gcode in zip(selected, raw_array)]

    def writebed(self, base):
        """ Write the parsed data as '<base>.fam', '<base>.bim' and a
            SNP-major '<base>.bed'.  The in-memory genotype arrays are left
            unchanged.  Raises KeyError (after printing the marker index,
            byte index and offending codes) if a group of four gcodes has
            no byte encoding.
        """
        dst_name = '%s.fam' % (base)
        print('Writing pedigree information to [ %s ]' % (dst_name))
        dst = open(dst_name, 'w')
        try:
            for skey in self._ordered_subjects:
                (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid) = self._subjects[skey]
                dst.write('%s %s %s %s %s %s\n' % (fid, iid, did, mid, sex, phe))
        finally:
            dst.close()

        dst_name = '%s.bim' % (base)
        print('Writing map (extended format) information to [ %s ]' % (dst_name))
        dst = open(dst_name, 'w')
        try:
            for marker in self._ordered_markers:
                chrom, name, genpos, abspos, a1, a2 = self._markers[marker]
                dst.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (chrom, name, genpos, abspos, a1, a2))
        finally:
            dst.close()

        bed_name = '%s.bed' % (base)
        print('Writing genotype bitfile to [ %s ]' % (bed_name))
        print('Using (default) SNP-major mode')

        ### Calculate how many "pad" genotypes we should add after the last
        ### subject so that every marker fills a whole number of bytes
        nsubjects = len(self._ordered_subjects)
        pad = array('i', [0] * (-nsubjects % 4))

        # The genotype file is binary data, so open it in binary mode
        bed = open(bed_name, 'wb')
        try:
            ### Write the 3 header bytes
            header = [int(''.join(reversed(bits)), 2)
                      for bits in (MAGIC_BYTE1, MAGIC_BYTE2, FORMAT_SNP_MAJOR_BYTE)]
            bed.write(struct.pack('BBB', *header))

            ### And now write genotypes to the file
            for m, marker in enumerate(self._ordered_markers):
                # Concatenation builds a new array, so the stored genotypes
                # are not grown by the padding
                geno = self._genotypes[m] + pad
                codes = []
                for b in range(len(geno) // 4):
                    gcode = tuple(geno[b*4:b*4 + 4])
                    try:
                        codes.append(GCODE_TO_INT[gcode])
                    except KeyError:
                        print('%s %s %s' % (m, b, gcode))
                        raise
                bed.write(struct.pack('%dB' % len(codes), *codes))
        finally:
            bed.close()

    def autosomal_indices(self):
        """ Return the indices of markers in this ped/map that are autosomal.
            This is used by rgGRR so that it can select a random set of markers
            from the autosomes (sex chroms screw up the plot)
        """
        return self._autosomal_indices
|
---|
| 312 |
|
---|
class Ped:
    """ Parser for a Linkage-format *.ped file (plain or gzipped)

        After parse():
          _subjects              (fid, iid) -> (fid, iid, did, mid, sex, phe,
                                 sid, d_sid, m_sid)
          _ordered_subjects      subject keys in file order
          _genotypes             one array of gcodes per marker, indexed by
                                 subject position
          _marker_allele_lookup  marker index -> {gcode: (allele, allele)}
    """
    def __init__(self, path):
        self.path = path
        self._subjects = {}
        self._ordered_subjects = []
        self._genotypes = []
        self._marker_allele_lookup = {}

    def _open(self, path):
        """ Open path for reading, through gzip when it ends in '.gz'
        """
        if path.endswith('.gz'):
            return gzip.open(path, 'r')
        return open(path, 'r')

    def lineCount(self,infile):
        """ Count the number of lines in a file.  The file is read the same
            way parse() reads it, so gzipped files and a final line without
            a trailing newline are counted correctly.
        """
        fh = self._open(infile)
        try:
            count = 0
            for _ in fh:
                count += 1
        finally:
            fh.close()
        return count

    def parse(self, markers):
        """ Parse a given file -- this needs to be memory-efficient so that large
            files can be parsed (~1 million markers on ~5000 subjects?).  It
            should also be fast, if possible.

            `markers` is the ordered list of marker names; the genotype
            columns are taken to be in that order, two alleles per marker.
            Raises ThirdAllele when a marker shows more than two distinct
            alleles and PedError when a line is malformed.

            NOTE(review): markers that Map.parse skipped are not present in
            `markers`, but their columns are presumably still in the ped
            file, which would shift every later column -- confirm.
        """

        ### Find out how many lines are in the file so we can ...
        nlines = self.lineCount(self.path)
        ### ... Pre-allocate the genotype arrays
        nmarkers = len(markers)
        _marker_alleles = [['0', '0'] for _ in markers]
        self._genotypes = [array('i', [-1]*nlines) for _ in markers]
        self._subjects = {}
        self._ordered_subjects = []

        pfile = self._open(self.path)
        try:
            for line in pfile:
                line = line.strip()
                if not line:
                    continue

                fields = line.split(None, 6)
                if len(fields) < 6:
                    raise PedError('Expected at least 6 pedigree fields, got: %s' % (line))
                fid, iid, did, mid, sex, phe = fields[:6]
                sid = iid.split('.')[0]
                d_sid = did.split('.')[0]
                m_sid = mid.split('.')[0]

                # Index genotypes by subject position rather than by file
                # line, so blank lines cannot leave holes in the arrays
                s = len(self._ordered_subjects)
                skey = (fid, iid)
                self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid)
                self._ordered_subjects.append(skey)

                genotypes = ''.join(fields[6:]).split()
                if len(genotypes) < 2*nmarkers:
                    raise PedError('Subject %s has %s alleles, expected %s'
                                   % (str(skey), len(genotypes), 2*nmarkers))

                for m, seen in enumerate(_marker_alleles):
                    a1 = genotypes[2*m]      # Alleles for subject s, marker m
                    a2 = genotypes[2*m + 1]
                    s1, s2 = seen            # Alleles seen for marker m

                    ### FIXME: I think this can still be faster, and simpler to read
                    # Two pieces of logic intertwined here: first, we need to code
                    # this genotype as HOM0, HOM1, HET or MISS.  Second, we need to
                    # keep an ongoing record of the genotypes seen for this marker
                    if a1 == a2:
                        if a1 in MISSING_ALLELES:
                            geno = MISS_GENO
                        else:
                            if s1 == '0':
                                seen[0] = a1
                            elif s1 == a1 or s2 == a2:
                                pass
                            elif s2 == '0':
                                seen[1] = a1
                            else:
                                raise ThirdAllele('a1=a2=%s, seen=%s?' % (a1, str(seen)))

                            if a1 == seen[0]:
                                geno = HOM0_GENO
                            elif a1 == seen[1]:
                                geno = HOM1_GENO
                            else:
                                raise PedError('Cannot assign geno for a1=a2=%s from seen=%s' % (a1, str(seen)))
                    elif a1 in MISSING_ALLELES or a2 in MISSING_ALLELES:
                        geno = MISS_GENO
                    else:
                        geno = HET_GENO
                        if s1 == '0':
                            seen[0] = a1
                            seen[1] = a2
                        elif s2 == '0':
                            if s1 == a1:
                                seen[1] = a2
                            elif s1 == a2:
                                seen[1] = a1
                            else:
                                raise ThirdAllele('a1=%s, a2=%s, seen=%s?' % (a1, a2, str(seen)))
                        else:
                            if sorted(seen) != sorted((a1, a2)):
                                raise ThirdAllele('a1=%s, a2=%s, seen=%s?' % (a1, a2, str(seen)))

                    gcode = GENO_TO_GCODE.get(geno, None)
                    if gcode is None:
                        raise UnknownGeno(str(geno))
                    self._genotypes[m][s] = gcode
        finally:
            pfile.close()

        # Drop the slots that were pre-allocated for blank lines
        nread = len(self._ordered_subjects)
        if nread < nlines:
            for row in self._genotypes:
                del row[nread:]

        # Build the _marker_allele_lookup table.  The loop above codes a
        # homozygote for the first allele seen as HOM0 and for the second
        # as HOM1, so the table has to decode them the same way.  The HET
        # pair is (second, first); LPed.parse copies it into the marker's
        # two allele columns.
        for m, (first_seen, second_seen) in enumerate(_marker_alleles):
            self._marker_allele_lookup[m] = {
                HOM0: (first_seen, first_seen),
                HOM1: (second_seen, second_seen),
                HET : (second_seen, first_seen),
                MISS: ('0','0'),
                }

        if VERBOSE: print('%s(%s) individuals read from [ %s ]' % (len(self._subjects), nlines, self.path))
|
---|
| 431 |
|
---|
class Map:
    """ Parser for a Linkage-format *.map file (plain or gzipped)

        After parse():
          _markers            name -> [chrom, name, genpos, abspos, None, None]
          _ordered_markers    marker names in file order
          _autosomal_indices  positions in _ordered_markers of the markers
                              whose chromosome is in AUTOSOMES
    """
    def __init__(self, path=None):
        self.path = path
        self._markers = {}
        self._ordered_markers = []
        self._autosomal_indices = set()

    def __len__(self):
        return len(self._markers)

    def parse(self):
        """ Parse a Linkage-format map file

            Blank lines and markers with a negative position are skipped.
            Raises MapLineTooShort for a line with fewer than four fields
            and DuplicateMarkerInMapFile for a repeated marker name.
        """
        if self.path.endswith('.gz'):
            fh = gzip.open(self.path, 'r')
        else:
            fh = open(self.path, 'r')

        nlines = 0      # non-blank lines seen, reported in the summary
        try:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                nlines += 1

                fields = line.split()
                if len(fields) < 4:
                    # The message template has a single %s placeholder
                    raise MapLineTooShort(MAP_LINE_EXCEPTION_TEXT % (str(line),))
                else:
                    chrom, name, genpos, abspos = fields
                if name in self._markers:
                    raise DuplicateMarkerInMapFile('Marker %s was found twice in map file %s' % (name, self.path))
                abspos = int(abspos)
                if abspos < 0:
                    continue
                if chrom in AUTOSOMES:
                    # Record the position the marker is about to take in
                    # _ordered_markers; the line number drifts away from it
                    # as soon as a line has been skipped
                    self._autosomal_indices.add(len(self._ordered_markers))
                chrom = CHROM_REPLACE.get(chrom, chrom)
                self._markers[name] = [chrom, name, genpos, abspos, None, None]
                self._ordered_markers.append(name)
        finally:
            fh.close()
        if VERBOSE: print('%s (of %s) markers to be included from [ %s ]' % (len(self._ordered_markers), nlines, self.path))
|
---|
| 472 |
|
---|
| 473 | class BPed:
|
---|
| 474 | """ The uber-class for processing Plink's Binary Ped file format *.bed/*.bim/*.fam
|
---|
| 475 | """
|
---|
| 476 | def __init__(self, base):
|
---|
| 477 | self.base = base
|
---|
| 478 | self._bed = Bed('%s.bed' % (self.base))
|
---|
| 479 | self._bim = Bim('%s.bim' % (self.base))
|
---|
| 480 | self._fam = Fam('%s.fam' % (self.base))
|
---|
| 481 |
|
---|
| 482 | self._markers = {}
|
---|
| 483 | self._ordered_markers = []
|
---|
| 484 | self._marker_allele_lookup = {}
|
---|
| 485 | self._autosomal_indices = set()
|
---|
| 486 |
|
---|
| 487 | self._subjects = {}
|
---|
| 488 | self._ordered_subjects = []
|
---|
| 489 |
|
---|
| 490 | self._genotypes = []
|
---|
| 491 |
|
---|
| 492 | def parse(self, quick=False):
|
---|
| 493 | """
|
---|
| 494 | """
|
---|
| 495 | self._quick = quick
|
---|
| 496 |
|
---|
| 497 | self._bim.parse()
|
---|
| 498 | self._markers = self._bim._markers
|
---|
| 499 | self._ordered_markers = self._bim._ordered_markers
|
---|
| 500 | self._marker_allele_lookup = self._bim._marker_allele_lookup
|
---|
| 501 | self._autosomal_indices = self._bim._autosomal_indices
|
---|
| 502 |
|
---|
| 503 | self._fam.parse()
|
---|
| 504 | self._subjects = self._fam._subjects
|
---|
| 505 | self._ordered_subjects = self._fam._ordered_subjects
|
---|
| 506 |
|
---|
| 507 | self._bed.parse(self._ordered_subjects, self._ordered_markers, quick=quick)
|
---|
| 508 | self._bedf = self._bed._fh
|
---|
| 509 | self._genotypes = self._bed._genotypes
|
---|
| 510 | self.nsubjects = len(self._ordered_subjects)
|
---|
| 511 | self.nmarkers = len(self._ordered_markers)
|
---|
| 512 | self._bytes_per_marker = nbytes(self.nsubjects)
|
---|
| 513 |
|
---|
| 514 | def writeped(self, path=None):
|
---|
| 515 | """
|
---|
| 516 | """
|
---|
| 517 | path = self.path = path or self.path
|
---|
| 518 |
|
---|
| 519 | map_name = self.path.replace('.bed', '.map')
|
---|
| 520 | print 'Writing map file [ %s ]' % (map_name)
|
---|
| 521 | dst = open(map_name, 'w')
|
---|
| 522 | for m in self._ordered_markers:
|
---|
| 523 | chrom, snp, genpos, abspos, a1, a2 = self._markers[m]
|
---|
| 524 | dst.write('%s\t%s\t%s\t%s\n' % (chrom, snp, genpos, abspos))
|
---|
| 525 | dst.close()
|
---|
| 526 |
|
---|
| 527 | ped_name = self.path.replace('.bed', '.ped')
|
---|
| 528 | print 'Writing ped file [ %s ]' % (ped_name)
|
---|
| 529 | ped = open(ped_name, 'w')
|
---|
| 530 | firstyikes = False
|
---|
| 531 | for s, skey in enumerate(self._ordered_subjects):
|
---|
| 532 | idx = s*2
|
---|
| 533 | (fid, iid, did, mid, sex, phe, oiid, odid, omid) = self._subjects[skey]
|
---|
| 534 | ped.write('%s %s %s %s %s %s' % (fid, iid, odid, omid, sex, phe))
|
---|
| 535 | genotypes_for_subject = self.getGenotypesForSubject(s)
|
---|
| 536 | for m, snp in enumerate(self._ordered_markers):
|
---|
| 537 | #a1, a2 = self.getGenotypeByIndices(s, m)
|
---|
| 538 | a1,a2 = genotypes_for_subject[m]
|
---|
| 539 | ped.write(' %s %s' % (a1, a2))
|
---|
| 540 | ped.write('\n')
|
---|
| 541 | ped.close()
|
---|
| 542 |
|
---|
| 543 | def getGenotype(self, subject, marker):
|
---|
| 544 | """ Retrieve a genotype for a particular subject/marker pair
|
---|
| 545 | """
|
---|
| 546 | m = self._ordered_markers.index(marker)
|
---|
| 547 | s = self._ordered_subjects.index(subject)
|
---|
| 548 | return self.getGenotypeByIndices(s, m)
|
---|
| 549 |
|
---|
| 550 | def getGenotypesForSubject(self, s, raw=False):
|
---|
| 551 | """ Returns list of genotypes for all m markers
|
---|
| 552 | for subject s. If raw==True, then an array
|
---|
| 553 | of raw integer gcodes is returned instead
|
---|
| 554 | """
|
---|
| 555 | if self._quick:
|
---|
| 556 | nmarkers = len(self._markers)
|
---|
| 557 | raw_array = array('i', [0]*nmarkers)
|
---|
| 558 | seek_nibble = s % 4
|
---|
| 559 | for m in xrange(nmarkers):
|
---|
| 560 | seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH
|
---|
| 561 | self._bedf.seek(seek_byte)
|
---|
| 562 | geno = struct.unpack('B', self._bedf.read(1))[0]
|
---|
| 563 | quartet = INT_TO_GCODE[geno]
|
---|
| 564 | gcode = quartet[seek_nibble]
|
---|
| 565 | raw_array[m] = gcode
|
---|
| 566 | else:
|
---|
| 567 | raw_array = array('i', [row[s] for row in self._genotypes])
|
---|
| 568 |
|
---|
| 569 | if raw:
|
---|
| 570 | return raw_array
|
---|
| 571 | else:
|
---|
| 572 | result = []
|
---|
| 573 | for m, gcode in enumerate(raw_array):
|
---|
| 574 | result.append(self._marker_allele_lookup[m][gcode])
|
---|
| 575 | return result
|
---|
| 576 |
|
---|
| 577 | def getGenotypeByIndices(self, s, m):
|
---|
| 578 | """
|
---|
| 579 | """
|
---|
| 580 | if self._quick:
|
---|
| 581 | # Determine which byte we need to seek to, and
|
---|
| 582 | # which nibble within the byte we need
|
---|
| 583 | seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH
|
---|
| 584 | seek_nibble = s % 4
|
---|
| 585 | self._bedf.seek(seek_byte)
|
---|
| 586 | geno = struct.unpack('B', self._bedf.read(1))[0]
|
---|
| 587 | quartet = INT_TO_GCODE[geno]
|
---|
| 588 | gcode = quartet[seek_nibble]
|
---|
| 589 | else:
|
---|
| 590 | # Otherwise, just grab the genotypes from the
|
---|
| 591 | # list of arrays
|
---|
| 592 | genos_for_marker = self._genotypes[m]
|
---|
| 593 | gcode = genos_for_marker[s]
|
---|
| 594 |
|
---|
| 595 | return self._marker_allele_lookup[m][gcode]
|
---|
| 596 |
|
---|
| 597 | def getGenotypesByIndices(self, s, mlist, format):
|
---|
| 598 | """
|
---|
| 599 | """
|
---|
| 600 | if self._quick:
|
---|
| 601 | raw_array = array('i', [0]*len(mlist))
|
---|
| 602 | seek_nibble = s % 4
|
---|
| 603 | for i,m in enumerate(mlist):
|
---|
| 604 | seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH
|
---|
| 605 | self._bedf.seek(seek_byte)
|
---|
| 606 | geno = struct.unpack('B', self._bedf.read(1))[0]
|
---|
| 607 | quartet = INT_TO_GCODE[geno]
|
---|
| 608 | gcode = quartet[seek_nibble]
|
---|
| 609 | raw_array[i] = gcode
|
---|
| 610 | mlist = set(mlist)
|
---|
| 611 | else:
|
---|
| 612 | mlist = set(mlist)
|
---|
| 613 | raw_array = array('i', [row[s] for m,row in enumerate(self._genotypes) if m in mlist])
|
---|
| 614 |
|
---|
| 615 | if format == 'raw':
|
---|
| 616 | return raw_array
|
---|
| 617 | elif format == 'ref':
|
---|
| 618 | result = array('i', [0]*len(mlist))
|
---|
| 619 | for m, gcode in enumerate(raw_array):
|
---|
| 620 | if gcode == HOM0:
|
---|
| 621 | nref = 3
|
---|
| 622 | elif gcode == HET:
|
---|
| 623 | nref = 2
|
---|
| 624 | elif gcode == HOM1:
|
---|
| 625 | nref = 1
|
---|
| 626 | else:
|
---|
| 627 | nref = 0
|
---|
| 628 | result[m] = nref
|
---|
| 629 | return result
|
---|
| 630 | else:
|
---|
| 631 | result = []
|
---|
| 632 | for m, gcode in enumerate(raw_array):
|
---|
| 633 | result.append(self._marker_allele_lookup[m][gcode])
|
---|
| 634 | return result
|
---|
| 635 |
|
---|
def getSubject(self, s):
    """ Return the stored record of the subject at position s.

    The position is translated to a subject key through
    self._ordered_subjects, and that key is looked up in self._subjects.
    """
    return self._subjects[self._ordered_subjects[s]]
| 641 |
|
---|
def autosomal_indices(self):
    """ Give back the indices of the autosomal markers in this ped/map.

    rgGRR uses them to draw its random marker sample from the autosomes
    only, because sex chromosomes distort the plot.
    """
    autosomal = self._autosomal_indices
    return autosomal
| 648 |
|
---|
class Bed:
    """ Reader for a binary PED (*.bed) genotype file.

    After parse(..., quick=False), self._genotypes holds one array('i') of
    genotype codes per marker, in file order.
    """

    def __init__(self, path):
        """ Remember the path of the bed file; nothing is opened yet.
        """
        self.path = path
        self._genotypes = []
        self._fh = None

    def parse(self, subjects, markers, quick=False):
        """ Parse the bed file at self.path.

        subjects and markers are the ordered subject and marker
        collections; only their lengths are used here. The three header
        bytes are always validated. If quick is True nothing else is read
        and the open file is left in self._fh so that genotypes can be
        looked up dynamically by indices. Otherwise every marker's
        genotypes are appended to self._genotypes and the file is closed.

        Raises BadMagic when the magic bytes are wrong, ValueError when
        the format flag is zero (not SNP-major), and struct.error when the
        file is shorter than the header or the expected genotype data.
        """
        self._quick = quick

        nsubjects = len(subjects)
        nmarkers = len(markers)

        bed = open(self.path, 'rb')
        self._fh = bed
        keep_open = False
        try:
            # A short read makes unpack raise struct.error.
            byte1, byte2, format_flag = struct.unpack('BBB', bed.read(3))

            h1 = tuple(INT_TO_GCODE[byte1])
            h2 = tuple(INT_TO_GCODE[byte2])
            h3 = tuple(INT_TO_GCODE[format_flag])

            if h1 != MAGIC1 or h2 != MAGIC2:
                raise BadMagic('One or both MAGIC bytes is wrong: %s==%s or %s==%s' % (h1, MAGIC1, h2, MAGIC2))
            if not format_flag:
                # A real exception class: string exceptions stop working
                # from Python 2.6 on.
                raise ValueError('BAD_FORMAT_FLAG? (%s, "%s")\n' % (format_flag, h3))
            print('Detected that binary PED file is v1.00 SNP-major mode (%s, "%s")\n' % (format_flag, h3))

            print('Parsing binary ped file for %s markers and %s subjects' % (nmarkers, nsubjects))

            ### If quick mode was specified, we're done ...
            if quick:
                keep_open = True
                return

            ### ... Otherwise, parse genotypes into an array, and append that
            ### array to self._genotypes
            ngcodes = ceiling(nsubjects, 4)
            bytes_per_marker = nbytes(nsubjects)
            marker_format = '%dB' % bytes_per_marker
            for m in xrange(nmarkers):
                genotype_array = array('i', [-1] * ngcodes)
                # One read and one unpack per marker instead of per byte;
                # a truncated file still surfaces as struct.error.
                intvals = struct.unpack(marker_format, bed.read(bytes_per_marker))
                for byte, intval in enumerate(intvals):
                    idx = byte * 4
                    genotype_array[idx:idx + 4] = INT_TO_GCODE[intval]
                self._genotypes.append(genotype_array)
        finally:
            # Only a successful quick parse needs the handle afterwards.
            if not keep_open:
                bed.close()
                self._fh = None
| 706 |
|
---|
class Bim:
    """ Parser for an extended-format map (*.bim) file.
    """

    def __init__(self, path):
        """ Remember the path and create the empty marker containers.
        """
        self.path = path
        self._markers = {}
        self._ordered_markers = []
        self._marker_allele_lookup = {}
        self._autosomal_indices = set()

    def parse(self):
        """ Read self.path, one marker per line.

        Each line must hold six whitespace-separated fields: chrom, snp,
        gpos, apos, a1, a2. For the marker on line index m this fills
        self._markers (keyed by snp), self._ordered_markers,
        self._marker_allele_lookup[m] (genotype code -> allele pair) and,
        when chrom is in AUTOSOMES, self._autosomal_indices.

        Raises ValueError when a line does not have exactly six fields.
        """
        print('Reading map (extended format) from [ %s ]' % (self.path))
        nlines = 0
        bim = open(self.path, 'r')
        try:
            for m, line in enumerate(bim):
                fields = line.split()
                if len(fields) != 6:
                    raise ValueError(
                        'Expected 6 fields on line %s of [ %s ] but found %s: %r'
                        % (m + 1, self.path, len(fields), line))
                chrom, snp, gpos, apos, a1, a2 = fields
                self._markers[snp] = (chrom, snp, gpos, apos, a1, a2)
                self._marker_allele_lookup[m] = {
                    HOM0: (a2, a2),
                    HOM1: (a1, a1),
                    HET : (a1, a2),
                    MISS: ('0','0'),
                }
                self._ordered_markers.append(snp)
                if chrom in AUTOSOMES:
                    self._autosomal_indices.add(m)
                nlines = m + 1
        finally:
            # Close the file even when a malformed line aborts the parse.
            bim.close()
        # nlines stays 0 for an empty file, where the loop index is unbound.
        print('%s markers to be included from [ %s ]' % (nlines, self.path))
| 736 |
|
---|
class Fam:
    """ Parser for a pedigree (*.fam) file.
    """

    def __init__(self, path):
        """ Remember the path and create the empty subject containers.
        """
        self.path = path
        self._subjects = {}
        self._ordered_subjects = []

    def parse(self):
        """ Read self.path, one subject per line.

        Each line must hold six whitespace-separated fields: fid, iid,
        did, mid, sex, phe. The subject key is (fid, iid); keys are
        appended to self._ordered_subjects in file order, and
        self._subjects maps each key to
        (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid), where the last
        three are iid, did and mid cut at their first '.'.

        Raises ValueError when a line does not have exactly six fields.
        """
        print('Reading pedigree information from [ %s ]' % (self.path))
        nlines = 0
        fam = open(self.path, 'r')
        try:
            for s, line in enumerate(fam):
                fields = line.split()
                if len(fields) != 6:
                    raise ValueError(
                        'Expected 6 fields on line %s of [ %s ] but found %s: %r'
                        % (s + 1, self.path, len(fields), line))
                fid, iid, did, mid, sex, phe = fields
                sid = iid.split('.')[0]
                d_sid = did.split('.')[0]
                m_sid = mid.split('.')[0]
                skey = (fid, iid)
                self._ordered_subjects.append(skey)
                self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid)
                nlines = s + 1
        finally:
            # Close the file even when a malformed line aborts the parse.
            fam.close()
        # nlines stays 0 for an empty file, where the loop index is unbound.
        print('%s individuals read from [ %s ]' % (nlines, self.path))
| 760 |
|
---|
| 761 | ### Command-line functionality and testing
|
---|
def test(arg):
    '''
    Time the binary and linkage ped readers and writers on one dataset.

    arg is the path handed to BPed and LPed. Each stage prints its name and
    the elapsed wall-clock seconds. Output files are written next to the
    input as <arg>_BEDTEST and <arg>_PEDTEST.
    '''

    import time

    # Extra spot checks that only run for one specific dataset.
    # NOTE(review): Bed.parse() in this file requires subjects and markers,
    # and Bed defines no getGenotype(), so this branch looks stale --
    # presumably BPed was intended; confirm before relying on it.
    if arg == 'CAMP_AFFY.ped':
        print 'Testing bed.parse(quick=True)'
        s = time.time()
        bed = Bed(arg.replace('.ped', '.bed'))
        bed.parse(quick=True)
        print bed.getGenotype(('400118', '10300283'), 'rs2000467')
        print bed.getGenotype(('400118', '10101384'), 'rs2294019')
        print bed.getGenotype(('400121', '10101149'), 'rs2294019')
        print bed.getGenotype(('400123', '10200290'), 'rs2294019')
        assert bed.getGenotype(('400118', '10101384'), 'rs2294019') == ('4','4')
        e = time.time()
        print 'e-s = %s\n' % (e-s)

    # Full (non-quick) parse of the binary ped.
    print 'Testing bed.parse'
    s = time.time()
    bed = BPed(arg)
    bed.parse(quick=False)
    e = time.time()
    print 'e-s = %s\n' % (e-s)

    # Write the parsed binary ped back out through writeped().
    print 'Testing bed.writeped'
    s = time.time()
    outname = '%s_BEDTEST' % (arg)
    bed.writeped(outname)
    e = time.time()
    print 'e-s = %s\n' % (e-s)
    # Drop the reference before the linkage ped is loaded.
    del(bed)

    # Parse of the linkage ped.
    print 'Testing ped.parse'
    s = time.time()
    ped = LPed(arg)
    ped.parse()
    e = time.time()
    print 'e-s = %s\n' % (e-s)

    # Write the parsed linkage ped out through writebed().
    print 'Testing ped.writebed'
    s = time.time()
    outname = '%s_PEDTEST' % (arg)
    ped.writebed(outname)
    e = time.time()
    print 'e-s = %s\n' % (e-s)
    del(ped)
| 810 |
|
---|
def profile_bed(arg):
    """ Profiling workload for the binary ped reader.

    Builds a BPed for arg, parses it with quick=False and calls writeped()
    with the output name '<arg>_BEDPROFILE'.
    """
    bped = BPed(arg)
    bped.parse(quick=False)
    bped.writeped('%s_BEDPROFILE' % (arg))
| 818 |
|
---|
def profile_ped(arg):
    """ Profiling workload for the linkage ped reader.

    Builds an LPed for arg, parses it and calls writebed() with the output
    name '<arg>_PEDPROFILE'.
    """
    lped = LPed(arg)
    lped.parse()
    lped.writebed('%s_PEDPROFILE' % (arg))
| 826 |
|
---|
if __name__ == '__main__':
    # Run as a command-line, this script should get one or more arguments,
    # each one a ped file to be parsed with the PedParser (unit tests?)
    op = optparse.OptionParser()
    op.add_option('--profile-bed', action='store_true', default=False)
    op.add_option('--profile-ped', action='store_true', default=False)
    opts, args = op.parse_args()

    if opts.profile_bed or opts.profile_ped:
        import profile
        import pstats
        # --profile-bed wins when both flags are given.
        if opts.profile_bed:
            statement = 'profile_bed(args[0])'
        else:
            statement = 'profile_ped(args[0])'
        # The statement is evaluated in this module's namespace, so it
        # sees the module-level args; stats go to the file 'fooprof'.
        profile.run(statement, 'fooprof')
        pstats.Stats('fooprof').sort_stats('cumulative').print_stats(10)
    else:
        for arg in args:
            test(arg)
| 851 |
|
---|
| 852 | ### Code used to generate the INT_TO_GCODE dictionary
|
---|
| 853 | #print '{\n ',
|
---|
| 854 | #for i in range(256):
|
---|
| 855 | # b = INT2BIN[i]
|
---|
| 856 | # ints = []
|
---|
| 857 | # s = str(i).rjust(3)
|
---|
| 858 | # #print b
|
---|
| 859 | # for j in range(4):
|
---|
| 860 | # idx = j*2
|
---|
| 861 | # #print i, j, idx, b[idx:idx+2], int(b[idx:idx+2], 2)
|
---|
| 862 | # ints.append(int(b[idx:idx+2], 2))
|
---|
| 863 | # print '%s: array(\'i\', %s),' % (s,tuple(ints)),
|
---|
| 864 | # if i > 0 and (i+1) % 4 == 0:
|
---|
| 865 | # print '\n ',
|
---|
| 866 | #print '}'
|
---|
| 867 |
|
---|
| 868 |
|
---|