1 | #!/usr/bin/env python2.4
|
---|
2 | """
|
---|
3 | """
|
---|
4 |
|
---|
5 | import optparse,os,subprocess,gzip,struct,time,commands
|
---|
6 | from array import array
|
---|
7 |
|
---|
8 | #from AIMS import util
|
---|
9 | #from pga import util as pgautil
|
---|
10 |
|
---|
11 | __FILE_ID__ = '$Id: plinkbinJZ.py,v 1.14 2009/07/13 20:16:50 rejpz Exp $'
|
---|
12 |
|
---|
13 | VERBOSE = True
|
---|
14 |
|
---|
15 | MISSING_ALLELES = set(['N', '0', '.', '-',''])
|
---|
16 |
|
---|
17 | AUTOSOMES = set(range(1, 23) + [str(c) for c in range(1, 23)])
|
---|
18 |
|
---|
19 | MAGIC_BYTE1 = '00110110'
|
---|
20 | MAGIC_BYTE2 = '11011000'
|
---|
21 | FORMAT_SNP_MAJOR_BYTE = '10000000'
|
---|
22 | FORMAT_IND_MAJOR_BYTE = '00000000'
|
---|
23 | MAGIC1 = (0, 3, 1, 2)
|
---|
24 | MAGIC2 = (3, 1, 2, 0)
|
---|
25 | FORMAT_SNP_MAJOR = (2, 0, 0, 0)
|
---|
26 | FORMAT_IND_MAJOR = (0, 0, 0, 0)
|
---|
27 | HEADER_LENGTH = 3
|
---|
28 |
|
---|
29 | HOM0 = 3
|
---|
30 | HOM1 = 0
|
---|
31 | MISS = 2
|
---|
32 | HET = 1
|
---|
33 | HOM0_GENO = (0, 0)
|
---|
34 | HOM1_GENO = (1, 1)
|
---|
35 | HET_GENO = (0, 1)
|
---|
36 | MISS_GENO = (-9, -9)
|
---|
37 |
|
---|
38 | GENO_TO_GCODE = {
|
---|
39 | HOM0_GENO: HOM0,
|
---|
40 | HET_GENO: HET,
|
---|
41 | HOM1_GENO: HOM1,
|
---|
42 | MISS_GENO: MISS,
|
---|
43 | }
|
---|
44 |
|
---|
45 | CHROM_REPLACE = {
|
---|
46 | 'X': '23',
|
---|
47 | 'Y': '24',
|
---|
48 | 'XY': '25',
|
---|
49 | 'MT': '26',
|
---|
50 | 'M': '26',
|
---|
51 | }
|
---|
52 |
|
---|
53 | MAP_LINE_EXCEPTION_TEXT = """
|
---|
54 | One or more lines in the *.map file has only three fields.
|
---|
55 | The line was:
|
---|
56 |
|
---|
57 | %s
|
---|
58 |
|
---|
59 | If you are running rgGRR through EPMP, this is usually a
|
---|
60 | sign that you are using an old version of the map file.
|
---|
61 | You can correct the problem by re-running Subject QC. If
|
---|
62 | you have already tried this, please contact the developers,
|
---|
63 | or file a bug.
|
---|
64 | """
|
---|
65 |
|
---|
66 | INT_TO_GCODE = {
|
---|
67 | 0: array('i', (0, 0, 0, 0)), 1: array('i', (2, 0, 0, 0)), 2: array('i', (1, 0, 0, 0)), 3: array('i', (3, 0, 0, 0)),
|
---|
68 | 4: array('i', (0, 2, 0, 0)), 5: array('i', (2, 2, 0, 0)), 6: array('i', (1, 2, 0, 0)), 7: array('i', (3, 2, 0, 0)),
|
---|
69 | 8: array('i', (0, 1, 0, 0)), 9: array('i', (2, 1, 0, 0)), 10: array('i', (1, 1, 0, 0)), 11: array('i', (3, 1, 0, 0)),
|
---|
70 | 12: array('i', (0, 3, 0, 0)), 13: array('i', (2, 3, 0, 0)), 14: array('i', (1, 3, 0, 0)), 15: array('i', (3, 3, 0, 0)),
|
---|
71 | 16: array('i', (0, 0, 2, 0)), 17: array('i', (2, 0, 2, 0)), 18: array('i', (1, 0, 2, 0)), 19: array('i', (3, 0, 2, 0)),
|
---|
72 | 20: array('i', (0, 2, 2, 0)), 21: array('i', (2, 2, 2, 0)), 22: array('i', (1, 2, 2, 0)), 23: array('i', (3, 2, 2, 0)),
|
---|
73 | 24: array('i', (0, 1, 2, 0)), 25: array('i', (2, 1, 2, 0)), 26: array('i', (1, 1, 2, 0)), 27: array('i', (3, 1, 2, 0)),
|
---|
74 | 28: array('i', (0, 3, 2, 0)), 29: array('i', (2, 3, 2, 0)), 30: array('i', (1, 3, 2, 0)), 31: array('i', (3, 3, 2, 0)),
|
---|
75 | 32: array('i', (0, 0, 1, 0)), 33: array('i', (2, 0, 1, 0)), 34: array('i', (1, 0, 1, 0)), 35: array('i', (3, 0, 1, 0)),
|
---|
76 | 36: array('i', (0, 2, 1, 0)), 37: array('i', (2, 2, 1, 0)), 38: array('i', (1, 2, 1, 0)), 39: array('i', (3, 2, 1, 0)),
|
---|
77 | 40: array('i', (0, 1, 1, 0)), 41: array('i', (2, 1, 1, 0)), 42: array('i', (1, 1, 1, 0)), 43: array('i', (3, 1, 1, 0)),
|
---|
78 | 44: array('i', (0, 3, 1, 0)), 45: array('i', (2, 3, 1, 0)), 46: array('i', (1, 3, 1, 0)), 47: array('i', (3, 3, 1, 0)),
|
---|
79 | 48: array('i', (0, 0, 3, 0)), 49: array('i', (2, 0, 3, 0)), 50: array('i', (1, 0, 3, 0)), 51: array('i', (3, 0, 3, 0)),
|
---|
80 | 52: array('i', (0, 2, 3, 0)), 53: array('i', (2, 2, 3, 0)), 54: array('i', (1, 2, 3, 0)), 55: array('i', (3, 2, 3, 0)),
|
---|
81 | 56: array('i', (0, 1, 3, 0)), 57: array('i', (2, 1, 3, 0)), 58: array('i', (1, 1, 3, 0)), 59: array('i', (3, 1, 3, 0)),
|
---|
82 | 60: array('i', (0, 3, 3, 0)), 61: array('i', (2, 3, 3, 0)), 62: array('i', (1, 3, 3, 0)), 63: array('i', (3, 3, 3, 0)),
|
---|
83 | 64: array('i', (0, 0, 0, 2)), 65: array('i', (2, 0, 0, 2)), 66: array('i', (1, 0, 0, 2)), 67: array('i', (3, 0, 0, 2)),
|
---|
84 | 68: array('i', (0, 2, 0, 2)), 69: array('i', (2, 2, 0, 2)), 70: array('i', (1, 2, 0, 2)), 71: array('i', (3, 2, 0, 2)),
|
---|
85 | 72: array('i', (0, 1, 0, 2)), 73: array('i', (2, 1, 0, 2)), 74: array('i', (1, 1, 0, 2)), 75: array('i', (3, 1, 0, 2)),
|
---|
86 | 76: array('i', (0, 3, 0, 2)), 77: array('i', (2, 3, 0, 2)), 78: array('i', (1, 3, 0, 2)), 79: array('i', (3, 3, 0, 2)),
|
---|
87 | 80: array('i', (0, 0, 2, 2)), 81: array('i', (2, 0, 2, 2)), 82: array('i', (1, 0, 2, 2)), 83: array('i', (3, 0, 2, 2)),
|
---|
88 | 84: array('i', (0, 2, 2, 2)), 85: array('i', (2, 2, 2, 2)), 86: array('i', (1, 2, 2, 2)), 87: array('i', (3, 2, 2, 2)),
|
---|
89 | 88: array('i', (0, 1, 2, 2)), 89: array('i', (2, 1, 2, 2)), 90: array('i', (1, 1, 2, 2)), 91: array('i', (3, 1, 2, 2)),
|
---|
90 | 92: array('i', (0, 3, 2, 2)), 93: array('i', (2, 3, 2, 2)), 94: array('i', (1, 3, 2, 2)), 95: array('i', (3, 3, 2, 2)),
|
---|
91 | 96: array('i', (0, 0, 1, 2)), 97: array('i', (2, 0, 1, 2)), 98: array('i', (1, 0, 1, 2)), 99: array('i', (3, 0, 1, 2)),
|
---|
92 | 100: array('i', (0, 2, 1, 2)), 101: array('i', (2, 2, 1, 2)), 102: array('i', (1, 2, 1, 2)), 103: array('i', (3, 2, 1, 2)),
|
---|
93 | 104: array('i', (0, 1, 1, 2)), 105: array('i', (2, 1, 1, 2)), 106: array('i', (1, 1, 1, 2)), 107: array('i', (3, 1, 1, 2)),
|
---|
94 | 108: array('i', (0, 3, 1, 2)), 109: array('i', (2, 3, 1, 2)), 110: array('i', (1, 3, 1, 2)), 111: array('i', (3, 3, 1, 2)),
|
---|
95 | 112: array('i', (0, 0, 3, 2)), 113: array('i', (2, 0, 3, 2)), 114: array('i', (1, 0, 3, 2)), 115: array('i', (3, 0, 3, 2)),
|
---|
96 | 116: array('i', (0, 2, 3, 2)), 117: array('i', (2, 2, 3, 2)), 118: array('i', (1, 2, 3, 2)), 119: array('i', (3, 2, 3, 2)),
|
---|
97 | 120: array('i', (0, 1, 3, 2)), 121: array('i', (2, 1, 3, 2)), 122: array('i', (1, 1, 3, 2)), 123: array('i', (3, 1, 3, 2)),
|
---|
98 | 124: array('i', (0, 3, 3, 2)), 125: array('i', (2, 3, 3, 2)), 126: array('i', (1, 3, 3, 2)), 127: array('i', (3, 3, 3, 2)),
|
---|
99 | 128: array('i', (0, 0, 0, 1)), 129: array('i', (2, 0, 0, 1)), 130: array('i', (1, 0, 0, 1)), 131: array('i', (3, 0, 0, 1)),
|
---|
100 | 132: array('i', (0, 2, 0, 1)), 133: array('i', (2, 2, 0, 1)), 134: array('i', (1, 2, 0, 1)), 135: array('i', (3, 2, 0, 1)),
|
---|
101 | 136: array('i', (0, 1, 0, 1)), 137: array('i', (2, 1, 0, 1)), 138: array('i', (1, 1, 0, 1)), 139: array('i', (3, 1, 0, 1)),
|
---|
102 | 140: array('i', (0, 3, 0, 1)), 141: array('i', (2, 3, 0, 1)), 142: array('i', (1, 3, 0, 1)), 143: array('i', (3, 3, 0, 1)),
|
---|
103 | 144: array('i', (0, 0, 2, 1)), 145: array('i', (2, 0, 2, 1)), 146: array('i', (1, 0, 2, 1)), 147: array('i', (3, 0, 2, 1)),
|
---|
104 | 148: array('i', (0, 2, 2, 1)), 149: array('i', (2, 2, 2, 1)), 150: array('i', (1, 2, 2, 1)), 151: array('i', (3, 2, 2, 1)),
|
---|
105 | 152: array('i', (0, 1, 2, 1)), 153: array('i', (2, 1, 2, 1)), 154: array('i', (1, 1, 2, 1)), 155: array('i', (3, 1, 2, 1)),
|
---|
106 | 156: array('i', (0, 3, 2, 1)), 157: array('i', (2, 3, 2, 1)), 158: array('i', (1, 3, 2, 1)), 159: array('i', (3, 3, 2, 1)),
|
---|
107 | 160: array('i', (0, 0, 1, 1)), 161: array('i', (2, 0, 1, 1)), 162: array('i', (1, 0, 1, 1)), 163: array('i', (3, 0, 1, 1)),
|
---|
108 | 164: array('i', (0, 2, 1, 1)), 165: array('i', (2, 2, 1, 1)), 166: array('i', (1, 2, 1, 1)), 167: array('i', (3, 2, 1, 1)),
|
---|
109 | 168: array('i', (0, 1, 1, 1)), 169: array('i', (2, 1, 1, 1)), 170: array('i', (1, 1, 1, 1)), 171: array('i', (3, 1, 1, 1)),
|
---|
110 | 172: array('i', (0, 3, 1, 1)), 173: array('i', (2, 3, 1, 1)), 174: array('i', (1, 3, 1, 1)), 175: array('i', (3, 3, 1, 1)),
|
---|
111 | 176: array('i', (0, 0, 3, 1)), 177: array('i', (2, 0, 3, 1)), 178: array('i', (1, 0, 3, 1)), 179: array('i', (3, 0, 3, 1)),
|
---|
112 | 180: array('i', (0, 2, 3, 1)), 181: array('i', (2, 2, 3, 1)), 182: array('i', (1, 2, 3, 1)), 183: array('i', (3, 2, 3, 1)),
|
---|
113 | 184: array('i', (0, 1, 3, 1)), 185: array('i', (2, 1, 3, 1)), 186: array('i', (1, 1, 3, 1)), 187: array('i', (3, 1, 3, 1)),
|
---|
114 | 188: array('i', (0, 3, 3, 1)), 189: array('i', (2, 3, 3, 1)), 190: array('i', (1, 3, 3, 1)), 191: array('i', (3, 3, 3, 1)),
|
---|
115 | 192: array('i', (0, 0, 0, 3)), 193: array('i', (2, 0, 0, 3)), 194: array('i', (1, 0, 0, 3)), 195: array('i', (3, 0, 0, 3)),
|
---|
116 | 196: array('i', (0, 2, 0, 3)), 197: array('i', (2, 2, 0, 3)), 198: array('i', (1, 2, 0, 3)), 199: array('i', (3, 2, 0, 3)),
|
---|
117 | 200: array('i', (0, 1, 0, 3)), 201: array('i', (2, 1, 0, 3)), 202: array('i', (1, 1, 0, 3)), 203: array('i', (3, 1, 0, 3)),
|
---|
118 | 204: array('i', (0, 3, 0, 3)), 205: array('i', (2, 3, 0, 3)), 206: array('i', (1, 3, 0, 3)), 207: array('i', (3, 3, 0, 3)),
|
---|
119 | 208: array('i', (0, 0, 2, 3)), 209: array('i', (2, 0, 2, 3)), 210: array('i', (1, 0, 2, 3)), 211: array('i', (3, 0, 2, 3)),
|
---|
120 | 212: array('i', (0, 2, 2, 3)), 213: array('i', (2, 2, 2, 3)), 214: array('i', (1, 2, 2, 3)), 215: array('i', (3, 2, 2, 3)),
|
---|
121 | 216: array('i', (0, 1, 2, 3)), 217: array('i', (2, 1, 2, 3)), 218: array('i', (1, 1, 2, 3)), 219: array('i', (3, 1, 2, 3)),
|
---|
122 | 220: array('i', (0, 3, 2, 3)), 221: array('i', (2, 3, 2, 3)), 222: array('i', (1, 3, 2, 3)), 223: array('i', (3, 3, 2, 3)),
|
---|
123 | 224: array('i', (0, 0, 1, 3)), 225: array('i', (2, 0, 1, 3)), 226: array('i', (1, 0, 1, 3)), 227: array('i', (3, 0, 1, 3)),
|
---|
124 | 228: array('i', (0, 2, 1, 3)), 229: array('i', (2, 2, 1, 3)), 230: array('i', (1, 2, 1, 3)), 231: array('i', (3, 2, 1, 3)),
|
---|
125 | 232: array('i', (0, 1, 1, 3)), 233: array('i', (2, 1, 1, 3)), 234: array('i', (1, 1, 1, 3)), 235: array('i', (3, 1, 1, 3)),
|
---|
126 | 236: array('i', (0, 3, 1, 3)), 237: array('i', (2, 3, 1, 3)), 238: array('i', (1, 3, 1, 3)), 239: array('i', (3, 3, 1, 3)),
|
---|
127 | 240: array('i', (0, 0, 3, 3)), 241: array('i', (2, 0, 3, 3)), 242: array('i', (1, 0, 3, 3)), 243: array('i', (3, 0, 3, 3)),
|
---|
128 | 244: array('i', (0, 2, 3, 3)), 245: array('i', (2, 2, 3, 3)), 246: array('i', (1, 2, 3, 3)), 247: array('i', (3, 2, 3, 3)),
|
---|
129 | 248: array('i', (0, 1, 3, 3)), 249: array('i', (2, 1, 3, 3)), 250: array('i', (1, 1, 3, 3)), 251: array('i', (3, 1, 3, 3)),
|
---|
130 | 252: array('i', (0, 3, 3, 3)), 253: array('i', (2, 3, 3, 3)), 254: array('i', (1, 3, 3, 3)), 255: array('i', (3, 3, 3, 3)),
|
---|
131 | }
|
---|
132 |
|
---|
133 | GCODE_TO_INT = dict([(tuple(v),k) for (k,v) in INT_TO_GCODE.items()])
|
---|
134 |
|
---|
135 | ### Exceptions
|
---|
136 | class DuplicateMarkerInMapFile(Exception): pass
|
---|
137 | class MapLineTooShort(Exception): pass
|
---|
138 | class ThirdAllele(Exception): pass
|
---|
139 | class PedError(Exception): pass
|
---|
140 | class BadMagic(Exception):
|
---|
141 | """ Raised when one of the MAGIC bytes in a bed file does not match
|
---|
142 | """
|
---|
143 | pass
|
---|
144 | class BedError(Exception):
|
---|
145 | """ Raised when parsing a bed file runs into problems
|
---|
146 | """
|
---|
147 | pass
|
---|
148 | class UnknownGenocode(Exception):
|
---|
149 | """ Raised when we get a 2-bit genotype that is undecipherable (is it possible?)
|
---|
150 | """
|
---|
151 | pass
|
---|
152 | class UnknownGeno(Exception): pass
|
---|
153 |
|
---|
154 | ### Utility functions
|
---|
155 |
|
---|
156 | def timenow():
|
---|
157 | """return current time as a string
|
---|
158 | """
|
---|
159 | return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
|
---|
160 |
|
---|
161 | def ceiling(n, k):
|
---|
162 | ''' Return the least multiple of k which is greater than n
|
---|
163 | '''
|
---|
164 | m = n % k
|
---|
165 | if m == 0:
|
---|
166 | return n
|
---|
167 | else:
|
---|
168 | return n + k - m
|
---|
169 |
|
---|
170 | def nbytes(n):
|
---|
171 | ''' Return the number of bytes required for n subjects
|
---|
172 | '''
|
---|
173 | return 2*ceiling(n, 4)/8
|
---|
174 |
|
---|
175 | ### Primary module functionality
|
---|
176 | class LPed:
|
---|
177 | """ The uber-class for processing the Linkage-format *.ped/*.map files
|
---|
178 | """
|
---|
179 | def __init__(self, base):
|
---|
180 | self.base = base
|
---|
181 | self._ped = Ped('%s.ped' % (self.base))
|
---|
182 | self._map = Map('%s.map' % (self.base))
|
---|
183 |
|
---|
184 | self._markers = {}
|
---|
185 | self._ordered_markers = []
|
---|
186 | self._marker_allele_lookup = {}
|
---|
187 | self._autosomal_indices = set()
|
---|
188 |
|
---|
189 | self._subjects = {}
|
---|
190 | self._ordered_subjects = []
|
---|
191 |
|
---|
192 | self._genotypes = []
|
---|
193 |
|
---|
194 | def parse(self):
|
---|
195 | """
|
---|
196 | """
|
---|
197 | if VERBOSE: print 'plinkbinJZ: Analysis started: %s' % (timenow())
|
---|
198 | self._map.parse()
|
---|
199 | self._markers = self._map._markers
|
---|
200 | self._ordered_markers = self._map._ordered_markers
|
---|
201 | self._autosomal_indices = self._map._autosomal_indices
|
---|
202 |
|
---|
203 | self._ped.parse(self._ordered_markers)
|
---|
204 | self._subjects = self._ped._subjects
|
---|
205 | self._ordered_subjects = self._ped._ordered_subjects
|
---|
206 | self._genotypes = self._ped._genotypes
|
---|
207 | self._marker_allele_lookup = self._ped._marker_allele_lookup
|
---|
208 |
|
---|
209 | ### Adjust self._markers based on the allele information
|
---|
210 | ### we got from parsing the ped file
|
---|
211 | for m, name in enumerate(self._ordered_markers):
|
---|
212 | a1, a2 = self._marker_allele_lookup[m][HET]
|
---|
213 | self._markers[name][-2] = a1
|
---|
214 | self._markers[name][-1] = a2
|
---|
215 | if VERBOSE: print 'plinkbinJZ: Analysis finished: %s' % (timenow())
|
---|
216 |
|
---|
217 | def getSubjectInfo(self, fid, oiid):
|
---|
218 | """
|
---|
219 | """
|
---|
220 | return self._subject_info[(fid, oiid)]
|
---|
221 |
|
---|
222 | def getSubjectInfoByLine(self, line):
|
---|
223 | """
|
---|
224 | """
|
---|
225 | return self._subject_info[self._ordered_subjects[line]]
|
---|
226 |
|
---|
227 | def getGenotypesByIndices(self, s, mlist, format):
|
---|
228 | """ needed for grr if lped - deprecated but..
|
---|
229 | """
|
---|
230 | mlist = dict(zip(mlist,[True,]*len(mlist))) # hash quicker than 'in' ?
|
---|
231 | raw_array = array('i', [row[s] for m,row in enumerate(self._genotypes) if mlist.get(m,None)])
|
---|
232 | if format == 'raw':
|
---|
233 | return raw_array
|
---|
234 | elif format == 'ref':
|
---|
235 | result = array('i', [0]*len(mlist))
|
---|
236 | for m, gcode in enumerate(raw_array):
|
---|
237 | if gcode == HOM0:
|
---|
238 | nref = 3
|
---|
239 | elif gcode == HET:
|
---|
240 | nref = 2
|
---|
241 | elif gcode == HOM1:
|
---|
242 | nref = 1
|
---|
243 | else:
|
---|
244 | nref = 0
|
---|
245 | result[m] = nref
|
---|
246 | return result
|
---|
247 | else:
|
---|
248 | result = []
|
---|
249 | for m, gcode in enumerate(raw_array):
|
---|
250 | result.append(self._marker_allele_lookup[m][gcode])
|
---|
251 | return result
|
---|
252 |
|
---|
253 | def writebed(self, base):
|
---|
254 | """
|
---|
255 | """
|
---|
256 | dst_name = '%s.fam' % (base)
|
---|
257 | print 'Writing pedigree information to [ %s ]' % (dst_name)
|
---|
258 | dst = open(dst_name, 'w')
|
---|
259 | for skey in self._ordered_subjects:
|
---|
260 | (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid) = self._subjects[skey]
|
---|
261 | dst.write('%s %s %s %s %s %s\n' % (fid, iid, did, mid, sex, phe))
|
---|
262 | dst.close()
|
---|
263 |
|
---|
264 | dst_name = '%s.bim' % (base)
|
---|
265 | print 'Writing map (extended format) information to [ %s ]' % (dst_name)
|
---|
266 | dst = open(dst_name, 'w')
|
---|
267 | for m, marker in enumerate(self._ordered_markers):
|
---|
268 | chrom, name, genpos, abspos, a1, a2 = self._markers[marker]
|
---|
269 | dst.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (chrom, name, genpos, abspos, a1, a2))
|
---|
270 | dst.close()
|
---|
271 |
|
---|
272 | bed_name = '%s.bed' % (base)
|
---|
273 | print 'Writing genotype bitfile to [ %s ]' % (bed_name)
|
---|
274 | print 'Using (default) SNP-major mode'
|
---|
275 | bed = open(bed_name, 'w')
|
---|
276 |
|
---|
277 | ### Write the 3 header bytes
|
---|
278 | bed.write(struct.pack('B', int(''.join(reversed(MAGIC_BYTE1)), 2)))
|
---|
279 | bed.write(struct.pack('B', int(''.join(reversed(MAGIC_BYTE2)), 2)))
|
---|
280 | bed.write(struct.pack('B', int(''.join(reversed(FORMAT_SNP_MAJOR_BYTE)), 2)))
|
---|
281 |
|
---|
282 | ### Calculate how many "pad bits" we should add after the last subject
|
---|
283 | nsubjects = len(self._ordered_subjects)
|
---|
284 | nmarkers = len(self._ordered_markers)
|
---|
285 | total_bytes = nbytes(nsubjects)
|
---|
286 | nbits = nsubjects * 2
|
---|
287 | pad_nibbles = ((total_bytes * 8) - nbits)/2
|
---|
288 | pad = array('i', [0]*pad_nibbles)
|
---|
289 |
|
---|
290 | ### And now write genotypes to the file
|
---|
291 | for m in xrange(nmarkers):
|
---|
292 | geno = self._genotypes[m]
|
---|
293 | geno.extend(pad)
|
---|
294 | bytes = len(geno)/4
|
---|
295 | for b in range(bytes):
|
---|
296 | idx = b*4
|
---|
297 | gcode = tuple(geno[idx:idx+4])
|
---|
298 | try:
|
---|
299 | byte = struct.pack('B', GCODE_TO_INT[gcode])
|
---|
300 | except KeyError:
|
---|
301 | print m, b, gcode
|
---|
302 | raise
|
---|
303 | bed.write(byte)
|
---|
304 | bed.close()
|
---|
305 |
|
---|
306 | def autosomal_indices(self):
|
---|
307 | """ Return the indices of markers in this ped/map that are autosomal.
|
---|
308 | This is used by rgGRR so that it can select a random set of markers
|
---|
309 | from the autosomes (sex chroms screw up the plot)
|
---|
310 | """
|
---|
311 | return self._autosomal_indices
|
---|
312 |
|
---|
313 | class Ped:
|
---|
314 | def __init__(self, path):
|
---|
315 | self.path = path
|
---|
316 | self._subjects = {}
|
---|
317 | self._ordered_subjects = []
|
---|
318 | self._genotypes = []
|
---|
319 | self._marker_allele_lookup = {}
|
---|
320 |
|
---|
321 | def lineCount(self,infile):
|
---|
322 | """ count the number of lines in a file - efficiently using wget
|
---|
323 | """
|
---|
324 | return int(commands.getoutput('wc -l %s' % (infile)).split()[0])
|
---|
325 |
|
---|
326 |
|
---|
327 | def parse(self, markers):
|
---|
328 | """ Parse a given file -- this needs to be memory-efficient so that large
|
---|
329 | files can be parsed (~1 million markers on ~5000 subjects?). It
|
---|
330 | should also be fast, if possible.
|
---|
331 | """
|
---|
332 |
|
---|
333 | ### Find out how many lines are in the file so we can ...
|
---|
334 | nsubjects = self.lineCount(self.path)
|
---|
335 | ### ... Pre-allocate the genotype arrays
|
---|
336 | nmarkers = len(markers)
|
---|
337 | _marker_alleles = [['0', '0'] for _ in xrange(nmarkers)]
|
---|
338 | self._genotypes = [array('i', [-1]*nsubjects) for _ in xrange(nmarkers)]
|
---|
339 |
|
---|
340 | if self.path.endswith('.gz'):
|
---|
341 | pfile = gzip.open(self.path, 'r')
|
---|
342 | else:
|
---|
343 | pfile = open(self.path, 'r')
|
---|
344 |
|
---|
345 | for s, line in enumerate(pfile):
|
---|
346 | line = line.strip()
|
---|
347 | if not line:
|
---|
348 | continue
|
---|
349 |
|
---|
350 | fid, iid, did, mid, sex, phe, genos = line.split(None, 6)
|
---|
351 | sid = iid.split('.')[0]
|
---|
352 | d_sid = did.split('.')[0]
|
---|
353 | m_sid = mid.split('.')[0]
|
---|
354 |
|
---|
355 | skey = (fid, iid)
|
---|
356 | self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid)
|
---|
357 | self._ordered_subjects.append(skey)
|
---|
358 |
|
---|
359 | genotypes = genos.split()
|
---|
360 |
|
---|
361 | for m, marker in enumerate(markers):
|
---|
362 | idx = m*2
|
---|
363 | a1, a2 = genotypes[idx:idx+2] # Alleles for subject s, marker m
|
---|
364 | s1, s2 = seen = _marker_alleles[m] # Alleles seen for marker m
|
---|
365 |
|
---|
366 | ### FIXME: I think this can still be faster, and simpler to read
|
---|
367 | # Two pieces of logic intertwined here: first, we need to code
|
---|
368 | # this genotype as HOM0, HOM1, HET or MISS. Second, we need to
|
---|
369 | # keep an ongoing record of the genotypes seen for this marker
|
---|
370 | if a1 == a2:
|
---|
371 | if a1 in MISSING_ALLELES:
|
---|
372 | geno = MISS_GENO
|
---|
373 | else:
|
---|
374 | if s1 == '0':
|
---|
375 | seen[0] = a1
|
---|
376 | elif s1 == a1 or s2 == a2:
|
---|
377 | pass
|
---|
378 | elif s2 == '0':
|
---|
379 | seen[1] = a1
|
---|
380 | else:
|
---|
381 | raise ThirdAllele('a1=a2=%s, seen=%s?' % (a1, str(seen)))
|
---|
382 |
|
---|
383 | if a1 == seen[0]:
|
---|
384 | geno = HOM0_GENO
|
---|
385 | elif a1 == seen[1]:
|
---|
386 | geno = HOM1_GENO
|
---|
387 | else:
|
---|
388 | raise PedError('Cannot assign geno for a1=a2=%s from seen=%s' % (a1, str(seen)))
|
---|
389 | elif a1 in MISSING_ALLELES or a2 in MISSING_ALLELES:
|
---|
390 | geno = MISS_GENO
|
---|
391 | else:
|
---|
392 | geno = HET_GENO
|
---|
393 | if s1 == '0':
|
---|
394 | seen[0] = a1
|
---|
395 | seen[1] = a2
|
---|
396 | elif s2 == '0':
|
---|
397 | if s1 == a1:
|
---|
398 | seen[1] = a2
|
---|
399 | elif s1 == a2:
|
---|
400 | seen[1] = a1
|
---|
401 | else:
|
---|
402 | raise ThirdAllele('a1=%s, a2=%s, seen=%s?' % (a1, a2, str(seen)))
|
---|
403 | else:
|
---|
404 | if sorted(seen) != sorted((a1, a2)):
|
---|
405 | raise ThirdAllele('a1=%s, a2=%s, seen=%s?' % (a1, a2, str(seen)))
|
---|
406 |
|
---|
407 | gcode = GENO_TO_GCODE.get(geno, None)
|
---|
408 | if gcode is None:
|
---|
409 | raise UnknownGeno(str(geno))
|
---|
410 | self._genotypes[m][s] = gcode
|
---|
411 |
|
---|
412 | # Build the _marker_allele_lookup table
|
---|
413 | for m, alleles in enumerate(_marker_alleles):
|
---|
414 | if len(alleles) == 2:
|
---|
415 | a1, a2 = alleles
|
---|
416 | elif len(alleles) == 1:
|
---|
417 | a1 = alleles[0]
|
---|
418 | a2 = '0'
|
---|
419 | else:
|
---|
420 | print 'All alleles blank for %s: %s' % (m, str(alleles))
|
---|
421 | raise
|
---|
422 |
|
---|
423 | self._marker_allele_lookup[m] = {
|
---|
424 | HOM0: (a2, a2),
|
---|
425 | HOM1: (a1, a1),
|
---|
426 | HET : (a1, a2),
|
---|
427 | MISS: ('0','0'),
|
---|
428 | }
|
---|
429 |
|
---|
430 | if VERBOSE: print '%s(%s) individuals read from [ %s ]' % (len(self._subjects), nsubjects, self.path)
|
---|
431 |
|
---|
432 | class Map:
|
---|
433 | def __init__(self, path=None):
|
---|
434 | self.path = path
|
---|
435 | self._markers = {}
|
---|
436 | self._ordered_markers = []
|
---|
437 | self._autosomal_indices = set()
|
---|
438 |
|
---|
439 | def __len__(self):
|
---|
440 | return len(self._markers)
|
---|
441 |
|
---|
442 | def parse(self):
|
---|
443 | """ Parse a Linkage-format map file
|
---|
444 | """
|
---|
445 | if self.path.endswith('.gz'):
|
---|
446 | fh = gzip.open(self.path, 'r')
|
---|
447 | else:
|
---|
448 | fh = open(self.path, 'r')
|
---|
449 |
|
---|
450 | for i, line in enumerate(fh):
|
---|
451 | line = line.strip()
|
---|
452 | if not line:
|
---|
453 | continue
|
---|
454 |
|
---|
455 | fields = line.split()
|
---|
456 | if len(fields) < 4:
|
---|
457 | raise MapLineTooShort(MAP_LINE_EXCEPTION_TEXT % (str(line), len(fields)))
|
---|
458 | else:
|
---|
459 | chrom, name, genpos, abspos = fields
|
---|
460 | if name in self._markers:
|
---|
461 | raise DuplicateMarkerInMapFile('Marker %s was found twice in map file %s' % (name, self.path))
|
---|
462 | abspos = int(abspos)
|
---|
463 | if abspos < 0:
|
---|
464 | continue
|
---|
465 | if chrom in AUTOSOMES:
|
---|
466 | self._autosomal_indices.add(i)
|
---|
467 | chrom = CHROM_REPLACE.get(chrom, chrom)
|
---|
468 | self._markers[name] = [chrom, name, genpos, abspos, None, None]
|
---|
469 | self._ordered_markers.append(name)
|
---|
470 | fh.close()
|
---|
471 | if VERBOSE: print '%s (of %s) markers to be included from [ %s ]' % (len(self._ordered_markers), i, self.path)
|
---|
472 |
|
---|
473 | class BPed:
|
---|
474 | """ The uber-class for processing Plink's Binary Ped file format *.bed/*.bim/*.fam
|
---|
475 | """
|
---|
476 | def __init__(self, base):
|
---|
477 | self.base = base
|
---|
478 | self._bed = Bed('%s.bed' % (self.base))
|
---|
479 | self._bim = Bim('%s.bim' % (self.base))
|
---|
480 | self._fam = Fam('%s.fam' % (self.base))
|
---|
481 |
|
---|
482 | self._markers = {}
|
---|
483 | self._ordered_markers = []
|
---|
484 | self._marker_allele_lookup = {}
|
---|
485 | self._autosomal_indices = set()
|
---|
486 |
|
---|
487 | self._subjects = {}
|
---|
488 | self._ordered_subjects = []
|
---|
489 |
|
---|
490 | self._genotypes = []
|
---|
491 |
|
---|
492 | def parse(self, quick=False):
|
---|
493 | """
|
---|
494 | """
|
---|
495 | self._quick = quick
|
---|
496 |
|
---|
497 | self._bim.parse()
|
---|
498 | self._markers = self._bim._markers
|
---|
499 | self._ordered_markers = self._bim._ordered_markers
|
---|
500 | self._marker_allele_lookup = self._bim._marker_allele_lookup
|
---|
501 | self._autosomal_indices = self._bim._autosomal_indices
|
---|
502 |
|
---|
503 | self._fam.parse()
|
---|
504 | self._subjects = self._fam._subjects
|
---|
505 | self._ordered_subjects = self._fam._ordered_subjects
|
---|
506 |
|
---|
507 | self._bed.parse(self._ordered_subjects, self._ordered_markers, quick=quick)
|
---|
508 | self._bedf = self._bed._fh
|
---|
509 | self._genotypes = self._bed._genotypes
|
---|
510 | self.nsubjects = len(self._ordered_subjects)
|
---|
511 | self.nmarkers = len(self._ordered_markers)
|
---|
512 | self._bytes_per_marker = nbytes(self.nsubjects)
|
---|
513 |
|
---|
514 | def writeped(self, path=None):
|
---|
515 | """
|
---|
516 | """
|
---|
517 | path = self.path = path or self.path
|
---|
518 |
|
---|
519 | map_name = self.path.replace('.bed', '.map')
|
---|
520 | print 'Writing map file [ %s ]' % (map_name)
|
---|
521 | dst = open(map_name, 'w')
|
---|
522 | for m in self._ordered_markers:
|
---|
523 | chrom, snp, genpos, abspos, a1, a2 = self._markers[m]
|
---|
524 | dst.write('%s\t%s\t%s\t%s\n' % (chrom, snp, genpos, abspos))
|
---|
525 | dst.close()
|
---|
526 |
|
---|
527 | ped_name = self.path.replace('.bed', '.ped')
|
---|
528 | print 'Writing ped file [ %s ]' % (ped_name)
|
---|
529 | ped = open(ped_name, 'w')
|
---|
530 | firstyikes = False
|
---|
531 | for s, skey in enumerate(self._ordered_subjects):
|
---|
532 | idx = s*2
|
---|
533 | (fid, iid, did, mid, sex, phe, oiid, odid, omid) = self._subjects[skey]
|
---|
534 | ped.write('%s %s %s %s %s %s' % (fid, iid, odid, omid, sex, phe))
|
---|
535 | genotypes_for_subject = self.getGenotypesForSubject(s)
|
---|
536 | for m, snp in enumerate(self._ordered_markers):
|
---|
537 | #a1, a2 = self.getGenotypeByIndices(s, m)
|
---|
538 | a1,a2 = genotypes_for_subject[m]
|
---|
539 | ped.write(' %s %s' % (a1, a2))
|
---|
540 | ped.write('\n')
|
---|
541 | ped.close()
|
---|
542 |
|
---|
543 | def getGenotype(self, subject, marker):
|
---|
544 | """ Retrieve a genotype for a particular subject/marker pair
|
---|
545 | """
|
---|
546 | m = self._ordered_markers.index(marker)
|
---|
547 | s = self._ordered_subjects.index(subject)
|
---|
548 | return self.getGenotypeByIndices(s, m)
|
---|
549 |
|
---|
550 | def getGenotypesForSubject(self, s, raw=False):
|
---|
551 | """ Returns list of genotypes for all m markers
|
---|
552 | for subject s. If raw==True, then an array
|
---|
553 | of raw integer gcodes is returned instead
|
---|
554 | """
|
---|
555 | if self._quick:
|
---|
556 | nmarkers = len(self._markers)
|
---|
557 | raw_array = array('i', [0]*nmarkers)
|
---|
558 | seek_nibble = s % 4
|
---|
559 | for m in xrange(nmarkers):
|
---|
560 | seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH
|
---|
561 | self._bedf.seek(seek_byte)
|
---|
562 | geno = struct.unpack('B', self._bedf.read(1))[0]
|
---|
563 | quartet = INT_TO_GCODE[geno]
|
---|
564 | gcode = quartet[seek_nibble]
|
---|
565 | raw_array[m] = gcode
|
---|
566 | else:
|
---|
567 | raw_array = array('i', [row[s] for row in self._genotypes])
|
---|
568 |
|
---|
569 | if raw:
|
---|
570 | return raw_array
|
---|
571 | else:
|
---|
572 | result = []
|
---|
573 | for m, gcode in enumerate(raw_array):
|
---|
574 | result.append(self._marker_allele_lookup[m][gcode])
|
---|
575 | return result
|
---|
576 |
|
---|
577 | def getGenotypeByIndices(self, s, m):
|
---|
578 | """
|
---|
579 | """
|
---|
580 | if self._quick:
|
---|
581 | # Determine which byte we need to seek to, and
|
---|
582 | # which nibble within the byte we need
|
---|
583 | seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH
|
---|
584 | seek_nibble = s % 4
|
---|
585 | self._bedf.seek(seek_byte)
|
---|
586 | geno = struct.unpack('B', self._bedf.read(1))[0]
|
---|
587 | quartet = INT_TO_GCODE[geno]
|
---|
588 | gcode = quartet[seek_nibble]
|
---|
589 | else:
|
---|
590 | # Otherwise, just grab the genotypes from the
|
---|
591 | # list of arrays
|
---|
592 | genos_for_marker = self._genotypes[m]
|
---|
593 | gcode = genos_for_marker[s]
|
---|
594 |
|
---|
595 | return self._marker_allele_lookup[m][gcode]
|
---|
596 |
|
---|
597 | def getGenotypesByIndices(self, s, mlist, format):
|
---|
598 | """
|
---|
599 | """
|
---|
600 | if self._quick:
|
---|
601 | raw_array = array('i', [0]*len(mlist))
|
---|
602 | seek_nibble = s % 4
|
---|
603 | for i,m in enumerate(mlist):
|
---|
604 | seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH
|
---|
605 | self._bedf.seek(seek_byte)
|
---|
606 | geno = struct.unpack('B', self._bedf.read(1))[0]
|
---|
607 | quartet = INT_TO_GCODE[geno]
|
---|
608 | gcode = quartet[seek_nibble]
|
---|
609 | raw_array[i] = gcode
|
---|
610 | mlist = set(mlist)
|
---|
611 | else:
|
---|
612 | mlist = set(mlist)
|
---|
613 | raw_array = array('i', [row[s] for m,row in enumerate(self._genotypes) if m in mlist])
|
---|
614 |
|
---|
615 | if format == 'raw':
|
---|
616 | return raw_array
|
---|
617 | elif format == 'ref':
|
---|
618 | result = array('i', [0]*len(mlist))
|
---|
619 | for m, gcode in enumerate(raw_array):
|
---|
620 | if gcode == HOM0:
|
---|
621 | nref = 3
|
---|
622 | elif gcode == HET:
|
---|
623 | nref = 2
|
---|
624 | elif gcode == HOM1:
|
---|
625 | nref = 1
|
---|
626 | else:
|
---|
627 | nref = 0
|
---|
628 | result[m] = nref
|
---|
629 | return result
|
---|
630 | else:
|
---|
631 | result = []
|
---|
632 | for m, gcode in enumerate(raw_array):
|
---|
633 | result.append(self._marker_allele_lookup[m][gcode])
|
---|
634 | return result
|
---|
635 |
|
---|
636 | def getSubject(self, s):
|
---|
637 | """
|
---|
638 | """
|
---|
639 | skey = self._ordered_subjects[s]
|
---|
640 | return self._subjects[skey]
|
---|
641 |
|
---|
642 | def autosomal_indices(self):
|
---|
643 | """ Return the indices of markers in this ped/map that are autosomal.
|
---|
644 | This is used by rgGRR so that it can select a random set of markers
|
---|
645 | from the autosomes (sex chroms screw up the plot)
|
---|
646 | """
|
---|
647 | return self._autosomal_indices
|
---|
648 |
|
---|
649 | class Bed:
|
---|
650 |
|
---|
651 | def __init__(self, path):
|
---|
652 | self.path = path
|
---|
653 | self._genotypes = []
|
---|
654 | self._fh = None
|
---|
655 |
|
---|
656 | def parse(self, subjects, markers, quick=False):
|
---|
657 | """ Parse the bed file, indicated either by the path parameter,
|
---|
658 | or as the self.path indicated in __init__. If quick is
|
---|
659 | True, then just parse the bim and fam, then genotypes will
|
---|
660 | be looked up dynamically by indices
|
---|
661 | """
|
---|
662 | self._quick = quick
|
---|
663 |
|
---|
664 | ordered_markers = markers
|
---|
665 | ordered_subjects = subjects
|
---|
666 | nsubjects = len(ordered_subjects)
|
---|
667 | nmarkers = len(ordered_markers)
|
---|
668 |
|
---|
669 | bed = open(self.path, 'rb')
|
---|
670 | self._fh = bed
|
---|
671 |
|
---|
672 | byte1 = bed.read(1)
|
---|
673 | byte2 = bed.read(1)
|
---|
674 | byte3 = bed.read(1)
|
---|
675 | format_flag = struct.unpack('B', byte3)[0]
|
---|
676 |
|
---|
677 | h1 = tuple(INT_TO_GCODE[struct.unpack('B', byte1)[0]])
|
---|
678 | h2 = tuple(INT_TO_GCODE[struct.unpack('B', byte2)[0]])
|
---|
679 | h3 = tuple(INT_TO_GCODE[format_flag])
|
---|
680 |
|
---|
681 | if h1 != MAGIC1 or h2 != MAGIC2:
|
---|
682 | raise BadMagic('One or both MAGIC bytes is wrong: %s==%s or %s==%s' % (h1, MAGIC1, h2, MAGIC2))
|
---|
683 | if format_flag:
|
---|
684 | print 'Detected that binary PED file is v1.00 SNP-major mode (%s, "%s")\n' % (format_flag, h3)
|
---|
685 | else:
|
---|
686 | raise 'BAD_FORMAT_FLAG? (%s, "%s")\n' % (format_flag, h3)
|
---|
687 |
|
---|
688 | print 'Parsing binary ped file for %s markers and %s subjects' % (nmarkers, nsubjects)
|
---|
689 |
|
---|
690 | ### If quick mode was specified, we're done ...
|
---|
691 | self._quick = quick
|
---|
692 | if quick:
|
---|
693 | return
|
---|
694 |
|
---|
695 | ### ... Otherwise, parse genotypes into an array, and append that
|
---|
696 | ### array to self._genotypes
|
---|
697 | ngcodes = ceiling(nsubjects, 4)
|
---|
698 | bytes_per_marker = nbytes(nsubjects)
|
---|
699 | for m in xrange(nmarkers):
|
---|
700 | genotype_array = array('i', [-1]*(ngcodes))
|
---|
701 | for byte in xrange(bytes_per_marker):
|
---|
702 | intval = struct.unpack('B', bed.read(1))[0]
|
---|
703 | idx = byte*4
|
---|
704 | genotype_array[idx:idx+4] = INT_TO_GCODE[intval]
|
---|
705 | self._genotypes.append(genotype_array)
|
---|
706 |
|
---|
707 | class Bim:
|
---|
708 | def __init__(self, path):
|
---|
709 | """
|
---|
710 | """
|
---|
711 | self.path = path
|
---|
712 | self._markers = {}
|
---|
713 | self._ordered_markers = []
|
---|
714 | self._marker_allele_lookup = {}
|
---|
715 | self._autosomal_indices = set()
|
---|
716 |
|
---|
717 | def parse(self):
|
---|
718 | """
|
---|
719 | """
|
---|
720 | print 'Reading map (extended format) from [ %s ]' % (self.path)
|
---|
721 | bim = open(self.path, 'r')
|
---|
722 | for m, line in enumerate(bim):
|
---|
723 | chrom, snp, gpos, apos, a1, a2 = line.strip().split()
|
---|
724 | self._markers[snp] = (chrom, snp, gpos, apos, a1, a2)
|
---|
725 | self._marker_allele_lookup[m] = {
|
---|
726 | HOM0: (a2, a2),
|
---|
727 | HOM1: (a1, a1),
|
---|
728 | HET : (a1, a2),
|
---|
729 | MISS: ('0','0'),
|
---|
730 | }
|
---|
731 | self._ordered_markers.append(snp)
|
---|
732 | if chrom in AUTOSOMES:
|
---|
733 | self._autosomal_indices.add(m)
|
---|
734 | bim.close()
|
---|
735 | print '%s markers to be included from [ %s ]' % (m+1, self.path)
|
---|
736 |
|
---|
737 | class Fam:
|
---|
738 | def __init__(self, path):
|
---|
739 | """
|
---|
740 | """
|
---|
741 | self.path = path
|
---|
742 | self._subjects = {}
|
---|
743 | self._ordered_subjects = []
|
---|
744 |
|
---|
745 | def parse(self):
|
---|
746 | """
|
---|
747 | """
|
---|
748 | print 'Reading pedigree information from [ %s ]' % (self.path)
|
---|
749 | fam = open(self.path, 'r')
|
---|
750 | for s, line in enumerate(fam):
|
---|
751 | fid, iid, did, mid, sex, phe = line.strip().split()
|
---|
752 | sid = iid.split('.')[0]
|
---|
753 | d_sid = did.split('.')[0]
|
---|
754 | m_sid = mid.split('.')[0]
|
---|
755 | skey = (fid, iid)
|
---|
756 | self._ordered_subjects.append(skey)
|
---|
757 | self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid)
|
---|
758 | fam.close()
|
---|
759 | print '%s individuals read from [ %s ]' % (s+1, self.path)
|
---|
760 |
|
---|
761 | ### Command-line functionality and testing
|
---|
762 | def test(arg):
|
---|
763 | '''
|
---|
764 | '''
|
---|
765 |
|
---|
766 | import time
|
---|
767 |
|
---|
768 | if arg == 'CAMP_AFFY.ped':
|
---|
769 | print 'Testing bed.parse(quick=True)'
|
---|
770 | s = time.time()
|
---|
771 | bed = Bed(arg.replace('.ped', '.bed'))
|
---|
772 | bed.parse(quick=True)
|
---|
773 | print bed.getGenotype(('400118', '10300283'), 'rs2000467')
|
---|
774 | print bed.getGenotype(('400118', '10101384'), 'rs2294019')
|
---|
775 | print bed.getGenotype(('400121', '10101149'), 'rs2294019')
|
---|
776 | print bed.getGenotype(('400123', '10200290'), 'rs2294019')
|
---|
777 | assert bed.getGenotype(('400118', '10101384'), 'rs2294019') == ('4','4')
|
---|
778 | e = time.time()
|
---|
779 | print 'e-s = %s\n' % (e-s)
|
---|
780 |
|
---|
781 | print 'Testing bed.parse'
|
---|
782 | s = time.time()
|
---|
783 | bed = BPed(arg)
|
---|
784 | bed.parse(quick=False)
|
---|
785 | e = time.time()
|
---|
786 | print 'e-s = %s\n' % (e-s)
|
---|
787 |
|
---|
788 | print 'Testing bed.writeped'
|
---|
789 | s = time.time()
|
---|
790 | outname = '%s_BEDTEST' % (arg)
|
---|
791 | bed.writeped(outname)
|
---|
792 | e = time.time()
|
---|
793 | print 'e-s = %s\n' % (e-s)
|
---|
794 | del(bed)
|
---|
795 |
|
---|
796 | print 'Testing ped.parse'
|
---|
797 | s = time.time()
|
---|
798 | ped = LPed(arg)
|
---|
799 | ped.parse()
|
---|
800 | e = time.time()
|
---|
801 | print 'e-s = %s\n' % (e-s)
|
---|
802 |
|
---|
803 | print 'Testing ped.writebed'
|
---|
804 | s = time.time()
|
---|
805 | outname = '%s_PEDTEST' % (arg)
|
---|
806 | ped.writebed(outname)
|
---|
807 | e = time.time()
|
---|
808 | print 'e-s = %s\n' % (e-s)
|
---|
809 | del(ped)
|
---|
810 |
|
---|
811 | def profile_bed(arg):
|
---|
812 | """
|
---|
813 | """
|
---|
814 | bed = BPed(arg)
|
---|
815 | bed.parse(quick=False)
|
---|
816 | outname = '%s_BEDPROFILE' % (arg)
|
---|
817 | bed.writeped(outname)
|
---|
818 |
|
---|
819 | def profile_ped(arg):
|
---|
820 | """
|
---|
821 | """
|
---|
822 | ped = LPed(arg)
|
---|
823 | ped.parse()
|
---|
824 | outname = '%s_PEDPROFILE' % (arg)
|
---|
825 | ped.writebed(outname)
|
---|
826 |
|
---|
827 | if __name__ == '__main__':
|
---|
828 | """ Run as a command-line, this script should get one or more arguments,
|
---|
829 | each one a ped file to be parsed with the PedParser (unit tests?)
|
---|
830 | """
|
---|
831 | op = optparse.OptionParser()
|
---|
832 | op.add_option('--profile-bed', action='store_true', default=False)
|
---|
833 | op.add_option('--profile-ped', action='store_true', default=False)
|
---|
834 | opts, args = op.parse_args()
|
---|
835 |
|
---|
836 | if opts.profile_bed:
|
---|
837 | import profile
|
---|
838 | import pstats
|
---|
839 | profile.run('profile_bed(args[0])', 'fooprof')
|
---|
840 | p = pstats.Stats('fooprof')
|
---|
841 | p.sort_stats('cumulative').print_stats(10)
|
---|
842 | elif opts.profile_ped:
|
---|
843 | import profile
|
---|
844 | import pstats
|
---|
845 | profile.run('profile_ped(args[0])', 'fooprof')
|
---|
846 | p = pstats.Stats('fooprof')
|
---|
847 | p.sort_stats('cumulative').print_stats(10)
|
---|
848 | else:
|
---|
849 | for arg in args:
|
---|
850 | test(arg)
|
---|
851 |
|
---|
852 | ### Code used to generate the INT_TO_GCODE dictionary
|
---|
853 | #print '{\n ',
|
---|
854 | #for i in range(256):
|
---|
855 | # b = INT2BIN[i]
|
---|
856 | # ints = []
|
---|
857 | # s = str(i).rjust(3)
|
---|
858 | # #print b
|
---|
859 | # for j in range(4):
|
---|
860 | # idx = j*2
|
---|
861 | # #print i, j, idx, b[idx:idx+2], int(b[idx:idx+2], 2)
|
---|
862 | # ints.append(int(b[idx:idx+2], 2))
|
---|
863 | # print '%s: array(\'i\', %s),' % (s,tuple(ints)),
|
---|
864 | # if i > 0 and (i+1) % 4 == 0:
|
---|
865 | # print '\n ',
|
---|
866 | #print '}'
|
---|
867 |
|
---|
868 |
|
---|