root/galaxy-central/tools/rgenetics/plinkbinJZ.py

リビジョン 2, 35.6 KB (コミッタ: hatakeyama, 14 年 前)

import galaxy-central

行番号 
1#!/usr/bin/env python2.4
2"""
3"""
4
5import optparse,os,subprocess,gzip,struct,time,commands
6from array import array
7
8#from AIMS import util
9#from pga import util as pgautil
10
11__FILE_ID__ = '$Id: plinkbinJZ.py,v 1.14 2009/07/13 20:16:50 rejpz Exp $'
12
13VERBOSE = True
14
15MISSING_ALLELES = set(['N', '0', '.', '-',''])
16
17AUTOSOMES = set(range(1, 23) + [str(c) for c in range(1, 23)])
18
19MAGIC_BYTE1 = '00110110'
20MAGIC_BYTE2 = '11011000'
21FORMAT_SNP_MAJOR_BYTE = '10000000'
22FORMAT_IND_MAJOR_BYTE = '00000000'
23MAGIC1 = (0, 3, 1, 2)
24MAGIC2 = (3, 1, 2, 0)
25FORMAT_SNP_MAJOR = (2, 0, 0, 0)
26FORMAT_IND_MAJOR = (0, 0, 0, 0)
27HEADER_LENGTH = 3
28
29HOM0 = 3
30HOM1 = 0
31MISS = 2
32HET  = 1
33HOM0_GENO = (0, 0)
34HOM1_GENO = (1, 1)
35HET_GENO = (0, 1)
36MISS_GENO = (-9, -9)
37
38GENO_TO_GCODE = {
39    HOM0_GENO: HOM0,
40    HET_GENO: HET,
41    HOM1_GENO: HOM1,
42    MISS_GENO: MISS,
43    }
44
45CHROM_REPLACE = {
46    'X': '23',
47    'Y': '24',
48    'XY': '25',
49    'MT': '26',
50    'M': '26',
51}
52
53MAP_LINE_EXCEPTION_TEXT = """
54One or more lines in the *.map file has only three fields.
55The line was:
56
57%s
58
59If you are running rgGRR through EPMP, this is usually a
60sign that you are using an old version of the map file.
61You can correct the problem by re-running Subject QC.  If
62you have already tried this, please contact the developers,
63or file a bug.
64"""
65
66INT_TO_GCODE = {
67     0: array('i', (0, 0, 0, 0)),   1: array('i', (2, 0, 0, 0)),   2: array('i', (1, 0, 0, 0)),   3: array('i', (3, 0, 0, 0)),
68     4: array('i', (0, 2, 0, 0)),   5: array('i', (2, 2, 0, 0)),   6: array('i', (1, 2, 0, 0)),   7: array('i', (3, 2, 0, 0)),
69     8: array('i', (0, 1, 0, 0)),   9: array('i', (2, 1, 0, 0)),  10: array('i', (1, 1, 0, 0)),  11: array('i', (3, 1, 0, 0)),
70    12: array('i', (0, 3, 0, 0)),  13: array('i', (2, 3, 0, 0)),  14: array('i', (1, 3, 0, 0)),  15: array('i', (3, 3, 0, 0)),
71    16: array('i', (0, 0, 2, 0)),  17: array('i', (2, 0, 2, 0)),  18: array('i', (1, 0, 2, 0)),  19: array('i', (3, 0, 2, 0)),
72    20: array('i', (0, 2, 2, 0)),  21: array('i', (2, 2, 2, 0)),  22: array('i', (1, 2, 2, 0)),  23: array('i', (3, 2, 2, 0)),
73    24: array('i', (0, 1, 2, 0)),  25: array('i', (2, 1, 2, 0)),  26: array('i', (1, 1, 2, 0)),  27: array('i', (3, 1, 2, 0)),
74    28: array('i', (0, 3, 2, 0)),  29: array('i', (2, 3, 2, 0)),  30: array('i', (1, 3, 2, 0)),  31: array('i', (3, 3, 2, 0)),
75    32: array('i', (0, 0, 1, 0)),  33: array('i', (2, 0, 1, 0)),  34: array('i', (1, 0, 1, 0)),  35: array('i', (3, 0, 1, 0)),
76    36: array('i', (0, 2, 1, 0)),  37: array('i', (2, 2, 1, 0)),  38: array('i', (1, 2, 1, 0)),  39: array('i', (3, 2, 1, 0)),
77    40: array('i', (0, 1, 1, 0)),  41: array('i', (2, 1, 1, 0)),  42: array('i', (1, 1, 1, 0)),  43: array('i', (3, 1, 1, 0)),
78    44: array('i', (0, 3, 1, 0)),  45: array('i', (2, 3, 1, 0)),  46: array('i', (1, 3, 1, 0)),  47: array('i', (3, 3, 1, 0)),
79    48: array('i', (0, 0, 3, 0)),  49: array('i', (2, 0, 3, 0)),  50: array('i', (1, 0, 3, 0)),  51: array('i', (3, 0, 3, 0)),
80    52: array('i', (0, 2, 3, 0)),  53: array('i', (2, 2, 3, 0)),  54: array('i', (1, 2, 3, 0)),  55: array('i', (3, 2, 3, 0)),
81    56: array('i', (0, 1, 3, 0)),  57: array('i', (2, 1, 3, 0)),  58: array('i', (1, 1, 3, 0)),  59: array('i', (3, 1, 3, 0)),
82    60: array('i', (0, 3, 3, 0)),  61: array('i', (2, 3, 3, 0)),  62: array('i', (1, 3, 3, 0)),  63: array('i', (3, 3, 3, 0)),
83    64: array('i', (0, 0, 0, 2)),  65: array('i', (2, 0, 0, 2)),  66: array('i', (1, 0, 0, 2)),  67: array('i', (3, 0, 0, 2)),
84    68: array('i', (0, 2, 0, 2)),  69: array('i', (2, 2, 0, 2)),  70: array('i', (1, 2, 0, 2)),  71: array('i', (3, 2, 0, 2)),
85    72: array('i', (0, 1, 0, 2)),  73: array('i', (2, 1, 0, 2)),  74: array('i', (1, 1, 0, 2)),  75: array('i', (3, 1, 0, 2)),
86    76: array('i', (0, 3, 0, 2)),  77: array('i', (2, 3, 0, 2)),  78: array('i', (1, 3, 0, 2)),  79: array('i', (3, 3, 0, 2)),
87    80: array('i', (0, 0, 2, 2)),  81: array('i', (2, 0, 2, 2)),  82: array('i', (1, 0, 2, 2)),  83: array('i', (3, 0, 2, 2)),
88    84: array('i', (0, 2, 2, 2)),  85: array('i', (2, 2, 2, 2)),  86: array('i', (1, 2, 2, 2)),  87: array('i', (3, 2, 2, 2)),
89    88: array('i', (0, 1, 2, 2)),  89: array('i', (2, 1, 2, 2)),  90: array('i', (1, 1, 2, 2)),  91: array('i', (3, 1, 2, 2)),
90    92: array('i', (0, 3, 2, 2)),  93: array('i', (2, 3, 2, 2)),  94: array('i', (1, 3, 2, 2)),  95: array('i', (3, 3, 2, 2)),
91    96: array('i', (0, 0, 1, 2)),  97: array('i', (2, 0, 1, 2)),  98: array('i', (1, 0, 1, 2)),  99: array('i', (3, 0, 1, 2)),
92   100: array('i', (0, 2, 1, 2)), 101: array('i', (2, 2, 1, 2)), 102: array('i', (1, 2, 1, 2)), 103: array('i', (3, 2, 1, 2)),
93   104: array('i', (0, 1, 1, 2)), 105: array('i', (2, 1, 1, 2)), 106: array('i', (1, 1, 1, 2)), 107: array('i', (3, 1, 1, 2)),
94   108: array('i', (0, 3, 1, 2)), 109: array('i', (2, 3, 1, 2)), 110: array('i', (1, 3, 1, 2)), 111: array('i', (3, 3, 1, 2)),
95   112: array('i', (0, 0, 3, 2)), 113: array('i', (2, 0, 3, 2)), 114: array('i', (1, 0, 3, 2)), 115: array('i', (3, 0, 3, 2)),
96   116: array('i', (0, 2, 3, 2)), 117: array('i', (2, 2, 3, 2)), 118: array('i', (1, 2, 3, 2)), 119: array('i', (3, 2, 3, 2)),
97   120: array('i', (0, 1, 3, 2)), 121: array('i', (2, 1, 3, 2)), 122: array('i', (1, 1, 3, 2)), 123: array('i', (3, 1, 3, 2)),
98   124: array('i', (0, 3, 3, 2)), 125: array('i', (2, 3, 3, 2)), 126: array('i', (1, 3, 3, 2)), 127: array('i', (3, 3, 3, 2)),
99   128: array('i', (0, 0, 0, 1)), 129: array('i', (2, 0, 0, 1)), 130: array('i', (1, 0, 0, 1)), 131: array('i', (3, 0, 0, 1)),
100   132: array('i', (0, 2, 0, 1)), 133: array('i', (2, 2, 0, 1)), 134: array('i', (1, 2, 0, 1)), 135: array('i', (3, 2, 0, 1)),
101   136: array('i', (0, 1, 0, 1)), 137: array('i', (2, 1, 0, 1)), 138: array('i', (1, 1, 0, 1)), 139: array('i', (3, 1, 0, 1)),
102   140: array('i', (0, 3, 0, 1)), 141: array('i', (2, 3, 0, 1)), 142: array('i', (1, 3, 0, 1)), 143: array('i', (3, 3, 0, 1)),
103   144: array('i', (0, 0, 2, 1)), 145: array('i', (2, 0, 2, 1)), 146: array('i', (1, 0, 2, 1)), 147: array('i', (3, 0, 2, 1)),
104   148: array('i', (0, 2, 2, 1)), 149: array('i', (2, 2, 2, 1)), 150: array('i', (1, 2, 2, 1)), 151: array('i', (3, 2, 2, 1)),
105   152: array('i', (0, 1, 2, 1)), 153: array('i', (2, 1, 2, 1)), 154: array('i', (1, 1, 2, 1)), 155: array('i', (3, 1, 2, 1)),
106   156: array('i', (0, 3, 2, 1)), 157: array('i', (2, 3, 2, 1)), 158: array('i', (1, 3, 2, 1)), 159: array('i', (3, 3, 2, 1)),
107   160: array('i', (0, 0, 1, 1)), 161: array('i', (2, 0, 1, 1)), 162: array('i', (1, 0, 1, 1)), 163: array('i', (3, 0, 1, 1)),
108   164: array('i', (0, 2, 1, 1)), 165: array('i', (2, 2, 1, 1)), 166: array('i', (1, 2, 1, 1)), 167: array('i', (3, 2, 1, 1)),
109   168: array('i', (0, 1, 1, 1)), 169: array('i', (2, 1, 1, 1)), 170: array('i', (1, 1, 1, 1)), 171: array('i', (3, 1, 1, 1)),
110   172: array('i', (0, 3, 1, 1)), 173: array('i', (2, 3, 1, 1)), 174: array('i', (1, 3, 1, 1)), 175: array('i', (3, 3, 1, 1)),
111   176: array('i', (0, 0, 3, 1)), 177: array('i', (2, 0, 3, 1)), 178: array('i', (1, 0, 3, 1)), 179: array('i', (3, 0, 3, 1)),
112   180: array('i', (0, 2, 3, 1)), 181: array('i', (2, 2, 3, 1)), 182: array('i', (1, 2, 3, 1)), 183: array('i', (3, 2, 3, 1)),
113   184: array('i', (0, 1, 3, 1)), 185: array('i', (2, 1, 3, 1)), 186: array('i', (1, 1, 3, 1)), 187: array('i', (3, 1, 3, 1)),
114   188: array('i', (0, 3, 3, 1)), 189: array('i', (2, 3, 3, 1)), 190: array('i', (1, 3, 3, 1)), 191: array('i', (3, 3, 3, 1)),
115   192: array('i', (0, 0, 0, 3)), 193: array('i', (2, 0, 0, 3)), 194: array('i', (1, 0, 0, 3)), 195: array('i', (3, 0, 0, 3)),
116   196: array('i', (0, 2, 0, 3)), 197: array('i', (2, 2, 0, 3)), 198: array('i', (1, 2, 0, 3)), 199: array('i', (3, 2, 0, 3)),
117   200: array('i', (0, 1, 0, 3)), 201: array('i', (2, 1, 0, 3)), 202: array('i', (1, 1, 0, 3)), 203: array('i', (3, 1, 0, 3)),
118   204: array('i', (0, 3, 0, 3)), 205: array('i', (2, 3, 0, 3)), 206: array('i', (1, 3, 0, 3)), 207: array('i', (3, 3, 0, 3)),
119   208: array('i', (0, 0, 2, 3)), 209: array('i', (2, 0, 2, 3)), 210: array('i', (1, 0, 2, 3)), 211: array('i', (3, 0, 2, 3)),
120   212: array('i', (0, 2, 2, 3)), 213: array('i', (2, 2, 2, 3)), 214: array('i', (1, 2, 2, 3)), 215: array('i', (3, 2, 2, 3)),
121   216: array('i', (0, 1, 2, 3)), 217: array('i', (2, 1, 2, 3)), 218: array('i', (1, 1, 2, 3)), 219: array('i', (3, 1, 2, 3)),
122   220: array('i', (0, 3, 2, 3)), 221: array('i', (2, 3, 2, 3)), 222: array('i', (1, 3, 2, 3)), 223: array('i', (3, 3, 2, 3)),
123   224: array('i', (0, 0, 1, 3)), 225: array('i', (2, 0, 1, 3)), 226: array('i', (1, 0, 1, 3)), 227: array('i', (3, 0, 1, 3)),
124   228: array('i', (0, 2, 1, 3)), 229: array('i', (2, 2, 1, 3)), 230: array('i', (1, 2, 1, 3)), 231: array('i', (3, 2, 1, 3)),
125   232: array('i', (0, 1, 1, 3)), 233: array('i', (2, 1, 1, 3)), 234: array('i', (1, 1, 1, 3)), 235: array('i', (3, 1, 1, 3)),
126   236: array('i', (0, 3, 1, 3)), 237: array('i', (2, 3, 1, 3)), 238: array('i', (1, 3, 1, 3)), 239: array('i', (3, 3, 1, 3)),
127   240: array('i', (0, 0, 3, 3)), 241: array('i', (2, 0, 3, 3)), 242: array('i', (1, 0, 3, 3)), 243: array('i', (3, 0, 3, 3)),
128   244: array('i', (0, 2, 3, 3)), 245: array('i', (2, 2, 3, 3)), 246: array('i', (1, 2, 3, 3)), 247: array('i', (3, 2, 3, 3)),
129   248: array('i', (0, 1, 3, 3)), 249: array('i', (2, 1, 3, 3)), 250: array('i', (1, 1, 3, 3)), 251: array('i', (3, 1, 3, 3)),
130   252: array('i', (0, 3, 3, 3)), 253: array('i', (2, 3, 3, 3)), 254: array('i', (1, 3, 3, 3)), 255: array('i', (3, 3, 3, 3)),
131   }
132
133GCODE_TO_INT = dict([(tuple(v),k) for (k,v) in INT_TO_GCODE.items()])
134
135### Exceptions
136class DuplicateMarkerInMapFile(Exception): pass
137class MapLineTooShort(Exception): pass
138class ThirdAllele(Exception): pass
139class PedError(Exception): pass
140class BadMagic(Exception):
141    """ Raised when one of the MAGIC bytes in a bed file does not match
142    """
143    pass
144class BedError(Exception):
145    """ Raised when parsing a bed file runs into problems
146    """
147    pass
148class UnknownGenocode(Exception):
149    """ Raised when we get a 2-bit genotype that is undecipherable (is it possible?)
150    """
151    pass
152class UnknownGeno(Exception): pass
153
154### Utility functions
155
156def timenow():
157    """return current time as a string
158    """
159    return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
160
161def ceiling(n, k):
162    ''' Return the least multiple of k which is greater than n
163    '''
164    m = n % k
165    if m == 0:
166        return n
167    else:
168        return n + k - m
169
170def nbytes(n):
171    ''' Return the number of bytes required for n subjects
172    '''
173    return 2*ceiling(n, 4)/8
174
175### Primary module functionality
176class LPed:
177    """ The uber-class for processing the Linkage-format *.ped/*.map files
178    """
179    def __init__(self,  base):
180        self.base = base
181        self._ped = Ped('%s.ped' % (self.base))
182        self._map = Map('%s.map' % (self.base))
183
184        self._markers = {}
185        self._ordered_markers = []
186        self._marker_allele_lookup = {}
187        self._autosomal_indices = set()
188       
189        self._subjects = {}
190        self._ordered_subjects = []
191       
192        self._genotypes = []
193
194    def parse(self):
195        """
196        """
197        if VERBOSE: print 'plinkbinJZ: Analysis started: %s' % (timenow())
198        self._map.parse()
199        self._markers = self._map._markers
200        self._ordered_markers = self._map._ordered_markers
201        self._autosomal_indices = self._map._autosomal_indices
202       
203        self._ped.parse(self._ordered_markers)
204        self._subjects = self._ped._subjects
205        self._ordered_subjects = self._ped._ordered_subjects
206        self._genotypes = self._ped._genotypes
207        self._marker_allele_lookup = self._ped._marker_allele_lookup
208       
209        ### Adjust self._markers based on the allele information
210        ### we got from parsing the ped file
211        for m,  name in enumerate(self._ordered_markers):
212            a1,  a2 = self._marker_allele_lookup[m][HET]
213            self._markers[name][-2] = a1
214            self._markers[name][-1] = a2
215        if VERBOSE: print 'plinkbinJZ: Analysis finished: %s' % (timenow())
216
217    def getSubjectInfo(self, fid, oiid):
218        """
219        """
220        return self._subject_info[(fid, oiid)]
221
222    def getSubjectInfoByLine(self, line):
223        """
224        """
225        return self._subject_info[self._ordered_subjects[line]]
226   
227    def getGenotypesByIndices(self, s, mlist, format):
228        """ needed for grr if lped - deprecated but..
229        """
230        mlist = dict(zip(mlist,[True,]*len(mlist))) # hash quicker than 'in' ?
231        raw_array = array('i', [row[s] for m,row in enumerate(self._genotypes) if mlist.get(m,None)])           
232        if format == 'raw':
233            return raw_array
234        elif format == 'ref':
235            result = array('i', [0]*len(mlist))
236            for m, gcode in enumerate(raw_array):
237                if gcode == HOM0:
238                    nref = 3
239                elif gcode == HET:
240                    nref = 2
241                elif gcode == HOM1:
242                    nref = 1
243                else:
244                    nref = 0
245                result[m] = nref
246            return result
247        else:
248            result = []
249            for m, gcode in enumerate(raw_array):
250                result.append(self._marker_allele_lookup[m][gcode])
251            return result
252   
253    def writebed(self, base):
254        """
255        """
256        dst_name = '%s.fam' % (base)       
257        print 'Writing pedigree information to [ %s ]' % (dst_name)
258        dst = open(dst_name, 'w')
259        for skey in self._ordered_subjects:           
260            (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid) = self._subjects[skey]
261            dst.write('%s %s %s %s %s %s\n' % (fid, iid, did, mid, sex, phe))
262        dst.close()
263
264        dst_name = '%s.bim' % (base)       
265        print 'Writing map (extended format) information to [ %s ]' % (dst_name)
266        dst = open(dst_name, 'w')       
267        for m, marker in enumerate(self._ordered_markers):
268            chrom, name, genpos, abspos,  a1,  a2 = self._markers[marker]
269            dst.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (chrom, name, genpos, abspos, a1, a2))
270        dst.close()
271
272        bed_name = '%s.bed' % (base)       
273        print 'Writing genotype bitfile to [ %s ]' % (bed_name)
274        print 'Using (default) SNP-major mode'
275        bed = open(bed_name, 'w')
276
277        ### Write the 3 header bytes
278        bed.write(struct.pack('B', int(''.join(reversed(MAGIC_BYTE1)), 2)))
279        bed.write(struct.pack('B', int(''.join(reversed(MAGIC_BYTE2)), 2)))
280        bed.write(struct.pack('B', int(''.join(reversed(FORMAT_SNP_MAJOR_BYTE)), 2)))
281       
282        ### Calculate how many "pad bits" we should add after the last subject
283        nsubjects = len(self._ordered_subjects)
284        nmarkers = len(self._ordered_markers)
285        total_bytes = nbytes(nsubjects)
286        nbits = nsubjects  * 2
287        pad_nibbles = ((total_bytes * 8) - nbits)/2
288        pad = array('i', [0]*pad_nibbles)
289
290        ### And now write genotypes to the file
291        for m in xrange(nmarkers):
292            geno = self._genotypes[m]
293            geno.extend(pad)
294            bytes = len(geno)/4
295            for b in range(bytes):
296                idx = b*4
297                gcode = tuple(geno[idx:idx+4])
298                try:
299                    byte = struct.pack('B', GCODE_TO_INT[gcode])
300                except KeyError:
301                    print m, b, gcode
302                    raise
303                bed.write(byte)
304        bed.close()
305       
306    def autosomal_indices(self):
307        """ Return the indices of markers in this ped/map that are autosomal.
308            This is used by rgGRR so that it can select a random set of markers
309            from the autosomes (sex chroms screw up the plot)
310        """
311        return self._autosomal_indices
312
313class Ped:
314    def __init__(self, path):
315        self.path = path
316        self._subjects = {}
317        self._ordered_subjects = []
318        self._genotypes = []
319        self._marker_allele_lookup = {}
320       
321    def lineCount(self,infile):
322        """ count the number of lines in a file - efficiently using wget
323        """
324        return int(commands.getoutput('wc -l %s' % (infile)).split()[0])   
325         
326
327    def parse(self,  markers):
328        """ Parse a given file -- this needs to be memory-efficient so that large
329            files can be parsed (~1 million markers on ~5000 subjects?).  It
330            should also be fast, if possible.
331        """
332               
333        ### Find out how many lines are in the file so we can ...
334        nsubjects = self.lineCount(self.path)
335        ### ... Pre-allocate the genotype arrays
336        nmarkers = len(markers)
337        _marker_alleles = [['0', '0'] for _ in xrange(nmarkers)]
338        self._genotypes = [array('i', [-1]*nsubjects) for _ in xrange(nmarkers)]
339
340        if self.path.endswith('.gz'):
341            pfile = gzip.open(self.path, 'r')
342        else:
343            pfile = open(self.path, 'r')
344
345        for s, line in enumerate(pfile):
346            line = line.strip()
347            if not line:
348                continue
349           
350            fid, iid, did, mid, sex, phe, genos = line.split(None, 6)
351            sid = iid.split('.')[0]
352            d_sid = did.split('.')[0]
353            m_sid = mid.split('.')[0]
354           
355            skey = (fid, iid)           
356            self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid)
357            self._ordered_subjects.append(skey)
358
359            genotypes = genos.split()
360           
361            for m, marker in enumerate(markers):
362                idx = m*2
363                a1, a2 = genotypes[idx:idx+2] # Alleles for subject s, marker m
364                s1, s2 = seen = _marker_alleles[m] # Alleles seen for marker m
365               
366                ### FIXME: I think this can still be faster, and simpler to read
367                # Two pieces of logic intertwined here:  first, we need to code
368                # this genotype as HOM0, HOM1, HET or MISS.  Second, we need to
369                # keep an ongoing record of the genotypes seen for this marker
370                if a1 == a2:
371                    if a1 in MISSING_ALLELES:
372                        geno = MISS_GENO
373                    else:
374                        if s1 == '0':
375                            seen[0] = a1
376                        elif s1 == a1 or s2 == a2:
377                            pass
378                        elif s2 == '0':
379                            seen[1] = a1
380                        else:
381                            raise ThirdAllele('a1=a2=%s, seen=%s?' % (a1, str(seen)))
382                   
383                        if a1 == seen[0]:
384                            geno = HOM0_GENO
385                        elif a1 == seen[1]:
386                            geno = HOM1_GENO
387                        else:
388                            raise PedError('Cannot assign geno for a1=a2=%s from seen=%s' % (a1, str(seen)))
389                elif a1 in MISSING_ALLELES or a2 in MISSING_ALLELES:
390                    geno = MISS_GENO
391                else:
392                    geno = HET_GENO
393                    if s1 == '0':
394                        seen[0] = a1
395                        seen[1] = a2
396                    elif s2 == '0':
397                        if s1 == a1:
398                            seen[1] = a2
399                        elif s1 == a2:
400                            seen[1] = a1
401                        else:
402                            raise ThirdAllele('a1=%s, a2=%s, seen=%s?' % (a1, a2, str(seen)))
403                    else:
404                        if sorted(seen) != sorted((a1, a2)):
405                            raise ThirdAllele('a1=%s, a2=%s, seen=%s?' % (a1, a2, str(seen)))
406                                       
407                gcode = GENO_TO_GCODE.get(geno, None)
408                if gcode is None:
409                    raise UnknownGeno(str(geno))
410                self._genotypes[m][s] = gcode
411
412        # Build the _marker_allele_lookup table
413        for m,  alleles in enumerate(_marker_alleles):               
414            if len(alleles) == 2:
415                a1,  a2 = alleles
416            elif len(alleles) == 1:
417                a1 = alleles[0]
418                a2 = '0'
419            else:
420                print 'All alleles blank for %s: %s' % (m,  str(alleles))
421                raise
422
423            self._marker_allele_lookup[m] = {
424                HOM0: (a2, a2),
425                HOM1: (a1, a1),
426                HET : (a1, a2),
427                MISS: ('0','0'),
428                }
429
430        if VERBOSE: print '%s(%s) individuals read from [ %s ]' % (len(self._subjects),  nsubjects,  self.path)
431       
432class Map:
433    def __init__(self, path=None):
434        self.path = path
435        self._markers = {}
436        self._ordered_markers = []
437        self._autosomal_indices = set()
438
439    def __len__(self):
440        return len(self._markers)
441
442    def parse(self):
443        """ Parse a Linkage-format map file
444        """
445        if self.path.endswith('.gz'):
446            fh = gzip.open(self.path, 'r')
447        else:
448            fh = open(self.path, 'r')
449           
450        for i, line in enumerate(fh):
451            line = line.strip()
452            if not line:
453                continue
454
455            fields = line.split()
456            if len(fields) < 4:
457                raise MapLineTooShort(MAP_LINE_EXCEPTION_TEXT % (str(line),  len(fields)))
458            else:
459                chrom, name, genpos, abspos = fields
460            if name in self._markers:
461                raise DuplicateMarkerInMapFile('Marker %s was found twice in map file %s' % (name, self.path))
462            abspos = int(abspos)
463            if abspos < 0:
464                continue
465            if chrom in AUTOSOMES:
466                self._autosomal_indices.add(i)
467            chrom = CHROM_REPLACE.get(chrom, chrom)
468            self._markers[name] = [chrom, name, genpos, abspos,  None,  None]
469            self._ordered_markers.append(name)
470        fh.close()
471        if VERBOSE: print '%s (of %s) markers to be included from [ %s ]' % (len(self._ordered_markers),  i,  self.path)
472
473class BPed:
474    """ The uber-class for processing Plink's Binary Ped file format *.bed/*.bim/*.fam
475    """
476    def __init__(self,  base):
477        self.base = base
478        self._bed = Bed('%s.bed' % (self.base))
479        self._bim = Bim('%s.bim' % (self.base))
480        self._fam = Fam('%s.fam' % (self.base))
481
482        self._markers = {}
483        self._ordered_markers = []
484        self._marker_allele_lookup = {}
485        self._autosomal_indices = set()
486       
487        self._subjects = {}
488        self._ordered_subjects = []
489       
490        self._genotypes = []
491       
492    def parse(self,  quick=False):
493        """
494        """
495        self._quick = quick
496       
497        self._bim.parse()
498        self._markers = self._bim._markers
499        self._ordered_markers = self._bim._ordered_markers
500        self._marker_allele_lookup = self._bim._marker_allele_lookup
501        self._autosomal_indices = self._bim._autosomal_indices
502       
503        self._fam.parse()
504        self._subjects = self._fam._subjects
505        self._ordered_subjects = self._fam._ordered_subjects
506
507        self._bed.parse(self._ordered_subjects,  self._ordered_markers,  quick=quick)
508        self._bedf = self._bed._fh
509        self._genotypes = self._bed._genotypes
510        self.nsubjects = len(self._ordered_subjects)
511        self.nmarkers = len(self._ordered_markers)
512        self._bytes_per_marker = nbytes(self.nsubjects)
513
514    def writeped(self, path=None):
515        """
516        """
517        path = self.path = path or self.path
518       
519        map_name = self.path.replace('.bed', '.map')
520        print 'Writing map file [ %s ]' % (map_name)
521        dst = open(map_name, 'w')
522        for m in self._ordered_markers:
523            chrom, snp, genpos, abspos, a1, a2 = self._markers[m]
524            dst.write('%s\t%s\t%s\t%s\n' % (chrom, snp, genpos, abspos))
525        dst.close()
526
527        ped_name = self.path.replace('.bed', '.ped')
528        print 'Writing ped file [ %s ]' % (ped_name)
529        ped = open(ped_name, 'w')
530        firstyikes = False
531        for s, skey in enumerate(self._ordered_subjects):
532            idx = s*2
533            (fid, iid, did, mid, sex, phe, oiid, odid, omid) = self._subjects[skey]
534            ped.write('%s %s %s %s %s %s' % (fid, iid, odid, omid, sex, phe))
535            genotypes_for_subject = self.getGenotypesForSubject(s)
536            for m, snp in enumerate(self._ordered_markers):
537                #a1, a2 = self.getGenotypeByIndices(s, m)
538                a1,a2 = genotypes_for_subject[m]
539                ped.write(' %s %s' % (a1, a2))
540            ped.write('\n')
541        ped.close()
542
543    def getGenotype(self, subject, marker):
544        """ Retrieve a genotype for a particular subject/marker pair
545        """
546        m = self._ordered_markers.index(marker)
547        s = self._ordered_subjects.index(subject)
548        return self.getGenotypeByIndices(s, m)
549
550    def getGenotypesForSubject(self, s, raw=False):
551        """ Returns list of genotypes for all m markers
552            for subject s.  If raw==True, then an array
553            of raw integer gcodes is returned instead
554        """
555        if self._quick:
556            nmarkers = len(self._markers)
557            raw_array = array('i', [0]*nmarkers)
558            seek_nibble = s % 4
559            for m in xrange(nmarkers):
560                seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH
561                self._bedf.seek(seek_byte)
562                geno = struct.unpack('B', self._bedf.read(1))[0]
563                quartet = INT_TO_GCODE[geno]
564                gcode = quartet[seek_nibble]
565                raw_array[m] = gcode
566        else:
567            raw_array = array('i', [row[s] for row in self._genotypes])
568           
569        if raw:
570            return raw_array
571        else:
572            result = []
573            for m, gcode in enumerate(raw_array):
574                result.append(self._marker_allele_lookup[m][gcode])
575            return result
576       
577    def getGenotypeByIndices(self, s, m):
578        """
579        """
580        if self._quick:
581            # Determine which byte we need to seek to, and
582            # which nibble within the byte we need
583            seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH
584            seek_nibble = s % 4
585            self._bedf.seek(seek_byte)
586            geno = struct.unpack('B', self._bedf.read(1))[0]
587            quartet = INT_TO_GCODE[geno]
588            gcode = quartet[seek_nibble]
589        else:
590            # Otherwise, just grab the genotypes from the
591            # list of arrays
592            genos_for_marker = self._genotypes[m]
593            gcode = genos_for_marker[s]
594
595        return self._marker_allele_lookup[m][gcode]
596
597    def getGenotypesByIndices(self, s, mlist, format):
598        """
599        """
600        if self._quick:
601            raw_array = array('i', [0]*len(mlist))
602            seek_nibble = s % 4
603            for i,m in enumerate(mlist):
604                seek_byte = m * self._bytes_per_marker + s/4 + HEADER_LENGTH
605                self._bedf.seek(seek_byte)
606                geno = struct.unpack('B', self._bedf.read(1))[0]
607                quartet = INT_TO_GCODE[geno]
608                gcode = quartet[seek_nibble]
609                raw_array[i] = gcode
610            mlist = set(mlist)
611        else:
612            mlist = set(mlist)
613            raw_array = array('i', [row[s] for m,row in enumerate(self._genotypes) if m in mlist])
614           
615        if format == 'raw':
616            return raw_array
617        elif format == 'ref':
618            result = array('i', [0]*len(mlist))
619            for m, gcode in enumerate(raw_array):
620                if gcode == HOM0:
621                    nref = 3
622                elif gcode == HET:
623                    nref = 2
624                elif gcode == HOM1:
625                    nref = 1
626                else:
627                    nref = 0
628                result[m] = nref
629            return result
630        else:
631            result = []
632            for m, gcode in enumerate(raw_array):
633                result.append(self._marker_allele_lookup[m][gcode])
634            return result
635   
636    def getSubject(self, s):
637        """
638        """
639        skey = self._ordered_subjects[s]
640        return self._subjects[skey]
641   
642    def autosomal_indices(self):
643        """ Return the indices of markers in this ped/map that are autosomal.
644            This is used by rgGRR so that it can select a random set of markers
645            from the autosomes (sex chroms screw up the plot)
646        """
647        return self._autosomal_indices
648
649class Bed:
650
651    def __init__(self, path):
652        self.path = path
653        self._genotypes = []
654        self._fh = None
655
656    def parse(self, subjects,  markers,  quick=False):
657        """ Parse the bed file, indicated either by the path parameter,
658            or as the self.path indicated in __init__.  If quick is
659            True, then just parse the bim and fam, then genotypes will
660            be looked up dynamically by indices
661        """
662        self._quick = quick
663       
664        ordered_markers = markers
665        ordered_subjects = subjects
666        nsubjects = len(ordered_subjects)
667        nmarkers = len(ordered_markers)
668       
669        bed = open(self.path, 'rb')
670        self._fh = bed
671
672        byte1 = bed.read(1)
673        byte2 = bed.read(1)
674        byte3 = bed.read(1)
675        format_flag = struct.unpack('B', byte3)[0]
676
677        h1 = tuple(INT_TO_GCODE[struct.unpack('B', byte1)[0]])
678        h2 = tuple(INT_TO_GCODE[struct.unpack('B', byte2)[0]])
679        h3 = tuple(INT_TO_GCODE[format_flag])
680
681        if h1 != MAGIC1 or h2 != MAGIC2:
682            raise BadMagic('One or both MAGIC bytes is wrong: %s==%s or %s==%s' % (h1, MAGIC1, h2, MAGIC2))
683        if format_flag:
684            print 'Detected that binary PED file is v1.00 SNP-major mode (%s, "%s")\n' % (format_flag, h3)
685        else:
686            raise 'BAD_FORMAT_FLAG? (%s, "%s")\n' % (format_flag, h3)
687
688        print 'Parsing binary ped file for %s markers and %s subjects' % (nmarkers, nsubjects)
689
690        ### If quick mode was specified, we're done ...
691        self._quick = quick
692        if quick:
693            return
694           
695        ### ... Otherwise, parse genotypes into an array, and append that
696        ### array to self._genotypes
697        ngcodes = ceiling(nsubjects, 4)
698        bytes_per_marker = nbytes(nsubjects)
699        for m in xrange(nmarkers):
700            genotype_array = array('i', [-1]*(ngcodes))
701            for byte in xrange(bytes_per_marker):
702                intval = struct.unpack('B', bed.read(1))[0]
703                idx = byte*4
704                genotype_array[idx:idx+4] = INT_TO_GCODE[intval]
705            self._genotypes.append(genotype_array)
706       
707class Bim:
708    def __init__(self, path):
709        """
710        """
711        self.path = path
712        self._markers = {}
713        self._ordered_markers = []
714        self._marker_allele_lookup = {}
715        self._autosomal_indices = set()
716
717    def parse(self):
718        """
719        """
720        print 'Reading map (extended format) from [ %s ]' % (self.path)
721        bim = open(self.path, 'r')
722        for m, line in enumerate(bim):
723            chrom, snp, gpos, apos, a1, a2 = line.strip().split()
724            self._markers[snp] = (chrom, snp, gpos, apos, a1, a2)
725            self._marker_allele_lookup[m] = {
726                HOM0: (a2, a2),
727                HOM1: (a1, a1),
728                HET : (a1, a2),
729                MISS: ('0','0'),
730                }
731            self._ordered_markers.append(snp)
732            if chrom in AUTOSOMES:
733                self._autosomal_indices.add(m)
734        bim.close()
735        print '%s markers to be included from [ %s ]' % (m+1, self.path)
736
737class Fam:
738    def __init__(self, path):
739        """
740        """
741        self.path = path
742        self._subjects = {}
743        self._ordered_subjects = []
744
745    def parse(self):
746        """
747        """
748        print 'Reading pedigree information from [ %s ]' % (self.path)
749        fam = open(self.path, 'r')
750        for s, line in enumerate(fam):
751            fid, iid, did, mid, sex, phe = line.strip().split()
752            sid = iid.split('.')[0]
753            d_sid = did.split('.')[0]
754            m_sid = mid.split('.')[0]
755            skey = (fid, iid)
756            self._ordered_subjects.append(skey)
757            self._subjects[skey] = (fid, iid, did, mid, sex, phe, sid, d_sid, m_sid)
758        fam.close()
759        print '%s individuals read from [ %s ]' % (s+1, self.path)
760
761### Command-line functionality and testing
762def test(arg):
763    '''
764    '''
765   
766    import time
767
768    if arg == 'CAMP_AFFY.ped':
769        print 'Testing bed.parse(quick=True)'
770        s = time.time()
771        bed = Bed(arg.replace('.ped', '.bed'))
772        bed.parse(quick=True)
773        print bed.getGenotype(('400118', '10300283'), 'rs2000467')
774        print bed.getGenotype(('400118', '10101384'), 'rs2294019')
775        print bed.getGenotype(('400121', '10101149'), 'rs2294019')       
776        print bed.getGenotype(('400123', '10200290'), 'rs2294019')       
777        assert bed.getGenotype(('400118', '10101384'), 'rs2294019') == ('4','4')
778        e = time.time()
779        print 'e-s = %s\n' % (e-s)
780   
781    print 'Testing bed.parse'
782    s = time.time()
783    bed = BPed(arg)
784    bed.parse(quick=False)
785    e = time.time()
786    print 'e-s = %s\n' % (e-s)
787
788    print 'Testing bed.writeped'
789    s = time.time()
790    outname = '%s_BEDTEST' % (arg)
791    bed.writeped(outname)
792    e = time.time()
793    print 'e-s = %s\n' % (e-s)
794    del(bed)
795
796    print 'Testing ped.parse'
797    s = time.time()
798    ped = LPed(arg)
799    ped.parse()
800    e = time.time()
801    print 'e-s = %s\n' % (e-s)
802
803    print 'Testing ped.writebed'
804    s = time.time()
805    outname = '%s_PEDTEST' % (arg)
806    ped.writebed(outname)
807    e = time.time()
808    print 'e-s = %s\n' % (e-s)
809    del(ped)
810   
811def profile_bed(arg):
812    """
813    """
814    bed = BPed(arg)
815    bed.parse(quick=False)
816    outname = '%s_BEDPROFILE' % (arg)
817    bed.writeped(outname)
818
819def profile_ped(arg):
820    """
821    """
822    ped = LPed(arg)
823    ped.parse()
824    outname = '%s_PEDPROFILE' % (arg)
825    ped.writebed(outname)
826
827if __name__ == '__main__':
828    """ Run as a command-line, this script should get one or more arguments,
829        each one a ped file to be parsed with the PedParser (unit tests?)
830    """
831    op = optparse.OptionParser()
832    op.add_option('--profile-bed', action='store_true', default=False)
833    op.add_option('--profile-ped', action='store_true', default=False)
834    opts, args = op.parse_args()
835   
836    if opts.profile_bed:
837        import profile
838        import pstats
839        profile.run('profile_bed(args[0])', 'fooprof')
840        p = pstats.Stats('fooprof')
841        p.sort_stats('cumulative').print_stats(10)
842    elif opts.profile_ped:
843        import profile
844        import pstats
845        profile.run('profile_ped(args[0])', 'fooprof')
846        p = pstats.Stats('fooprof')
847        p.sort_stats('cumulative').print_stats(10)
848    else:
849        for arg in args:
850            test(arg)
851   
852    ### Code used to generate the INT_TO_GCODE dictionary
853    #print '{\n  ',
854    #for i in range(256):
855    #   b = INT2BIN[i]
856    #    ints = []
857    #    s = str(i).rjust(3)
858    #    #print b
859    #    for j in range(4):
860    #        idx = j*2
861    #        #print i, j, idx, b[idx:idx+2], int(b[idx:idx+2], 2)
862    #        ints.append(int(b[idx:idx+2], 2))
863    #    print '%s: array(\'i\', %s),' % (s,tuple(ints)),
864    #    if i > 0 and (i+1) % 4 == 0:
865    #        print '\n  ',
866    #print '}'
867
868
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。