1 | """
|
---|
2 | fakephe.py
|
---|
3 | ross lazarus sept 30 2007
|
---|
4 | This is available under the LGPL as defined then.
|
---|
5 |
|
---|
6 | use the pedigree data for ids
|
---|
7 |
|
---|
8 | use pythons generators to literally generate a bunch of random phenotype measurements
|
---|
9 |
|
---|
10 | Plink format at http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#pheno
|
---|
11 | is
|
---|
12 |
|
---|
13 | To specify an alternate phenotype for analysis, i.e. other than the one in the *.ped file (or, if using a binary fileset, the
|
---|
14 | *.fam file), use the --pheno option:
|
---|
15 | plink --file mydata --pheno pheno.txt
|
---|
16 |
|
---|
17 | where pheno.txt is a file that contains 3 columns (one row per individual):
|
---|
18 |
|
---|
19 | Family ID
|
---|
20 | Individual ID
|
---|
21 | Phenotype
|
---|
22 |
|
---|
23 | NOTE The original PED file must still contain a phenotype in column 6 (even if this is a dummy phenotype, e.g. all missing),
|
---|
24 | unless the --no-pheno flag is given.
|
---|
25 |
|
---|
26 | If an individual is in the original file but not listed in the alternate phenotype file, that person's phenotype will be set to
|
---|
27 | missing. If a person is in the alternate phenotype file but not in the original file, that entry will be ignored. The order of
|
---|
28 | the alternate phenotype file need not be the same as for the original file.
|
---|
29 |
|
---|
30 | If the phenotype file contains more than one phenotype, then use the --mpheno N option to specify the Nth phenotype is the one
|
---|
31 | to be used:
|
---|
32 | plink --file mydata --pheno pheno2.txt --mpheno 4
|
---|
33 |
|
---|
34 | where pheno2.txt contains 5 different phenotypes (i.e. 7 columns in total), this command will use the 4th for analysis
|
---|
35 | (phenotype D):
|
---|
36 |
|
---|
37 | Family ID
|
---|
38 | Individual ID
|
---|
39 | Phenotype A
|
---|
40 | Phenotype B
|
---|
41 | Phenotype C
|
---|
42 | Phenotype D
|
---|
43 | Phenotype E
|
---|
44 |
|
---|
45 | Alternatively, your alternate phenotype file can have a header row, in which case you can use variable names to specify which
|
---|
46 | phenotype to use. If you have a header row, the first two variables must be labelled FID and IID. All subsequent variable names
|
---|
47 | cannot have any whitespace in them. For example,
|
---|
48 |
|
---|
49 | FID IID qt1 bmi site
|
---|
50 | F1 1110 2.3 22.22 2
|
---|
51 | F2 2202 34.12 18.23 1
|
---|
52 | ...
|
---|
53 |
|
---|
54 | then
|
---|
55 | plink --file mydata --pheno pheno2.txt --pheno-name bmi --assoc
|
---|
56 |
|
---|
57 | will select the second phenotype labelled "bmi", for analysis
|
---|
58 |
|
---|
59 | Finally, if there is more than one phenotype, then for basic association tests, it is possible to specify that all phenotypes
|
---|
60 | be tested, sequentially, with the output sent to different files: e.g. if bigpheno.raw contains 10,000 phenotypes, then
|
---|
61 | plink --bfile mydata --assoc --pheno bigpheno.raw --all-pheno
|
---|
62 |
|
---|
63 | will loop over all of these, one at a time testing for association with SNP, generating a lot of output. You might want to use
|
---|
64 | the --pfilter command in this case, to only report results with a p-value less than a certain value, e.g. --pfilter 1e-3.
|
---|
65 |
|
---|
66 | WARNING Currently, all phenotypes must be numerically coded, including missing values, in the alternate phenotype file. The
|
---|
67 | default missing value is -9, change this with --missing-phenotype, but it must be a numeric value still (in contrast to the
|
---|
68 | main phenotype in the PED/FAM file. This issue will be fixed in future releases.
|
---|
69 | Covariate files
|
---|
70 |
|
---|
71 | ===========================
|
---|
72 | rgfakePhe.xml
|
---|
73 | <tool id="fakePhe1" name="Fake phenos">
|
---|
74 | <description>for multiple null fake phenotype</description>
|
---|
75 | <command interpreter="python2.4">rgfakePhe.py $input1 '$title1' $out_file1 $log_file1 $script_file</command>
|
---|
76 | <inputs>
|
---|
77 | <page>
|
---|
78 | <param name="input1"
|
---|
79 | type="library" format="lped"
|
---|
80 | label="Pedigree from Dataset"/>
|
---|
81 | <param name="title1" type="text"
|
---|
82 | value="My fake phenos" size="60"
|
---|
83 | label="Title for outputs"/>
|
---|
84 | </page>
|
---|
85 | <page>
|
---|
86 | <repeat name="fakePhe" title="Phenotypes to Fake">
|
---|
87 | <param name="pName" type="text" label="Phenotype Name">
|
---|
88 | </param>
|
---|
89 | <conditional name="series">
|
---|
90 | <param name="phetype" type="select" label="Phenotype Type">
|
---|
91 | <option value="rnorm" selected="true">Random normal variate</option>
|
---|
92 | <option value="cat">Random categorical</option>
|
---|
93 | </param>
|
---|
94 | <when value="rnorm">
|
---|
95 | <param name="Mean" type="float" value="0.0" label="Mean">
|
---|
96 | </param>
|
---|
97 | <param name="SD" type="float" label="SD" value="1.0">
|
---|
98 | </param>
|
---|
99 | </when>
|
---|
100 | <when value="cat">
|
---|
101 | <param name="values" type="text" value="1,2,3,fiddle" label="comma separated values to choose from">
|
---|
102 | </param>
|
---|
103 | </when>
|
---|
104 | </conditional>
|
---|
105 | </repeat>
|
---|
106 | </page>
|
---|
107 | </inputs>
|
---|
108 | <configfiles>
|
---|
109 | <configfile name="script_file">
|
---|
110 | #for $n, $f in enumerate($fakePhe)
|
---|
111 | #if $f.series.phetype=='rnorm'
|
---|
112 | {'pN':'$f.pName','pT':'$f.series.phetype','pP':"{'Mean':'$f.series.Mean', 'SD':'$f.series.SD'}"}
|
---|
113 | #elif $f.series.phetype=='cat'
|
---|
114 | {'pN':'$f.pName','pT':'$f.series.phetype','pP':"{'values':'$f.series.values'}"}
|
---|
115 | #end if
|
---|
116 | #end for
|
---|
117 | </configfile>
|
---|
118 | </configfiles>
|
---|
119 |
|
---|
120 | <outputs>
|
---|
121 | <data format="pphe" name="out_file1" />
|
---|
122 | <data format="text" name="log_file1" parent="out_file1"/>
|
---|
123 | </outputs>
|
---|
124 |
|
---|
125 | <help>
|
---|
126 | .. class:: infomark
|
---|
127 |
|
---|
128 | This tool allows you to generate an arbitrary (sort of)
|
---|
129 | synthetic phenotype file with measurements drawn from normal,
|
---|
130 | gamma, or categorical distributions. These are for testing under
|
---|
131 | the null hypothesis of no association - the values are random but
|
---|
132 | from user specified distributions.
|
---|
133 |
|
---|
134 | -----
|
---|
135 |
|
---|
136 | .. class:: warningmark
|
---|
137 |
|
---|
138 | This tool is very experimental
|
---|
139 |
|
---|
140 | -----
|
---|
141 |
|
---|
142 | - **Pedigree** is a library pedigree file - the id's will be used in the synthetic null phenotypes
|
---|
143 | - **Title** is a name to give to the output phenotype file
|
---|
144 |
|
---|
145 | On the next page, you can add an unlimited number of various kinds of phenotypes including choices for
|
---|
146 | categorical ones or distributions with specific parameters
|
---|
147 |
|
---|
148 | Just keep adding new ones until you're done then use the Execute button to run the generation
|
---|
149 |
|
---|
150 |
|
---|
151 |
|
---|
152 |
|
---|
153 | **Example from another tool to keep you busy and in awe**
|
---|
154 |
|
---|
155 | Input file::
|
---|
156 |
|
---|
157 | 1 68 4.1
|
---|
158 | 2 71 4.6
|
---|
159 | 3 62 3.8
|
---|
160 | 4 75 4.4
|
---|
161 | 5 58 3.2
|
---|
162 | 6 60 3.1
|
---|
163 | 7 67 3.8
|
---|
164 | 8 68 4.1
|
---|
165 | 9 71 4.3
|
---|
166 | 10 69 3.7
|
---|
167 |
|
---|
168 | Create a two series XY plot on the above data:
|
---|
169 |
|
---|
170 | - Series 1: Red Dashed-Line plot between columns 1 and 2
|
---|
171 | - Series 2: Blue Circular-Point plot between columns 3 and 2
|
---|
172 |
|
---|
173 | .. image:: ../static/images/xy_example.jpg
|
---|
174 | </help>
|
---|
175 | </tool>
|
---|
176 |
|
---|
177 |
|
---|
178 |
|
---|
179 | """
|
---|
180 |
|
---|
181 | import random,copy,sys,os,time,string,math
|
---|
182 |
|
---|
183 | doFbatphe = False
|
---|
184 |
|
---|
185 | galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?>
|
---|
186 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
---|
187 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
---|
188 | <head>
|
---|
189 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
---|
190 | <meta name="generator" content="Galaxy %s tool output - see http://g2.trac.bx.psu.edu/" />
|
---|
191 | <title></title>
|
---|
192 | <link rel="stylesheet" href="/static/style/base.css" type="text/css" />
|
---|
193 | </head>
|
---|
194 | <body>
|
---|
195 | <div class="document">
|
---|
196 | """
|
---|
197 |
|
---|
198 |
|
---|
199 | def timenow():
|
---|
200 | """return current time as a string
|
---|
201 | """
|
---|
202 | return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time()))
|
---|
203 |
|
---|
204 |
|
---|
205 | def poisson(lamb=2):
|
---|
206 | """
|
---|
207 | algorithm poisson random number (Knuth):
|
---|
208 | init:
|
---|
209 | Let L = e^-lamb, k = 0 and p = 1.
|
---|
210 | do:
|
---|
211 | k = k + 1.
|
---|
212 | Generate uniform random number u in [0,1] and let p = p u.
|
---|
213 | while p >= L.
|
---|
214 | return k - 1.
|
---|
215 | """
|
---|
216 | lamb = float(lamb)
|
---|
217 | l = math.e**(-lamb)
|
---|
218 | k=0
|
---|
219 | p=1
|
---|
220 | while 1:
|
---|
221 | while p >= l:
|
---|
222 | k += 1
|
---|
223 | u = random.uniform(0.0,1.0)
|
---|
224 | p *= u
|
---|
225 | yield '%e' % (k - 1)
|
---|
226 |
|
---|
227 | def gammaPhe(alpha=1,beta=1):
|
---|
228 | """generator for random values from a gamma
|
---|
229 | """
|
---|
230 | while 1: # an iterator for a random phenotype
|
---|
231 | dat = random.gammavariate(float(alpha),float(beta))
|
---|
232 | yield '%e' % dat
|
---|
233 |
|
---|
234 | def weibullPhe(alpha=1,beta=1):
|
---|
235 | """generator for random values from a weibull distribution
|
---|
236 | """
|
---|
237 | while 1: # an iterator for a random phenotype
|
---|
238 | dat = random.weibullvariate(float(alpha),float(beta))
|
---|
239 | yield '%e' % dat
|
---|
240 |
|
---|
241 | def normPhe(mean=0,sd=1):
|
---|
242 | """generator for random values from a normal distribution
|
---|
243 | """
|
---|
244 | while 1:# an iterator for a random phenotype
|
---|
245 | dat = random.normalvariate(float(mean),float(sd))
|
---|
246 | yield '%e' % dat
|
---|
247 |
|
---|
248 | def expoPhe(mean=1):
|
---|
249 | """generator for random values from an exponential distribution
|
---|
250 | """
|
---|
251 | lamb = 1.0/float(mean)
|
---|
252 | while 1:# an iterator for a random phenotype
|
---|
253 | dat = random.expovariate(lamb)
|
---|
254 | yield '%e' % dat
|
---|
255 |
|
---|
256 |
|
---|
257 | def catPhe(values='1,2,3'):
|
---|
258 | """ Schrodinger's of course.
|
---|
259 | """
|
---|
260 | v = values.split(',')
|
---|
261 | while 1:# an iterator for a random phenotype
|
---|
262 | dat = random.choice(v)
|
---|
263 | yield dat
|
---|
264 |
|
---|
265 | def uniPhe(low=0.0,hi=1.0):
|
---|
266 | """generator for a uniform distribution
|
---|
267 | what if low=-5 and hi=-2
|
---|
268 | """
|
---|
269 | low = float(low)
|
---|
270 | hi = float(hi)
|
---|
271 | while 1: # unif phenotype
|
---|
272 | v = random.uniform(low,hi) # 0-1
|
---|
273 | yield '%e' % v
|
---|
274 |
|
---|
275 | def getIds(indir='foo'):
|
---|
276 | """read identifiers - first 2 cols from a pedigree file or fam file
|
---|
277 | """
|
---|
278 | inpath = os.path.split(indir)[0] # get root
|
---|
279 | infam = '%s.fam' % indir
|
---|
280 | inped = '%s.ped' % indir # if there's a ped
|
---|
281 | flist = os.listdir(inpath)
|
---|
282 | if len(flist) == 0:
|
---|
283 | print >> sys.stderr, '### Error - input path %s is empty' % indir
|
---|
284 | sys.exit(1)
|
---|
285 | if os.path.exists(infam):
|
---|
286 | pfname = infam
|
---|
287 | elif os.path.exists(inped):
|
---|
288 | pfname = inped
|
---|
289 | else:
|
---|
290 | print >> sys.stderr, '### Error - input path %s contains no ped or fam' % indir
|
---|
291 | sys.exit(1)
|
---|
292 | f = file(pfname,'r')
|
---|
293 | ids = []
|
---|
294 | for l in f:
|
---|
295 | ll = l.split()
|
---|
296 | if len(ll) > 5: # ok line?
|
---|
297 | ids.append(ll[:2])
|
---|
298 | return ids
|
---|
299 |
|
---|
300 | def makePhe(phes = [],ids=[]):
|
---|
301 | """Create the null phenotype values and append them to the case id
|
---|
302 | phes is the (generator, headername) for each column
|
---|
303 | for a phe file, ids are the case identifiers for the phenotype file
|
---|
304 | res contains the final strings for the file
|
---|
305 | each column is derived by iterating
|
---|
306 | over the generator actions set up by makePhes
|
---|
307 | """
|
---|
308 | header = ['FID','IID'] # for plink
|
---|
309 | res = copy.copy(ids)
|
---|
310 | for (f,fname) in phes:
|
---|
311 | header.append(fname)
|
---|
312 | for n,subject in enumerate(ids):
|
---|
313 | res[n].append(f.next()) # generate the right value
|
---|
314 | res.insert(0,header)
|
---|
315 | res = [' '.join(x) for x in res] # must be space delim for fbat
|
---|
316 | return res
|
---|
317 |
|
---|
318 | def makePhes(pheTypes=[], pheNames=[], pheParams=[]):
|
---|
319 | """set up phes for makePhe
|
---|
320 | each has to have an iterator (action) and a name
|
---|
321 | """
|
---|
322 | action = None
|
---|
323 | phes = []
|
---|
324 | for n,pt in enumerate(pheTypes):
|
---|
325 | name = pheNames[n]
|
---|
326 | if pt == 'rnorm':
|
---|
327 | m = pheParams[n].get('Mean',0.0)
|
---|
328 | s = pheParams[n].get('SD',1.0)
|
---|
329 | action = normPhe(mean=m,sd=s) # so we can just iterate over action
|
---|
330 | elif pt == 'rgamma':
|
---|
331 | m = pheParams[n].get('Alpha',0.0)
|
---|
332 | s = pheParams[n].get('Beta',1.0)
|
---|
333 | action = gammaPhe(alpha=m,beta=s)
|
---|
334 | if pt == 'exponential':
|
---|
335 | m = pheParams[n].get('Mean',1.0)
|
---|
336 | action = expoPhe(mean=m) # so we can just iterate over action
|
---|
337 | elif pt == 'weibull':
|
---|
338 | m = pheParams[n].get('Alpha',0.0)
|
---|
339 | s = pheParams[n].get('Beta',1.0)
|
---|
340 | action = weibullPhe(alpha=m,beta=s)
|
---|
341 | elif pt == 'cat':
|
---|
342 | v = pheParams[n].get('values',['?',])
|
---|
343 | action = catPhe(values=v)
|
---|
344 | elif pt == 'unif':
|
---|
345 | low = pheParams[n].get('low',0.0)
|
---|
346 | hi = pheParams[n].get('hi',1.0)
|
---|
347 | action = uniPhe(low=low,hi=hi)
|
---|
348 | elif pt == 'poisson':
|
---|
349 | lamb = pheParams[n].get('lamb',1.0)
|
---|
350 | action = poisson(lamb=lamb)
|
---|
351 | phes.append((action,name))
|
---|
352 | return phes
|
---|
353 |
|
---|
354 | def doImport(outfile='test',flist=[],expl='',mylog=[]):
|
---|
355 | """ import into one of the new html composite data types for Rgenetics
|
---|
356 | Dan Blankenberg with mods by Ross Lazarus
|
---|
357 | October 2007
|
---|
358 | """
|
---|
359 | outf = open(outfile,'w')
|
---|
360 | progname = os.path.basename(sys.argv[0])
|
---|
361 | outf.write(galhtmlprefix % progname)
|
---|
362 | if len(flist) > 0:
|
---|
363 | outf.write('<ol>\n')
|
---|
364 | for i, data in enumerate( flist ):
|
---|
365 | outf.write('<li><a href="%s">%s %s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1],expl))
|
---|
366 | outf.write('</ol>\n')
|
---|
367 | else:
|
---|
368 | outf.write('No files found')
|
---|
369 | outf.write('<hr/><h4>Log of run follows:</h4><br/><pre>%s\n</pre><br/><hr/>' % ('\n'.join(mylog)))
|
---|
370 | outf.write("</div></body></html>\n")
|
---|
371 | outf.close()
|
---|
372 |
|
---|
373 |
|
---|
374 | def test():
|
---|
375 | """test case
|
---|
376 | need to get these out of a galaxy form - series of pages - get types
|
---|
377 | on first screen, names on second, params on third?
|
---|
378 | holy shit. this actually works I think
|
---|
379 | """
|
---|
380 | pT = ['rnorm','rnorm','rnorm','rnorm','cat','unif']
|
---|
381 | pN = ['SysBP','DiaBP','HtCM','WtKG','Race','age']
|
---|
382 | pP = [{'Mean':'120','SD':'10'},{'Mean':'90','SD':'15'},{'Mean':'160','SD':'20'},{'Mean':'60','SD':'20'}, \
|
---|
383 | {'values':'Blink,What,Yours,green'},{'low':16,'hi':99}]
|
---|
384 | phes = makePhes(pheTypes=pT, pheNames=pN, pheParams=pP)
|
---|
385 | ids = []
|
---|
386 | for i in range(10):
|
---|
387 | ids.append(['fid%d' % i,'iid%d' % i])
|
---|
388 | pheres = makePhe(phes=phes,ids=ids)
|
---|
389 | res = [''.join(x) for x in pheres]
|
---|
390 | print '\n'.join(res)
|
---|
391 |
|
---|
392 |
|
---|
393 |
|
---|
394 | if __name__ == "__main__":
|
---|
395 | """
|
---|
396 | <command interpreter="python">rgfakePhe.py '$infile1.extra_files_path/$infile1.metadata.base_name'
|
---|
397 | "$title1" '$ppheout' '$ppheout.files_path' '$script_file '
|
---|
398 | </command>
|
---|
399 | The xml file for this tool is complex, and allows any arbitrary number of
|
---|
400 | phenotype columns to be specified from a couple of optional types - rnorm, cat
|
---|
401 | are working now.
|
---|
402 |
|
---|
403 | Note that we create new files in their respective library directories and link to them in the output file
|
---|
404 | so they can be displayed and downloaded separately
|
---|
405 |
|
---|
406 | """
|
---|
407 | killme = string.punctuation + string.whitespace
|
---|
408 | trantab = string.maketrans(killme,'_'*len(killme))
|
---|
409 | progname = os.path.basename(sys.argv[0])
|
---|
410 | cl = '## at %s, %s got cl= %s' % (timenow(),progname,' '.join(sys.argv))
|
---|
411 | print >> sys.stdout,cl
|
---|
412 | if len(sys.argv) < 5:
|
---|
413 | test()
|
---|
414 | else:
|
---|
415 | inped = sys.argv[1]
|
---|
416 | title = sys.argv[2].translate(trantab)
|
---|
417 | ppheout = sys.argv[3]
|
---|
418 | pphe_path = sys.argv[4]
|
---|
419 | scriptfile = sys.argv[5]
|
---|
420 | ind = file(scriptfile,'r').readlines()
|
---|
421 | mylog = []
|
---|
422 | s = '## %s starting at %s<br/>\n' % (progname,timenow())
|
---|
423 | mylog.append(s)
|
---|
424 | mylog.append(cl)
|
---|
425 | s = '## params = %s<br/>\n' % (' '.join(sys.argv[1:]))
|
---|
426 | mylog.append(s)
|
---|
427 | s = '\n'.join(ind)
|
---|
428 | mylog.append('Script file %s contained %s<br/>\n' % (scriptfile,s))
|
---|
429 | pT = []
|
---|
430 | pN = []
|
---|
431 | pP = []
|
---|
432 | for l in ind:
|
---|
433 | l = l.strip()
|
---|
434 | if len(l) > 1:
|
---|
435 | adict = eval(l)
|
---|
436 | pT.append(adict.get('pT',None))
|
---|
437 | pN.append(adict.get('pN',None))
|
---|
438 | pP.append(eval(adict.get('pP',None)))
|
---|
439 | s = '## pt,pn,pp=%s,%s,%s<br/>\n' % (str(pT),str(pN),str(pP))
|
---|
440 | mylog.append(s)
|
---|
441 | phes = makePhes(pheTypes=pT, pheNames=pN, pheParams=pP)
|
---|
442 | ids = getIds(indir=inped) # for labelling rows
|
---|
443 | pheres = makePhe(phes=phes,ids=ids) # random values from given distributions
|
---|
444 | try:
|
---|
445 | os.makedirs(pphe_path)
|
---|
446 | except:
|
---|
447 | pass
|
---|
448 | outname = os.path.join(pphe_path,title)
|
---|
449 | pphefname = '%s.pphe' % outname
|
---|
450 | f = file(pphefname, 'w')
|
---|
451 | f.write('\n'.join(pheres))
|
---|
452 | f.write('\n')
|
---|
453 | f.close()
|
---|
454 | if doFbatphe:
|
---|
455 | try:
|
---|
456 | os.makedirs(fphe_path)
|
---|
457 | except:
|
---|
458 | pass
|
---|
459 | outname = os.path.join(fphe_path,title)
|
---|
460 | fphefname = '%s.phe' % outname
|
---|
461 | f = file(fphefname, 'w')
|
---|
462 | header = pheres[0].split()
|
---|
463 | pheres[0] = ' '.join(header[2:])# remove iid fid from header for fbat
|
---|
464 | f.write('\n'.join(pheres))
|
---|
465 | f.close()
|
---|
466 | doImport(outfile=fpheout,flist=[fphefname,],expl='(FBAT phenotype format)',mylog=mylog)
|
---|
467 | #doImport(outfile='test',flist=[],expl='',mylog=[]):
|
---|
468 | doImport(outfile=ppheout,flist=[pphefname,],expl='(Plink phenotype format)',mylog=mylog)
|
---|
469 |
|
---|