[2] | 1 | #!/usr/local/bin/python |
---|
| 2 | """ |
---|
| 3 | # added most of the available options for linear models |
---|
| 4 | # june 2009 rml |
---|
| 5 | # hack to run and process a plink quantitative trait |
---|
| 6 | # |
---|
| 7 | |
---|
| 8 | This is a wrapper for Shaun Purcell's Plink linear/logistic models for |
---|
| 9 | traits, covariates and genotypes. |
---|
| 10 | |
---|
| 11 | It requires some judgement to interpret the findings |
---|
| 12 | We need some better visualizations - manhattan plots are good. |
---|
| 13 | svg with rs numbers for top 1%? |
---|
| 14 | |
---|
| 15 | toptable tools - truncate a gg file down to some low percentile |
---|
| 16 | |
---|
| 17 | intersect with other tables - eg gene expression regressions on snps |
---|
| 18 | |
---|
| 19 | |
---|
| 20 | |
---|
| 21 | """ |
---|
| 22 | |
---|
| 23 | import sys,math,shutil,subprocess,os,string,tempfile,shutil,commands |
---|
| 24 | from rgutils import plinke |
---|
| 25 | |
---|
| 26 | def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000): |
---|
| 27 | """ |
---|
| 28 | score must be scaled to 0-1000 |
---|
| 29 | |
---|
| 30 | Want to make some wig tracks from each analysis |
---|
| 31 | Best n -log10(p). Make top hit the window. |
---|
| 32 | we use our tab output which has |
---|
| 33 | rs chrom offset ADD_stat ADD_p ADD_log10p |
---|
| 34 | rs3094315 1 792429 1.151 0.2528 0.597223 |
---|
| 35 | |
---|
| 36 | """ |
---|
| 37 | |
---|
| 38 | def is_number(s): |
---|
| 39 | try: |
---|
| 40 | float(s) |
---|
| 41 | return True |
---|
| 42 | except ValueError: |
---|
| 43 | return False |
---|
| 44 | header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description) |
---|
| 45 | column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ] |
---|
| 46 | halfwidth=100 |
---|
| 47 | resfpath = os.path.join(twd,resf) |
---|
| 48 | resf = open(resfpath,'r') |
---|
| 49 | resfl = resf.readlines() # dumb but convenient for millions of rows |
---|
| 50 | resfl = [x.split() for x in resfl] |
---|
| 51 | headl = resfl[0] |
---|
| 52 | resfl = resfl[1:] |
---|
| 53 | headl = [x.strip().upper() for x in headl] |
---|
| 54 | headIndex = dict(zip(headl,range(0,len(headl)))) |
---|
| 55 | chrpos = headIndex.get('CHROM',None) |
---|
| 56 | rspos = headIndex.get('RS',None) |
---|
| 57 | offspos = headIndex.get('OFFSET',None) |
---|
| 58 | ppos = headIndex.get('ADD_LOG10P',None) |
---|
| 59 | wewant = [chrpos,rspos,offspos,ppos] |
---|
| 60 | if None in wewant: # missing something |
---|
| 61 | logf.write('### Error missing a required header in makeGFF - headIndex=%s\n' % headIndex) |
---|
| 62 | return |
---|
| 63 | resfl = [x for x in resfl if x[ppos] > ''] |
---|
| 64 | resfl = [(float(x[ppos]),x) for x in resfl] # decorate |
---|
| 65 | resfl.sort() |
---|
| 66 | resfl.reverse() # using -log10 so larger is better |
---|
| 67 | resfl = resfl[:topn] # truncate |
---|
| 68 | pvals = [x[0] for x in resfl] # need to scale |
---|
| 69 | resfl = [x[1] for x in resfl] # drop decoration |
---|
| 70 | if len(pvals) == 0: |
---|
| 71 | logf.write('### no pvalues found in resfl - %s' % (resfl[:3])) |
---|
| 72 | sys.exit(1) |
---|
| 73 | maxp = max(pvals) # need to scale |
---|
| 74 | minp = min(pvals) |
---|
| 75 | prange = abs(maxp-minp) + 0.5 # fudge |
---|
| 76 | scalefact = 1000.0/prange |
---|
| 77 | logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact)) |
---|
| 78 | for i,row in enumerate(resfl): |
---|
| 79 | row[ppos] = '%d' % (int(scalefact*pvals[i])) |
---|
| 80 | resfl[i] = row # replace |
---|
| 81 | outf = file(outfname,'w') |
---|
| 82 | outf.write(header) |
---|
| 83 | outres = [] # need to resort into chrom offset order |
---|
| 84 | for i,lrow in enumerate(resfl): |
---|
| 85 | chrom,snp,offset,p, = [lrow[x] for x in wewant] |
---|
| 86 | gff = ('chr%s' % chrom,'rgGLM','variation','%d' % (int(offset)-halfwidth), |
---|
| 87 | '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i])) |
---|
| 88 | outres.append(gff) |
---|
| 89 | outres = [(x[0],int(x[3]),x) for x in outres] # decorate |
---|
| 90 | outres.sort() # into chrom offset |
---|
| 91 | outres=[x[2] for x in outres] # undecorate |
---|
| 92 | outres = ['\t'.join(x) for x in outres] |
---|
| 93 | outf.write('\n'.join(outres)) |
---|
| 94 | outf.write('\n') |
---|
| 95 | outf.close() |
---|
| 96 | |
---|
| 97 | |
---|
| 98 | def xformQassoc(resf='',outfname='',logf=None,twd='.'): |
---|
| 99 | """ plink.assoc.linear to gg file |
---|
| 100 | from the docs |
---|
| 101 | The output per each SNP might look something like: |
---|
| 102 | |
---|
| 103 | CHR SNP BP A1 TEST NMISS OR STAT P |
---|
| 104 | 5 rs000001 10001 A ADD 664 0.7806 -1.942 0.05216 |
---|
| 105 | 5 rs000001 10001 A DOMDEV 664 0.9395 -0.3562 0.7217 |
---|
| 106 | 5 rs000001 10001 A COV1 664 0.9723 -0.7894 0.4299 |
---|
| 107 | 5 rs000001 10001 A COV2 664 1.159 0.5132 0.6078 |
---|
| 108 | 5 rs000001 10001 A GENO_2DF 664 NA 5.059 0.0797 |
---|
| 109 | need to transform into gg columns for each distinct test |
---|
| 110 | or bed for tracks? |
---|
| 111 | |
---|
| 112 | """ |
---|
| 113 | logf.write('xformQassoc got resf=%s, outfname=%s\n' % (resf,outfname)) |
---|
| 114 | resdict = {} |
---|
| 115 | rsdict = {} |
---|
| 116 | markerlist = [] |
---|
| 117 | # plink is "clever" - will run logistic if only 2 categories such as gender |
---|
| 118 | resfs = resf.split('.') |
---|
| 119 | if resfs[-1] == 'logistic': |
---|
| 120 | resfs[-1] = 'linear' |
---|
| 121 | else: |
---|
| 122 | resfs[-1] = 'logistic' |
---|
| 123 | altresf = '.'.join(resfs) |
---|
| 124 | |
---|
| 125 | altresfpath = os.path.join(twd,altresf) |
---|
| 126 | resfpath = os.path.join(twd,resf) |
---|
| 127 | try: |
---|
| 128 | resf = open(resfpath,'r') |
---|
| 129 | except: |
---|
| 130 | try: |
---|
| 131 | resf = open(altresfpath,'r') |
---|
| 132 | except: |
---|
| 133 | print >> sys.stderr, '## error - no file plink output %s or %s found - cannot continue' % (resfpath, altresfpath) |
---|
| 134 | sys.exit(1) |
---|
| 135 | for lnum,row in enumerate(resf): |
---|
| 136 | if lnum == 0: |
---|
| 137 | headl = row.split() |
---|
| 138 | headl = [x.strip().upper() for x in headl] |
---|
| 139 | headIndex = dict(zip(headl,range(0,len(headl)))) |
---|
| 140 | chrpos = headIndex.get('CHR',None) |
---|
| 141 | rspos = headIndex.get('SNP',None) |
---|
| 142 | offspos = headIndex.get('BP',None) |
---|
| 143 | nmisspos = headIndex.get('NMISS',None) |
---|
| 144 | testpos = headIndex.get('TEST',None) |
---|
| 145 | ppos = headIndex.get('P',None) |
---|
| 146 | coeffpos = headIndex.get('OR',None) |
---|
| 147 | if not coeffpos: |
---|
| 148 | coeffpos = headIndex.get('BETA',None) |
---|
| 149 | apos = headIndex.get('A1',None) |
---|
| 150 | statpos = headIndex.get('STAT',None) |
---|
| 151 | wewant = [chrpos,rspos,offspos,testpos,statpos,ppos,coeffpos,apos] |
---|
| 152 | if None in wewant: # missing something |
---|
| 153 | logf.write('missing a required header in xformQassoc - headIndex=%s\n' % headIndex) |
---|
| 154 | return |
---|
| 155 | llen = len(headl) |
---|
| 156 | else: # no Nones! |
---|
| 157 | ll = row.split() |
---|
| 158 | if len(ll) >= llen: # valid line |
---|
| 159 | chrom,snp,offset,test,stat,p,coeff,allele = [ll[x] for x in wewant] |
---|
| 160 | snp = snp.strip() |
---|
| 161 | if p <> 'NA' : |
---|
| 162 | try: |
---|
| 163 | ffp = float(p) |
---|
| 164 | if ffp <> 0: |
---|
| 165 | lp = -math.log10(ffp) |
---|
| 166 | except: |
---|
| 167 | lp = 0.0 |
---|
| 168 | resdict.setdefault(test,{}) |
---|
| 169 | resdict[test][snp] = (stat,p,'%f' % lp) |
---|
| 170 | if rsdict.get(snp,None) == None: |
---|
| 171 | rsdict[snp] = (chrom,offset) |
---|
| 172 | markerlist.append(snp) |
---|
| 173 | # now have various tests indexed by rs |
---|
| 174 | tk = resdict.keys() |
---|
| 175 | tk.sort() # tests |
---|
| 176 | ohead = ['rs','chrom','offset'] |
---|
| 177 | for t in tk: # add headers |
---|
| 178 | ohead.append('%s_stat' % t) |
---|
| 179 | ohead.append('%s_p' % t) |
---|
| 180 | ohead.append('%s_log10p' % t) |
---|
| 181 | oheads = '\t'.join(ohead) |
---|
| 182 | res = [oheads,] |
---|
| 183 | for snp in markerlist: # retain original order |
---|
| 184 | chrom,offset = rsdict[snp] |
---|
| 185 | outl = [snp,chrom,offset] |
---|
| 186 | for t in tk: |
---|
| 187 | outl += resdict[t][snp] # add stat,p for this test |
---|
| 188 | outs = '\t'.join(outl) |
---|
| 189 | res.append(outs) |
---|
| 190 | f = file(outfname,'w') |
---|
| 191 | res.append('') |
---|
| 192 | f.write('\n'.join(res)) |
---|
| 193 | f.close() |
---|
| 194 | |
---|
| 195 | |
---|
| 196 | if __name__ == "__main__": |
---|
| 197 | """ |
---|
| 198 | |
---|
| 199 | <command interpreter="python"> |
---|
| 200 | rgGLM.py '$i.extra_files_path/$i.metadata.base_name' '$phef.extra_files_path/$phef.metadata.base_name' |
---|
| 201 | "$title1" '$predvar' '$covar' '$out_file1' '$logf' '$i.metadata.base_name' |
---|
| 202 | '$inter' '$cond' '$gender' '$mind' '$geno' '$maf' '$logistic' '$wigout' |
---|
| 203 | </command> |
---|
| 204 | """ |
---|
| 205 | topn = 1000 |
---|
| 206 | killme = string.punctuation+string.whitespace |
---|
| 207 | trantab = string.maketrans(killme,'_'*len(killme)) |
---|
| 208 | if len(sys.argv) < 17: |
---|
| 209 | s = 'rgGLM.py needs 17 params - got %s \n' % (sys.argv) |
---|
| 210 | sys.stderr.write(s) # print >>,s would probably also work? |
---|
| 211 | sys.exit(0) |
---|
| 212 | blurb = 'rgGLM.py called with %s' % sys.argv |
---|
| 213 | print >> sys.stdout,blurb |
---|
| 214 | bfname = sys.argv[1] |
---|
| 215 | phename = sys.argv[2] |
---|
| 216 | title = sys.argv[3] |
---|
| 217 | title.translate(trantab) |
---|
| 218 | predvar = sys.argv[4] |
---|
| 219 | covar = sys.argv[5].strip() |
---|
| 220 | outfname = sys.argv[6] |
---|
| 221 | logfname = sys.argv[7] |
---|
| 222 | op = os.path.split(logfname)[0] |
---|
| 223 | try: # for test - needs this done |
---|
| 224 | os.makedirs(op) |
---|
| 225 | except: |
---|
| 226 | pass |
---|
| 227 | basename = sys.argv[8].translate(trantab) |
---|
| 228 | inter = sys.argv[9] == '1' |
---|
| 229 | cond = sys.argv[10].strip() |
---|
| 230 | if cond == 'None': |
---|
| 231 | cond = '' |
---|
| 232 | gender = sys.argv[11] == '1' |
---|
| 233 | mind = sys.argv[12] |
---|
| 234 | geno = sys.argv[13] |
---|
| 235 | maf = sys.argv[14] |
---|
| 236 | logistic = sys.argv[15].strip()=='1' |
---|
| 237 | gffout = sys.argv[16] |
---|
| 238 | me = sys.argv[0] |
---|
| 239 | phepath = '%s.pphe' % phename |
---|
| 240 | twd = tempfile.mkdtemp(suffix='rgGLM') # make sure plink doesn't spew log file into the root! |
---|
| 241 | tplog = os.path.join(twd,'%s.log' % basename) # should be path to plink log |
---|
| 242 | vcl = [plinke,'--noweb','--bfile',bfname,'--pheno-name','"%s"' % predvar,'--pheno', |
---|
| 243 | phepath,'--out',basename,'--mind %s' % mind, '--geno %s' % geno, |
---|
| 244 | '--maf %s' % maf] |
---|
| 245 | if logistic: |
---|
| 246 | vcl.append('--logistic') |
---|
| 247 | resf = '%s.assoc.logistic' % basename # plink output is here we hope |
---|
| 248 | else: |
---|
| 249 | vcl.append('--linear') |
---|
| 250 | resf = '%s.assoc.linear' % basename # plink output is here we hope |
---|
| 251 | resf = os.path.join(twd,resf) |
---|
| 252 | if gender: |
---|
| 253 | vcl.append('--sex') |
---|
| 254 | if inter: |
---|
| 255 | vcl.append('--interaction') |
---|
| 256 | if covar > 'None': |
---|
| 257 | vcl += ['--covar',phepath,'--covar-name',covar] # comma sep list of covariates |
---|
| 258 | tcfile = None |
---|
| 259 | if len(cond) > 0: # plink wants these in a file.. |
---|
| 260 | dummy,tcfile = tempfile.mkstemp(suffix='condlist') # |
---|
| 261 | f = open(tcfile,'w') |
---|
| 262 | cl = cond.split() |
---|
| 263 | f.write('\n'.join(cl)) |
---|
| 264 | f.write('\n') |
---|
| 265 | f.close() |
---|
| 266 | vcl.append('--condition-list %s' % tcfile) |
---|
| 267 | p=subprocess.Popen(' '.join(vcl),shell=True,cwd=twd) |
---|
| 268 | retval = p.wait() |
---|
| 269 | if tcfile: |
---|
| 270 | os.unlink(tcfile) |
---|
| 271 | plinklog = file(tplog,'r').read() |
---|
| 272 | logf = file(logfname,'w') |
---|
| 273 | logf.write(blurb) |
---|
| 274 | logf.write('\n') |
---|
| 275 | logf.write('vcl=%s\n' % vcl) |
---|
| 276 | xformQassoc(resf=resf,outfname=outfname,logf=logf,twd=twd) # leaves the desired summary file |
---|
| 277 | makeGFF(resf=outfname,outfname=gffout,logf=logf,twd=twd,name='rgGLM_TopTable',description=title,topn=topn) |
---|
| 278 | logf.write('\n') |
---|
| 279 | logf.write(plinklog) |
---|
| 280 | logf.close() |
---|
| 281 | #shutil.rmtree(twd) # clean up |
---|
| 282 | |
---|
| 283 | |
---|
| 284 | |
---|
| 285 | |
---|
| 286 | |
---|