[2] | 1 | #!/usr/local/bin/python |
---|
| 2 | # hack to run and process a plink case control association |
---|
| 3 | # expects args as |
---|
| 4 | # bfilepath outname jobname outformat (wig,xls) |
---|
| 5 | # ross lazarus |
---|
| 6 | # for wig files, we need annotation so look for map file or complain |
---|
| 7 | """ |
---|
| 8 | Parameters for wiggle track definition lines |
---|
| 9 | All options are placed in a single line separated by spaces: |
---|
| 10 | |
---|
| 11 | track type=wiggle_0 name=track_label description=center_label \ |
---|
| 12 | visibility=display_mode color=r,g,b altColor=r,g,b \ |
---|
| 13 | priority=priority autoScale=on|off \ |
---|
| 14 | gridDefault=on|off maxHeightPixels=max:default:min \ |
---|
| 15 | graphType=bar|points viewLimits=lower:upper \ |
---|
| 16 | yLineMark=real-value yLineOnOff=on|off \ |
---|
| 17 | windowingFunction=maximum|mean|minimum smoothingWindow=off|2-16 |
---|
| 18 | """ |
---|
| 19 | |
---|
| 20 | import sys,math,shutil,subprocess,os,time,tempfile,string |
---|
| 21 | from os.path import abspath |
---|
| 22 | from rgutils import timenow, plinke |
---|
| 23 | |
---|
| 24 | imagedir = '/static/rg' # if needed for images |
---|
| 25 | myversion = 'V000.1 April 2007' |
---|
| 26 | verbose = False |
---|
| 27 | |
---|
| 28 | def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000): |
---|
| 29 | """ |
---|
| 30 | score must be scaled to 0-1000 |
---|
| 31 | |
---|
| 32 | Want to make some wig tracks from each analysis |
---|
| 33 | Best n -log10(p). Make top hit the window. |
---|
| 34 | we use our tab output which has |
---|
| 35 | rs chrom offset ADD_stat ADD_p ADD_log10p |
---|
| 36 | rs3094315 1 792429 1.151 0.2528 0.597223 |
---|
| 37 | |
---|
| 38 | """ |
---|
| 39 | |
---|
| 40 | def is_number(s): |
---|
| 41 | try: |
---|
| 42 | float(s) |
---|
| 43 | return True |
---|
| 44 | except ValueError: |
---|
| 45 | return False |
---|
| 46 | header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description) |
---|
| 47 | column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ] |
---|
| 48 | halfwidth=100 |
---|
| 49 | resfpath = os.path.join(twd,resf) |
---|
| 50 | resf = open(resfpath,'r') |
---|
| 51 | resfl = resf.readlines() # dumb but convenient for millions of rows |
---|
| 52 | resfl = [x.split() for x in resfl] |
---|
| 53 | headl = resfl[0] |
---|
| 54 | resfl = resfl[1:] |
---|
| 55 | headl = [x.strip().upper() for x in headl] |
---|
| 56 | headIndex = dict(zip(headl,range(0,len(headl)))) |
---|
| 57 | chrpos = headIndex.get('CHR',None) |
---|
| 58 | rspos = headIndex.get('RS',None) |
---|
| 59 | offspos = headIndex.get('OFFSET',None) |
---|
| 60 | ppos = headIndex.get('LOG10ARMITAGEP',None) |
---|
| 61 | wewant = [chrpos,rspos,offspos,ppos] |
---|
| 62 | if None in wewant: # missing something |
---|
| 63 | logf.write('### Error missing a required header in makeGFF - headIndex=%s\n' % headIndex) |
---|
| 64 | return |
---|
| 65 | resfl = [x for x in resfl if x[ppos] > ''] |
---|
| 66 | resfl = [(float(x[ppos]),x) for x in resfl] # decorate |
---|
| 67 | resfl.sort() |
---|
| 68 | resfl.reverse() # using -log10 so larger is better |
---|
| 69 | resfl = resfl[:topn] # truncate |
---|
| 70 | pvals = [x[0] for x in resfl] # need to scale |
---|
| 71 | resfl = [x[1] for x in resfl] # drop decoration |
---|
| 72 | maxp = max(pvals) # need to scale |
---|
| 73 | minp = min(pvals) |
---|
| 74 | prange = abs(maxp-minp) + 0.5 # fudge |
---|
| 75 | scalefact = 1000.0/prange |
---|
| 76 | logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact)) |
---|
| 77 | for i,row in enumerate(resfl): |
---|
| 78 | row[ppos] = '%d' % (int(scalefact*pvals[i])) |
---|
| 79 | resfl[i] = row # replace |
---|
| 80 | outf = file(outfname,'w') |
---|
| 81 | outf.write(header) |
---|
| 82 | outres = [] # need to resort into chrom offset order |
---|
| 83 | for i,lrow in enumerate(resfl): |
---|
| 84 | chrom,snp,offset,p, = [lrow[x] for x in wewant] |
---|
| 85 | gff = ('chr%s' % chrom,'rgGLM','variation','%d' % (int(offset)-halfwidth), |
---|
| 86 | '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i])) |
---|
| 87 | outres.append(gff) |
---|
| 88 | outres = [(x[0],int(x[3]),x) for x in outres] # decorate |
---|
| 89 | outres.sort() # into chrom offset |
---|
| 90 | outres=[x[2] for x in outres] # undecorate |
---|
| 91 | outres = ['\t'.join(x) for x in outres] |
---|
| 92 | outf.write('\n'.join(outres)) |
---|
| 93 | outf.write('\n') |
---|
| 94 | outf.close() |
---|
| 95 | |
---|
| 96 | |
---|
| 97 | |
---|
| 98 | def plink_assocToGG(plinkout="hm",tag='test'): |
---|
| 99 | """ plink --assoc output looks like this |
---|
| 100 | # CHR SNP A1 F_A F_U A2 CHISQ P OR |
---|
| 101 | # 1 rs3094315 G 0.6685 0.1364 A 104.1 1.929e-24 12.77 |
---|
| 102 | # write as a genegraph input file |
---|
| 103 | """ |
---|
| 104 | inf = file('%s.assoc' % plinkout,'r') |
---|
| 105 | outf = file('%sassoc.xls' % plinkout,'w') |
---|
| 106 | res = ['rs\tlog10p%s\tFakeInvOR%s\tRealOR%s' % (tag,tag,tag),] # output header for ucsc genome graphs |
---|
| 107 | head = inf.next() |
---|
| 108 | for l in inf: |
---|
| 109 | ll = l.split() |
---|
| 110 | if len(ll) >= 8: |
---|
| 111 | p = float(ll[7]) |
---|
| 112 | if p <> 'NA': # eesh |
---|
| 113 | logp = '%9.9f' % -math.log10(p) |
---|
| 114 | else: |
---|
| 115 | logp = 'NA' |
---|
| 116 | try: |
---|
| 117 | orat = ll[8] |
---|
| 118 | except: |
---|
| 119 | orat = 'NA' |
---|
| 120 | orat2 = orat |
---|
| 121 | # invert large negative odds ratios |
---|
| 122 | if float(orat) < 1 and float(orat) > 0.0: |
---|
| 123 | orat2 = '%9.9f' % (1.0/float(orat)) |
---|
| 124 | outl = [ll[1],logp, orat2, orat] |
---|
| 125 | res.append('\t'.join(outl)) |
---|
| 126 | outf.write('\n'.join(res)) |
---|
| 127 | outf.write('\n') |
---|
| 128 | outf.close() |
---|
| 129 | inf.close() |
---|
| 130 | |
---|
| 131 | def xformModel(infname='',resf='',outfname='', |
---|
| 132 | name='foo',mapf='/usr/local/galaxy/data/rg/ped/x.bim',flog=None): |
---|
| 133 | """munge a plink .model file into either a ucsc track or an xls file |
---|
| 134 | rerla@meme ~/plink]$ head hmYRI_CEU.model |
---|
| 135 | CHR SNP TEST AFF UNAFF CHISQ DF P |
---|
| 136 | 1 rs3094315 GENO 41/37/11 0/24/64 NA NA NA |
---|
| 137 | 1 rs3094315 TREND 119/59 24/152 81.05 1 2.201e-19 |
---|
| 138 | 1 rs3094315 ALLELIC 119/59 24/152 104.1 1 1.929e-24 |
---|
| 139 | 1 rs3094315 DOM 78/11 24/64 NA NA NA |
---|
| 140 | |
---|
| 141 | bim file has |
---|
| 142 | [rerla@beast pbed]$ head plink_wgas1_example.bim |
---|
| 143 | 1 rs3094315 0.792429 792429 G A |
---|
| 144 | 1 rs6672353 0.817376 817376 A G |
---|
| 145 | """ |
---|
| 146 | if verbose: |
---|
| 147 | print 'Rgenetics rgCaCo.xformModel got resf=%s, outfname=%s' % (resf,outfname) |
---|
| 148 | res = [] |
---|
| 149 | rsdict = {} |
---|
| 150 | map = file(mapf,'r') |
---|
| 151 | for l in map: # plink map |
---|
| 152 | ll = l.strip().split() |
---|
| 153 | if len(ll) >= 3: |
---|
| 154 | rs=ll[1].strip() |
---|
| 155 | chrom = ll[0] |
---|
| 156 | if chrom.lower() == 'x': |
---|
| 157 | chrom='23' |
---|
| 158 | elif chrom.lower() == 'y': |
---|
| 159 | chrom = 24 |
---|
| 160 | elif chrom.lower() == 'mito': |
---|
| 161 | chrom = 25 |
---|
| 162 | offset = ll[3] |
---|
| 163 | rsdict[rs] = (chrom,offset) |
---|
| 164 | res.append('rs\tChr\tOffset\tGenop\tlog10Genop\tArmitagep\tlog10Armitagep\tAllelep\tlog10Allelep\tDomp\tlog10Domp') |
---|
| 165 | f = open(resf,'r') |
---|
| 166 | headl = f.readline() |
---|
| 167 | if headl.find('\t') <> -1: |
---|
| 168 | headl = headl.split('\t') |
---|
| 169 | delim = '\t' |
---|
| 170 | else: |
---|
| 171 | headl = headl.split() |
---|
| 172 | delim = None |
---|
| 173 | chrpos = headl.index('CHR') |
---|
| 174 | rspos = headl.index('SNP') |
---|
| 175 | testpos = headl.index('TEST') |
---|
| 176 | naffpos = headl.index('AFF') |
---|
| 177 | nuaffpos = headl.index('UNAFF') |
---|
| 178 | chisqpos = headl.index('CHISQ') |
---|
| 179 | ppos = headl.index('P') |
---|
| 180 | wewant = [chrpos,rspos,testpos,naffpos,nuaffpos,chisqpos,ppos] |
---|
| 181 | llen = len(headl) |
---|
| 182 | lnum = anum = 0 |
---|
| 183 | lastsnp = None # so we know when to write out a gg line |
---|
| 184 | outl = {} |
---|
| 185 | f.seek(0) |
---|
| 186 | for lnum,l in enumerate(f): |
---|
| 187 | if lnum == 0: |
---|
| 188 | continue |
---|
| 189 | ll = l.split() |
---|
| 190 | if delim: |
---|
| 191 | ll = l.split(delim) |
---|
| 192 | if len(ll) >= llen: # valid line |
---|
| 193 | chr,snp,test,naff,nuaff,chi,p = [ll[x] for x in wewant] |
---|
| 194 | snp = snp.strip() |
---|
| 195 | chrom,offset = rsdict.get(snp,(None,None)) |
---|
| 196 | anum += 1 |
---|
| 197 | fp = 1.0 # if NA |
---|
| 198 | lp = 0.0 |
---|
| 199 | try: |
---|
| 200 | fp = float(p) |
---|
| 201 | if fp > 0: |
---|
| 202 | lp = -math.log10(fp) |
---|
| 203 | else: |
---|
| 204 | fp = 9e-100 |
---|
| 205 | flog.write('### WARNING - Plink calculated %s for %s p value!!! 9e-100 substituted!\n' % (p,test)) |
---|
| 206 | flog.write('### offending line #%d in %s = %s' % (lnum,l)) |
---|
| 207 | except: |
---|
| 208 | pass |
---|
| 209 | if snp <> lastsnp: |
---|
| 210 | if len(outl.keys()) > 3: |
---|
| 211 | sl = [outl.get(x,'?') for x in ('snp','chrom','offset','GENO','TREND','ALLELIC','DOM')] |
---|
| 212 | res.append('\t'.join(sl)) # last snp line |
---|
| 213 | outl = {'snp':snp,'chrom':chrom,'offset':offset} # first 3 cols for gg line |
---|
| 214 | lastsnp = snp # reset for next marker |
---|
| 215 | #if p == 'NA': |
---|
| 216 | # p = 1.0 |
---|
| 217 | # let's pass downstream for handling R is fine? |
---|
| 218 | outl[test] = '%s\t%f' % (p,lp) |
---|
| 219 | if len(outl.keys()) > 3: |
---|
| 220 | l = [outl.get(x,'?') for x in ('snp','chrom','offset','GENO','TREND','ALLELIC','DOM')] |
---|
| 221 | res.append('\t'.join(l)) # last snp line |
---|
| 222 | f = file(outfname,'w') |
---|
| 223 | res.append('') |
---|
| 224 | f.write('\n'.join(res)) |
---|
| 225 | f.close() |
---|
| 226 | |
---|
| 227 | |
---|
| 228 | |
---|
| 229 | |
---|
| 230 | if __name__ == "__main__": |
---|
| 231 | """ |
---|
| 232 | # called as |
---|
| 233 | <command interpreter="python"> |
---|
| 234 | rgCaCo.py '$i.extra_files_path/$i.metadata.base_name' "$name" |
---|
| 235 | '$out_file1' '$logf' '$logf.files_path' '$gffout' |
---|
| 236 | </command> </command> |
---|
| 237 | """ |
---|
| 238 | if len(sys.argv) < 7: |
---|
| 239 | s = 'rgCaCo.py needs 6 params - got %s \n' % (sys.argv) |
---|
| 240 | print >> sys.stdout, s |
---|
| 241 | sys.exit(0) |
---|
| 242 | bfname = sys.argv[1] |
---|
| 243 | name = sys.argv[2] |
---|
| 244 | killme = string.punctuation + string.whitespace |
---|
| 245 | trantab = string.maketrans(killme,'_'*len(killme)) |
---|
| 246 | name = name.translate(trantab) |
---|
| 247 | outfname = sys.argv[3] |
---|
| 248 | logf = sys.argv[4] |
---|
| 249 | logoutdir = sys.argv[5] |
---|
| 250 | gffout = sys.argv[6] |
---|
| 251 | topn = 1000 |
---|
| 252 | try: |
---|
| 253 | os.makedirs(logoutdir) |
---|
| 254 | except: |
---|
| 255 | pass |
---|
| 256 | map_file = None |
---|
| 257 | me = sys.argv[0] |
---|
| 258 | amapf = '%s.bim' % bfname # to decode map in xformModel |
---|
| 259 | flog = file(logf,'w') |
---|
| 260 | logme = [] |
---|
| 261 | cdir = os.getcwd() |
---|
| 262 | s = 'Rgenetics %s http://rgenetics.org Galaxy Tools, rgCaCo.py started %s\n' % (myversion,timenow()) |
---|
| 263 | print >> sys.stdout, s # so will appear as blurb for file |
---|
| 264 | logme.append(s) |
---|
| 265 | if verbose: |
---|
| 266 | s = 'rgCaCo.py: bfname=%s, logf=%s, argv = %s\n' % (bfname, logf, sys.argv) |
---|
| 267 | print >> sys.stdout, s # so will appear as blurb for file |
---|
| 268 | logme.append(s) |
---|
| 269 | twd = tempfile.mkdtemp(suffix='rgCaCo') # make sure plink doesn't spew log file into the root! |
---|
| 270 | tname = os.path.join(twd,name) |
---|
| 271 | vcl = [plinke,'--noweb','--bfile',bfname,'--out',name,'--model'] |
---|
| 272 | p=subprocess.Popen(' '.join(vcl),shell=True,stdout=flog,cwd=twd) |
---|
| 273 | retval = p.wait() |
---|
| 274 | resf = '%s.model' % tname # plink output is here we hope |
---|
| 275 | xformModel(bfname,resf,outfname,name,amapf,flog) # leaves the desired summary file |
---|
| 276 | makeGFF(resf=outfname,outfname=gffout,logf=flog,twd=twd,name='rgGLM_TopTable',description=name,topn=topn) |
---|
| 277 | flog.write('\n'.join(logme)) |
---|
| 278 | flog.close() # close the log used |
---|
| 279 | #shutil.copytree(twd,logoutdir) |
---|
| 280 | shutil.rmtree(twd) # clean up |
---|
| 281 | |
---|