[2] | 1 | """ |
---|
| 2 | July 1 2009 added relatedness filter - fo/oo or all |
---|
| 3 | released under the terms of the LGPL |
---|
| 4 | copyright ross lazarus August 2007 |
---|
| 5 | for the rgenetics project |
---|
| 6 | |
---|
| 7 | Special galaxy tool for the camp2007 data |
---|
| 8 | Allows grabbing genotypes from an arbitrary region |
---|
| 9 | |
---|
| 10 | Needs a mongo results file in the location hardwired below or could be passed in as |
---|
| 11 | a library parameter - but this file must have a very specific structure |
---|
| 12 | rs chrom offset float1...floatn |
---|
| 13 | |
---|
| 14 | called as |
---|
| 15 | |
---|
| 16 | <command interpreter="python2.4"> |
---|
| 17 | campRGeno2.py $region "$rslist" "$title" $output1 $log_file $userId "$lpedIn" "$lhistIn" |
---|
| 18 | </command> |
---|
| 19 | |
---|
| 20 | |
---|
| 21 | """ |
---|
| 22 | |
---|
| 23 | |
---|
| 24 | import sys, array, os, string |
---|
| 25 | from rgutils import galhtmlprefix,plinke,readMap |
---|
| 26 | |
---|
| 27 | progname = os.path.split(sys.argv[0])[1] |
---|
| 28 | |
---|
| 29 | |
---|
| 30 | atrandic = {'A':'1','C':'2','G':'3','T':'4','N':'0','-':'0','1':'1','2':'2','3':'3','4':'4','0':'0'} |
---|
| 31 | |
---|
| 32 | def doImport(outfile='test',flist=[]): |
---|
| 33 | """ import into one of the new html composite data types for Rgenetics |
---|
| 34 | Dan Blankenberg with mods by Ross Lazarus |
---|
| 35 | October 2007 |
---|
| 36 | """ |
---|
| 37 | out = open(outfile,'w') |
---|
| 38 | out.write(galhtmlprefix % progname) |
---|
| 39 | |
---|
| 40 | if len(flist) > 0: |
---|
| 41 | out.write('<ol>\n') |
---|
| 42 | for i, data in enumerate( flist ): |
---|
| 43 | out.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1])) |
---|
| 44 | out.write('</ol>\n') |
---|
| 45 | else: |
---|
| 46 | out.write('No files found') |
---|
| 47 | out.write("</div></body></html>") |
---|
| 48 | out.close() |
---|
| 49 | |
---|
| 50 | def setupPedFilter(relfilter='oo',dfile=None): |
---|
| 51 | """ figure out who to drop to satisfy relative filtering |
---|
| 52 | note single offspring only from each family id |
---|
| 53 | ordering of pdict keys makes this 'random' as the first one only is |
---|
| 54 | kept if there are multiple sibs from same familyid. |
---|
| 55 | """ |
---|
| 56 | dropId = {} |
---|
| 57 | keepoff = (relfilter == 'oo') |
---|
| 58 | keepfounder = (relfilter == 'fo') |
---|
| 59 | pdict = {} |
---|
| 60 | for row in dfile: |
---|
| 61 | rowl = row.strip().split() |
---|
| 62 | if len(rowl) > 6: |
---|
| 63 | idk = (rowl[0],rowl[1]) |
---|
| 64 | pa = (rowl[0],rowl[2]) # key for father |
---|
| 65 | ma = (rowl[0],rowl[3]) # and mother |
---|
| 66 | pdict[idk] = (pa,ma) |
---|
| 67 | dfile.seek(0) # rewind |
---|
| 68 | pk = pdict.keys() |
---|
| 69 | for p in pk: |
---|
| 70 | parents = pdict[p] |
---|
| 71 | if pdict.get(parents[0],None) or pdict.get(parents[1],None): # parents are in this file |
---|
| 72 | if keepfounder: |
---|
| 73 | dropId[p] = 1 # flag for removal |
---|
| 74 | elif keepoff: |
---|
| 75 | dropId[p] = 1 # flag for removal |
---|
| 76 | if keepoff: # TODO keep only a random offspring if many - rely on pdict keys being randomly ordered...! |
---|
| 77 | famseen = {} |
---|
| 78 | for p in pk: # look for multiples from same family - drop all but first |
---|
| 79 | famid = p[0] |
---|
| 80 | if famseen.get(famid,None): |
---|
| 81 | dropId[p] = 1 # already got one from this family |
---|
| 82 | famseen.setdefault(famid,1) |
---|
| 83 | return dropId |
---|
| 84 | |
---|
| 85 | def writeFped(rslist=[],outdir=None,title='Title',basename='',dfile=None,wewant=[],dropId={},outfile=None,logfile=None): |
---|
| 86 | """ fbat format version |
---|
| 87 | """ |
---|
| 88 | outname = os.path.join(outdir,basename) |
---|
| 89 | pedfname = '%s.ped' % outname |
---|
| 90 | ofile = file(pedfname, 'w') |
---|
| 91 | rsl = ' '.join(rslist) # rslist for fbat |
---|
| 92 | ofile.write(rsl) |
---|
| 93 | s = 'wrote %d marker header to %s - %s\n' % (len(rslist),pedfname,rsl[:50]) |
---|
| 94 | lf.write(s) |
---|
| 95 | ofile.write('\n') |
---|
| 96 | nrows = 0 |
---|
| 97 | for line in dfile: |
---|
| 98 | line = line.strip() |
---|
| 99 | if not line: |
---|
| 100 | continue |
---|
| 101 | line = line.replace('D','N') |
---|
| 102 | fields = line.split() |
---|
| 103 | preamble = fields[:6] |
---|
| 104 | idk = (preamble[0],preamble[1]) |
---|
| 105 | dropme = dropId.get(idk,None) |
---|
| 106 | if not dropme: |
---|
| 107 | g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant] |
---|
| 108 | g = ' '.join(g) |
---|
| 109 | g = g.split() # we'll get there |
---|
| 110 | g = [atrandic.get(x,'0') for x in g] # numeric alleles... |
---|
| 111 | # hack for framingham ND |
---|
| 112 | ofile.write('%s %s\n' % (' '.join(preamble), ' '.join(g))) |
---|
| 113 | nrows += 1 |
---|
| 114 | ofile.close() |
---|
| 115 | loglist = open(logfile,'r').readlines() # retrieve log to add to html file |
---|
| 116 | doImport(outfile,[pedfname,],loglist=loglist) |
---|
| 117 | return nrows,pedfname |
---|
| 118 | |
---|
| 119 | def writePed(markers=[],outdir=None,title='Title',basename='',dfile=None,wewant=[],dropId={},outfile=None,logfile=None): |
---|
| 120 | """ split out |
---|
| 121 | """ |
---|
| 122 | outname = os.path.join(outdir,basename) |
---|
| 123 | mapfname = '%s.map' % outname |
---|
| 124 | pedfname = '%s.ped' % outname |
---|
| 125 | ofile = file(pedfname, 'w') |
---|
| 126 | # make a map file in the lped library |
---|
| 127 | mf = file(mapfname,'w') |
---|
| 128 | map = ['%s\t%s\t0\t%d' % (x[0],x[2],x[1]) for x in markers] # chrom,abspos,snp in genomic order |
---|
| 129 | mf.write('%s\n' % '\n'.join(map)) |
---|
| 130 | mf.close() |
---|
| 131 | nrows = 0 |
---|
| 132 | for line in dfile: |
---|
| 133 | line = line.strip() |
---|
| 134 | if not line: |
---|
| 135 | continue |
---|
| 136 | #line = line.replace('D','N') |
---|
| 137 | fields = line.split() |
---|
| 138 | preamble = fields[:6] |
---|
| 139 | idk = (preamble[0],preamble[1]) |
---|
| 140 | dropme = dropId.get(idk,None) |
---|
| 141 | if not dropme: |
---|
| 142 | g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant] |
---|
| 143 | g = ' '.join(g) |
---|
| 144 | g = g.split() # we'll get there |
---|
| 145 | g = [atrandic.get(x,'0') for x in g] # numeric alleles... |
---|
| 146 | # hack for framingham ND |
---|
| 147 | ofile.write('%s %s\n' % (' '.join(preamble), ' '.join(g))) |
---|
| 148 | nrows += 1 |
---|
| 149 | ofile.close() |
---|
| 150 | loglist = open(logfile,'r').readlines() # retrieve log to add to html file |
---|
| 151 | doImport(outfile,[mapfname,pedfname,logfile]) |
---|
| 152 | return nrows,pedfname |
---|
| 153 | |
---|
| 154 | def subset(): |
---|
| 155 | """ ### Sanity check the arguments |
---|
| 156 | now passed in as |
---|
| 157 | <command interpreter="python"> |
---|
| 158 | rgPedSub.py $script_file |
---|
| 159 | </command> |
---|
| 160 | |
---|
| 161 | with |
---|
| 162 | <configfiles> |
---|
| 163 | <configfile name="script_file"> |
---|
| 164 | title~~~~$title |
---|
| 165 | output1~~~~$output1 |
---|
| 166 | log_file~~~~$log_file |
---|
| 167 | userId~~~~$userId |
---|
| 168 | outformat~~~~$outformat |
---|
| 169 | outdir~~~~$output1.extra_files_path |
---|
| 170 | relfilter~~~~$relfilter |
---|
| 171 | #if $d.source=='library' |
---|
| 172 | inped~~~~$d.lpedIn |
---|
| 173 | #else |
---|
| 174 | inped~~~~$d.lhistIn.extra_files_path/$d.lhistIn.metadata.base_name |
---|
| 175 | #end if |
---|
| 176 | #if $m.mtype=='grslist' |
---|
| 177 | rslist~~~~$m.rslist |
---|
| 178 | region~~~~ |
---|
| 179 | #else |
---|
| 180 | rslist~~~~ |
---|
| 181 | region~~~~$m.region |
---|
| 182 | #end if |
---|
| 183 | </configfile> |
---|
| 184 | </configfiles> |
---|
| 185 | """ |
---|
| 186 | sep = '~~~~' # arbitrary choice |
---|
| 187 | conf = {} |
---|
| 188 | if len(sys.argv) < 2: |
---|
| 189 | print >> sys.stderr, "No configuration file passed as a parameter - cannot run" |
---|
| 190 | sys.exit(1) |
---|
| 191 | configf = sys.argv[1] |
---|
| 192 | config = file(configf,'r').readlines() |
---|
| 193 | for row in config: |
---|
| 194 | row = row.strip() |
---|
| 195 | if len(row) > 0: |
---|
| 196 | try: |
---|
| 197 | key,valu = row.split(sep) |
---|
| 198 | conf[key] = valu |
---|
| 199 | except: |
---|
| 200 | pass |
---|
| 201 | ss = '%s%s' % (string.punctuation,string.whitespace) |
---|
| 202 | ptran = string.maketrans(ss,'_'*len(ss)) |
---|
| 203 | ### Figure out what genomic region we are interested in |
---|
| 204 | region = conf.get('region','') |
---|
| 205 | orslist = conf.get('rslist','').replace('X',' ').lower() |
---|
| 206 | orslist = orslist.replace(',',' ').lower() |
---|
| 207 | # galaxy replaces newlines with XX - go figure |
---|
| 208 | title = conf.get('title','').translate(ptran) # for outputs |
---|
| 209 | outfile = conf.get('output1','') |
---|
| 210 | outdir = conf.get('outdir','') |
---|
| 211 | try: |
---|
| 212 | os.makedirs(outdir) |
---|
| 213 | except: |
---|
| 214 | pass |
---|
| 215 | outformat = conf.get('outformat','lped') |
---|
| 216 | basename = conf.get('basename',title) |
---|
| 217 | logfile = os.path.join(outdir,'%s.log' % title) |
---|
| 218 | userId = conf.get('userId','') # for library |
---|
| 219 | pedFileBase = conf.get('inped','') |
---|
| 220 | relfilter = conf.get('relfilter','') |
---|
| 221 | MAP_FILE = '%s.map' % pedFileBase |
---|
| 222 | DATA_FILE = '%s.ped' % pedFileBase |
---|
| 223 | title = conf.get('title','lped subset') |
---|
| 224 | lf = file(logfile,'w') |
---|
| 225 | lf.write('config file %s = \n' % configf) |
---|
| 226 | lf.write(''.join(config)) |
---|
| 227 | c = '' |
---|
| 228 | spos = epos = 0 |
---|
| 229 | rslist = [] |
---|
| 230 | rsdict = {} |
---|
| 231 | if region > '': |
---|
| 232 | try: # TODO make a regexp? |
---|
| 233 | c,rest = region.split(':') |
---|
| 234 | c = c.replace('chr','') |
---|
| 235 | rest = rest.replace(',','') # remove commas |
---|
| 236 | spos,epos = rest.split('-') |
---|
| 237 | spos = int(spos) |
---|
| 238 | epos = int(epos) |
---|
| 239 | s = '## %s parsing chrom %s from %d to %d\n' % (progname,c,spos,epos) |
---|
| 240 | lf.write(s) |
---|
| 241 | except: |
---|
| 242 | s = '##! %s unable to parse region %s - MUST look like "chr8:10,000-100,000\n' % (progname,region) |
---|
| 243 | lf.write(s) |
---|
| 244 | lf.close() |
---|
| 245 | sys.exit(1) |
---|
| 246 | else: |
---|
| 247 | rslist = orslist.split() # galaxy replaces newlines with XX - go figure |
---|
| 248 | rsdict = dict(zip(rslist,rslist)) |
---|
| 249 | allmarkers = False |
---|
| 250 | if len(rslist) == 0 and epos == 0: # must be a full extract - presumably remove relateds or something |
---|
| 251 | allmarkers = True |
---|
| 252 | ### Figure out which markers are in this region |
---|
| 253 | markers,snpcols,rslist,rsdict = readMap(mapfile=MAP_FILE,allmarkers=allmarkers,rsdict=rsdict,c=c,spos=spos,epos=epos) |
---|
| 254 | if len(rslist) == 0: |
---|
| 255 | s = '##! %s found no rs numbers in %s\n' % (progname,sys.argv[1:3]) |
---|
| 256 | lf.write(s) |
---|
| 257 | lf.write('\n') |
---|
| 258 | lf.close() |
---|
| 259 | sys.exit(1) |
---|
| 260 | s = '## %s looking for %d rs (%s....etc)\n' % (progname,len(rslist),rslist[:5]) |
---|
| 261 | lf.write(s) |
---|
| 262 | try: |
---|
| 263 | dfile = open(DATA_FILE, 'r') |
---|
| 264 | except: # bad input file name? |
---|
| 265 | s = '##! rgPedSub unable to open file %s\n' % (DATA_FILE) |
---|
| 266 | lf.write(s) |
---|
| 267 | lf.write('\n') |
---|
| 268 | lf.close() |
---|
| 269 | print >> sys.stdout, s |
---|
| 270 | raise |
---|
| 271 | sys.exit(1) |
---|
| 272 | if relfilter <> 'all': # must read pedigree and figure out who to drop |
---|
| 273 | dropId = setupPedFilter(relfilter=relfilter,dfile=dfile) |
---|
| 274 | else: |
---|
| 275 | dropId = {} |
---|
| 276 | wewant = [(6+(2*snpcols[x])) for x in rslist] # |
---|
| 277 | # column indices of first geno of each marker pair to get the markers into genomic |
---|
| 278 | ### ... and then parse the rest of the ped file to pull out |
---|
| 279 | ### the genotypes for all subjects for those markers |
---|
| 280 | # /usr/local/galaxy/data/rg/1/lped/ |
---|
| 281 | if len(dropId.keys()) > 0: |
---|
| 282 | s = '## dropped the following subjects to satisfy requirement that relfilter = %s\n' % relfilter |
---|
| 283 | lf.write(s) |
---|
| 284 | if relfilter == 'oo': |
---|
| 285 | s = '## note that one random offspring from each family was kept if there were multiple offspring\n' |
---|
| 286 | lf.write(s) |
---|
| 287 | s = 'FamilyId\tSubjectId\n' |
---|
| 288 | lf.write(s) |
---|
| 289 | dk = dropId.keys() |
---|
| 290 | dk.sort() |
---|
| 291 | for k in dk: |
---|
| 292 | s = '%s\t%s\n' % (k[0],k[1]) |
---|
| 293 | lf.write(s) |
---|
| 294 | lf.write('\n') |
---|
| 295 | lf.close() |
---|
| 296 | if outformat == 'lped': |
---|
| 297 | nrows,pedfname=writePed(markers=markers,outdir=outdir,title=title,basename=basename,dfile=dfile, |
---|
| 298 | wewant=wewant,dropId=dropId,outfile=outfile,logfile=logfile) |
---|
| 299 | elif outformat == 'fped': |
---|
| 300 | nrows,pedfname=writeFped(rslist=rslist,outdir=outdir,title=title,basename=basename,dfile=dfile, |
---|
| 301 | wewant=wewant,dropId=dropId,outfile=outfile,logfile=logfile) |
---|
| 302 | dfile.close() |
---|
| 303 | |
---|
| 304 | if __name__ == "__main__": |
---|
| 305 | subset() |
---|