1 | """ |
---|
2 | July 1 2009 added relatedness filter - fo/oo or all |
---|
3 | released under the terms of the LGPL |
---|
4 | copyright ross lazarus August 2007 |
---|
5 | for the rgenetics project |
---|
6 | |
---|
7 | Special galaxy tool for the camp2007 data |
---|
8 | Allows grabbing genotypes from an arbitrary region |
---|
9 | |
---|
10 | Needs a mongo results file in the location hardwired below or could be passed in as |
---|
11 | a library parameter - but this file must have a very specific structure |
---|
12 | rs chrom offset float1...floatn |
---|
13 | |
---|
14 | called as |
---|
15 | |
---|
16 | <command interpreter="python2.4"> |
---|
17 | campRGeno2.py $region "$rslist" "$title" $output1 $log_file $userId "$lpedIn" "$lhistIn" |
---|
18 | </command> |
---|
19 | |
---|
20 | |
---|
21 | """ |
---|
22 | |
---|
23 | |
---|
24 | import sys, array, os, string |
---|
25 | from rgutils import galhtmlprefix,plinke,readMap |
---|
26 | |
---|
27 | progname = os.path.split(sys.argv[0])[1] |
---|
28 | |
---|
29 | |
---|
30 | atrandic = {'A':'1','C':'2','G':'3','T':'4','N':'0','-':'0','1':'1','2':'2','3':'3','4':'4','0':'0'} |
---|
31 | |
---|
32 | def doImport(outfile='test',flist=[]): |
---|
33 | """ import into one of the new html composite data types for Rgenetics |
---|
34 | Dan Blankenberg with mods by Ross Lazarus |
---|
35 | October 2007 |
---|
36 | """ |
---|
37 | out = open(outfile,'w') |
---|
38 | out.write(galhtmlprefix % progname) |
---|
39 | |
---|
40 | if len(flist) > 0: |
---|
41 | out.write('<ol>\n') |
---|
42 | for i, data in enumerate( flist ): |
---|
43 | out.write('<li><a href="%s">%s</a></li>\n' % (os.path.split(data)[-1],os.path.split(data)[-1])) |
---|
44 | out.write('</ol>\n') |
---|
45 | else: |
---|
46 | out.write('No files found') |
---|
47 | out.write("</div></body></html>") |
---|
48 | out.close() |
---|
49 | |
---|
50 | def setupPedFilter(relfilter='oo',dfile=None): |
---|
51 | """ figure out who to drop to satisfy relative filtering |
---|
52 | note single offspring only from each family id |
---|
53 | ordering of pdict keys makes this 'random' as the first one only is |
---|
54 | kept if there are multiple sibs from same familyid. |
---|
55 | """ |
---|
56 | dropId = {} |
---|
57 | keepoff = (relfilter == 'oo') |
---|
58 | keepfounder = (relfilter == 'fo') |
---|
59 | pdict = {} |
---|
60 | for row in dfile: |
---|
61 | rowl = row.strip().split() |
---|
62 | if len(rowl) > 6: |
---|
63 | idk = (rowl[0],rowl[1]) |
---|
64 | pa = (rowl[0],rowl[2]) # key for father |
---|
65 | ma = (rowl[0],rowl[3]) # and mother |
---|
66 | pdict[idk] = (pa,ma) |
---|
67 | dfile.seek(0) # rewind |
---|
68 | pk = pdict.keys() |
---|
69 | for p in pk: |
---|
70 | parents = pdict[p] |
---|
71 | if pdict.get(parents[0],None) or pdict.get(parents[1],None): # parents are in this file |
---|
72 | if keepfounder: |
---|
73 | dropId[p] = 1 # flag for removal |
---|
74 | elif keepoff: |
---|
75 | dropId[p] = 1 # flag for removal |
---|
76 | if keepoff: # TODO keep only a random offspring if many - rely on pdict keys being randomly ordered...! |
---|
77 | famseen = {} |
---|
78 | for p in pk: # look for multiples from same family - drop all but first |
---|
79 | famid = p[0] |
---|
80 | if famseen.get(famid,None): |
---|
81 | dropId[p] = 1 # already got one from this family |
---|
82 | famseen.setdefault(famid,1) |
---|
83 | return dropId |
---|
84 | |
---|
85 | def writeFped(rslist=[],outdir=None,title='Title',basename='',dfile=None,wewant=[],dropId={},outfile=None,logfile=None): |
---|
86 | """ fbat format version |
---|
87 | """ |
---|
88 | outname = os.path.join(outdir,basename) |
---|
89 | pedfname = '%s.ped' % outname |
---|
90 | ofile = file(pedfname, 'w') |
---|
91 | rsl = ' '.join(rslist) # rslist for fbat |
---|
92 | ofile.write(rsl) |
---|
93 | s = 'wrote %d marker header to %s - %s\n' % (len(rslist),pedfname,rsl[:50]) |
---|
94 | lf.write(s) |
---|
95 | ofile.write('\n') |
---|
96 | nrows = 0 |
---|
97 | for line in dfile: |
---|
98 | line = line.strip() |
---|
99 | if not line: |
---|
100 | continue |
---|
101 | line = line.replace('D','N') |
---|
102 | fields = line.split() |
---|
103 | preamble = fields[:6] |
---|
104 | idk = (preamble[0],preamble[1]) |
---|
105 | dropme = dropId.get(idk,None) |
---|
106 | if not dropme: |
---|
107 | g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant] |
---|
108 | g = ' '.join(g) |
---|
109 | g = g.split() # we'll get there |
---|
110 | g = [atrandic.get(x,'0') for x in g] # numeric alleles... |
---|
111 | # hack for framingham ND |
---|
112 | ofile.write('%s %s\n' % (' '.join(preamble), ' '.join(g))) |
---|
113 | nrows += 1 |
---|
114 | ofile.close() |
---|
115 | loglist = open(logfile,'r').readlines() # retrieve log to add to html file |
---|
116 | doImport(outfile,[pedfname,],loglist=loglist) |
---|
117 | return nrows,pedfname |
---|
118 | |
---|
119 | def writePed(markers=[],outdir=None,title='Title',basename='',dfile=None,wewant=[],dropId={},outfile=None,logfile=None): |
---|
120 | """ split out |
---|
121 | """ |
---|
122 | outname = os.path.join(outdir,basename) |
---|
123 | mapfname = '%s.map' % outname |
---|
124 | pedfname = '%s.ped' % outname |
---|
125 | ofile = file(pedfname, 'w') |
---|
126 | # make a map file in the lped library |
---|
127 | mf = file(mapfname,'w') |
---|
128 | map = ['%s\t%s\t0\t%d' % (x[0],x[2],x[1]) for x in markers] # chrom,abspos,snp in genomic order |
---|
129 | mf.write('%s\n' % '\n'.join(map)) |
---|
130 | mf.close() |
---|
131 | nrows = 0 |
---|
132 | for line in dfile: |
---|
133 | line = line.strip() |
---|
134 | if not line: |
---|
135 | continue |
---|
136 | #line = line.replace('D','N') |
---|
137 | fields = line.split() |
---|
138 | preamble = fields[:6] |
---|
139 | idk = (preamble[0],preamble[1]) |
---|
140 | dropme = dropId.get(idk,None) |
---|
141 | if not dropme: |
---|
142 | g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant] |
---|
143 | g = ' '.join(g) |
---|
144 | g = g.split() # we'll get there |
---|
145 | g = [atrandic.get(x,'0') for x in g] # numeric alleles... |
---|
146 | # hack for framingham ND |
---|
147 | ofile.write('%s %s\n' % (' '.join(preamble), ' '.join(g))) |
---|
148 | nrows += 1 |
---|
149 | ofile.close() |
---|
150 | loglist = open(logfile,'r').readlines() # retrieve log to add to html file |
---|
151 | doImport(outfile,[mapfname,pedfname,logfile]) |
---|
152 | return nrows,pedfname |
---|
153 | |
---|
154 | def subset(): |
---|
155 | """ ### Sanity check the arguments |
---|
156 | now passed in as |
---|
157 | <command interpreter="python"> |
---|
158 | rgPedSub.py $script_file |
---|
159 | </command> |
---|
160 | |
---|
161 | with |
---|
162 | <configfiles> |
---|
163 | <configfile name="script_file"> |
---|
164 | title~~~~$title |
---|
165 | output1~~~~$output1 |
---|
166 | log_file~~~~$log_file |
---|
167 | userId~~~~$userId |
---|
168 | outformat~~~~$outformat |
---|
169 | outdir~~~~$output1.extra_files_path |
---|
170 | relfilter~~~~$relfilter |
---|
171 | #if $d.source=='library' |
---|
172 | inped~~~~$d.lpedIn |
---|
173 | #else |
---|
174 | inped~~~~$d.lhistIn.extra_files_path/$d.lhistIn.metadata.base_name |
---|
175 | #end if |
---|
176 | #if $m.mtype=='grslist' |
---|
177 | rslist~~~~$m.rslist |
---|
178 | region~~~~ |
---|
179 | #else |
---|
180 | rslist~~~~ |
---|
181 | region~~~~$m.region |
---|
182 | #end if |
---|
183 | </configfile> |
---|
184 | </configfiles> |
---|
185 | """ |
---|
186 | sep = '~~~~' # arbitrary choice |
---|
187 | conf = {} |
---|
188 | if len(sys.argv) < 2: |
---|
189 | print >> sys.stderr, "No configuration file passed as a parameter - cannot run" |
---|
190 | sys.exit(1) |
---|
191 | configf = sys.argv[1] |
---|
192 | config = file(configf,'r').readlines() |
---|
193 | for row in config: |
---|
194 | row = row.strip() |
---|
195 | if len(row) > 0: |
---|
196 | try: |
---|
197 | key,valu = row.split(sep) |
---|
198 | conf[key] = valu |
---|
199 | except: |
---|
200 | pass |
---|
201 | ss = '%s%s' % (string.punctuation,string.whitespace) |
---|
202 | ptran = string.maketrans(ss,'_'*len(ss)) |
---|
203 | ### Figure out what genomic region we are interested in |
---|
204 | region = conf.get('region','') |
---|
205 | orslist = conf.get('rslist','').replace('X',' ').lower() |
---|
206 | orslist = orslist.replace(',',' ').lower() |
---|
207 | # galaxy replaces newlines with XX - go figure |
---|
208 | title = conf.get('title','').translate(ptran) # for outputs |
---|
209 | outfile = conf.get('output1','') |
---|
210 | outdir = conf.get('outdir','') |
---|
211 | try: |
---|
212 | os.makedirs(outdir) |
---|
213 | except: |
---|
214 | pass |
---|
215 | outformat = conf.get('outformat','lped') |
---|
216 | basename = conf.get('basename',title) |
---|
217 | logfile = os.path.join(outdir,'%s.log' % title) |
---|
218 | userId = conf.get('userId','') # for library |
---|
219 | pedFileBase = conf.get('inped','') |
---|
220 | relfilter = conf.get('relfilter','') |
---|
221 | MAP_FILE = '%s.map' % pedFileBase |
---|
222 | DATA_FILE = '%s.ped' % pedFileBase |
---|
223 | title = conf.get('title','lped subset') |
---|
224 | lf = file(logfile,'w') |
---|
225 | lf.write('config file %s = \n' % configf) |
---|
226 | lf.write(''.join(config)) |
---|
227 | c = '' |
---|
228 | spos = epos = 0 |
---|
229 | rslist = [] |
---|
230 | rsdict = {} |
---|
231 | if region > '': |
---|
232 | try: # TODO make a regexp? |
---|
233 | c,rest = region.split(':') |
---|
234 | c = c.replace('chr','') |
---|
235 | rest = rest.replace(',','') # remove commas |
---|
236 | spos,epos = rest.split('-') |
---|
237 | spos = int(spos) |
---|
238 | epos = int(epos) |
---|
239 | s = '## %s parsing chrom %s from %d to %d\n' % (progname,c,spos,epos) |
---|
240 | lf.write(s) |
---|
241 | except: |
---|
242 | s = '##! %s unable to parse region %s - MUST look like "chr8:10,000-100,000\n' % (progname,region) |
---|
243 | lf.write(s) |
---|
244 | lf.close() |
---|
245 | sys.exit(1) |
---|
246 | else: |
---|
247 | rslist = orslist.split() # galaxy replaces newlines with XX - go figure |
---|
248 | rsdict = dict(zip(rslist,rslist)) |
---|
249 | allmarkers = False |
---|
250 | if len(rslist) == 0 and epos == 0: # must be a full extract - presumably remove relateds or something |
---|
251 | allmarkers = True |
---|
252 | ### Figure out which markers are in this region |
---|
253 | markers,snpcols,rslist,rsdict = readMap(mapfile=MAP_FILE,allmarkers=allmarkers,rsdict=rsdict,c=c,spos=spos,epos=epos) |
---|
254 | if len(rslist) == 0: |
---|
255 | s = '##! %s found no rs numbers in %s\n' % (progname,sys.argv[1:3]) |
---|
256 | lf.write(s) |
---|
257 | lf.write('\n') |
---|
258 | lf.close() |
---|
259 | sys.exit(1) |
---|
260 | s = '## %s looking for %d rs (%s....etc)\n' % (progname,len(rslist),rslist[:5]) |
---|
261 | lf.write(s) |
---|
262 | try: |
---|
263 | dfile = open(DATA_FILE, 'r') |
---|
264 | except: # bad input file name? |
---|
265 | s = '##! rgPedSub unable to open file %s\n' % (DATA_FILE) |
---|
266 | lf.write(s) |
---|
267 | lf.write('\n') |
---|
268 | lf.close() |
---|
269 | print >> sys.stdout, s |
---|
270 | raise |
---|
271 | sys.exit(1) |
---|
272 | if relfilter <> 'all': # must read pedigree and figure out who to drop |
---|
273 | dropId = setupPedFilter(relfilter=relfilter,dfile=dfile) |
---|
274 | else: |
---|
275 | dropId = {} |
---|
276 | wewant = [(6+(2*snpcols[x])) for x in rslist] # |
---|
277 | # column indices of first geno of each marker pair to get the markers into genomic |
---|
278 | ### ... and then parse the rest of the ped file to pull out |
---|
279 | ### the genotypes for all subjects for those markers |
---|
280 | # /usr/local/galaxy/data/rg/1/lped/ |
---|
281 | if len(dropId.keys()) > 0: |
---|
282 | s = '## dropped the following subjects to satisfy requirement that relfilter = %s\n' % relfilter |
---|
283 | lf.write(s) |
---|
284 | if relfilter == 'oo': |
---|
285 | s = '## note that one random offspring from each family was kept if there were multiple offspring\n' |
---|
286 | lf.write(s) |
---|
287 | s = 'FamilyId\tSubjectId\n' |
---|
288 | lf.write(s) |
---|
289 | dk = dropId.keys() |
---|
290 | dk.sort() |
---|
291 | for k in dk: |
---|
292 | s = '%s\t%s\n' % (k[0],k[1]) |
---|
293 | lf.write(s) |
---|
294 | lf.write('\n') |
---|
295 | lf.close() |
---|
296 | if outformat == 'lped': |
---|
297 | nrows,pedfname=writePed(markers=markers,outdir=outdir,title=title,basename=basename,dfile=dfile, |
---|
298 | wewant=wewant,dropId=dropId,outfile=outfile,logfile=logfile) |
---|
299 | elif outformat == 'fped': |
---|
300 | nrows,pedfname=writeFped(rslist=rslist,outdir=outdir,title=title,basename=basename,dfile=dfile, |
---|
301 | wewant=wewant,dropId=dropId,outfile=outfile,logfile=logfile) |
---|
302 | dfile.close() |
---|
303 | |
---|
304 | if __name__ == "__main__": |
---|
305 | subset() |
---|