""" released under the terms of the LGPL copyright ross lazarus August 2007 for the rgenetics project Special galaxy tool for the camp2007 data Allows grabbing arbitrary columns from an arbitrary region Needs a mongo results file in the location hardwired below or could be passed in as a library parameter - but this file must have a very specific structure rs chrom offset float1...floatn called as rsRegion.py $infile '$cols' $r $tag $out_file1 cols is a delimited list of chosen column names for the subset r is a ucsc location region pasted into the tool """ import sys,string trantab = string.maketrans(string.punctuation,'_'*len(string.punctuation)) print >> sys.stdout, '##rgRegion.py started' if len(sys.argv) <> 6: print >> sys.stdout, '##!expected params in sys.argv, got %d - %s' % (len(sys.argv),sys.argv) sys.exit(1) print '##got %d - %s' % (len(sys.argv),sys.argv) # quick and dirty for galaxy - we always get something for each parameter fname = sys.argv[1] wewant = sys.argv[2].split(',') region = sys.argv[3].lower() tag = sys.argv[4].translate(trantab) ofname = sys.argv[5] myname = 'rgRegion' if len(wewant) == 0: # no columns selected? print >> sys.stdout, '##!%s: no columns selected - cannot run' % myname sys.exit(1) try: f = open(fname,'r') except: # bad input file name? print >> sys.stdout, '##!%s unable to open file %s' % (myname, fname) sys.exit(1) try: # TODO make a regexp? c,rest = region.split(':') c = c.replace('chr','') # leave although will break strict genome graphs rest = rest.replace(',','') # remove commas spos,epos = rest.split('-') spos = int(spos) epos = int(epos) except: print >> sys.stdout, '##!%s unable to parse region %s - MUST look like "chr8:10,000-100,000' % (myname,region) sys.exit(1) print >> sys.stdout, '##%s parsing chrom %s from %d to %d' % (myname, c,spos,epos) res = [] cnames = f.next().strip().split() # column titles for output linelen = len(cnames) wewant = [int(x) - 1 for x in wewant] # need col numbers base 0 for n,l in enumerate(f): ll = l.strip().split() thisc = ll[1] thispos = int(ll[2]) if (thisc == c) and (thispos >= spos) and (thispos <= epos): if len(ll) == linelen: res.append([ll[x] for x in wewant]) # subset of columns! else: print >> sys.stdout, '##! looking for %d fields - found %d in ll=%s' % (linelen,len(ll),str(ll)) o = file(ofname,'w') res = ['%s\n' % '\t'.join(x) for x in res] # turn into tab delim string print >> sys.stdout, '##%s selected and returning %d data rows' % (myname,len(res)) head = [cnames[x] for x in wewant] # ah, list comprehensions - list of needed column names o.write('%s\n' % '\t'.join(head)) # header row for output o.write(''.join(res)) o.close() f.close()