| 1 | """ | 
|---|
| 2 | released under the terms of the LGPL | 
|---|
| 3 | copyright ross lazarus August 2007 | 
|---|
| 4 | for the rgenetics project | 
|---|
| 5 |  | 
|---|
| 6 | Special galaxy tool for the camp2007 data | 
|---|
| 7 | Allows grabbing arbitrary columns from an arbitrary region | 
|---|
| 8 |  | 
|---|
| 9 | Needs a mongo results file in the location hardwired below or could be passed in as | 
|---|
| 10 | a library parameter - but this file must have a very specific structure | 
|---|
| 11 | rs chrom offset float1...floatn | 
|---|
| 12 |  | 
|---|
| 13 | called as | 
|---|
| 14 | <command interpreter="python"> | 
|---|
| 15 | rsRegion.py $infile '$cols' $r $tag $out_file1 | 
|---|
| 16 | </command> | 
|---|
| 17 |  | 
|---|
| 18 | cols is a delimited list of chosen column names for the subset | 
|---|
| 19 | r is a ucsc location region pasted into the tool | 
|---|
| 20 |  | 
|---|
| 21 | """ | 
|---|
| 22 |  | 
|---|
| 23 |  | 
|---|
| 24 | import sys,string | 
|---|
| 25 |  | 
|---|
| 26 | trantab = string.maketrans(string.punctuation,'_'*len(string.punctuation)) | 
|---|
| 27 | print >> sys.stdout, '##rgRegion.py started' | 
|---|
| 28 | if len(sys.argv) <> 6: | 
|---|
| 29 | print >> sys.stdout, '##!expected  params in sys.argv, got %d - %s' % (len(sys.argv),sys.argv) | 
|---|
| 30 | sys.exit(1) | 
|---|
| 31 | print '##got %d - %s' % (len(sys.argv),sys.argv) | 
|---|
| 32 | # quick and dirty for galaxy - we always get something for each parameter | 
|---|
| 33 | fname = sys.argv[1] | 
|---|
| 34 | wewant = sys.argv[2].split(',') | 
|---|
| 35 | region = sys.argv[3].lower() | 
|---|
| 36 | tag = sys.argv[4].translate(trantab) | 
|---|
| 37 | ofname = sys.argv[5] | 
|---|
| 38 | myname = 'rgRegion' | 
|---|
| 39 | if len(wewant) == 0: # no columns selected? | 
|---|
| 40 | print >> sys.stdout, '##!%s:  no columns selected - cannot run' % myname | 
|---|
| 41 | sys.exit(1) | 
|---|
| 42 | try: | 
|---|
| 43 | f = open(fname,'r') | 
|---|
| 44 | except: # bad input file name? | 
|---|
| 45 | print >> sys.stdout, '##!%s unable to open file %s' % (myname, fname) | 
|---|
| 46 | sys.exit(1) | 
|---|
| 47 | try: # TODO make a regexp? | 
|---|
| 48 | c,rest = region.split(':') | 
|---|
| 49 | c = c.replace('chr','') # leave although will break strict genome graphs | 
|---|
| 50 | rest = rest.replace(',','') # remove commas | 
|---|
| 51 | spos,epos = rest.split('-') | 
|---|
| 52 | spos = int(spos) | 
|---|
| 53 | epos = int(epos) | 
|---|
| 54 | except: | 
|---|
| 55 | print >> sys.stdout, '##!%s unable to parse region %s - MUST look like "chr8:10,000-100,000' % (myname,region) | 
|---|
| 56 | sys.exit(1) | 
|---|
| 57 | print >> sys.stdout, '##%s parsing chrom %s from %d to %d' % (myname, c,spos,epos) | 
|---|
| 58 | res = [] | 
|---|
| 59 | cnames = f.next().strip().split() # column titles for output | 
|---|
| 60 | linelen = len(cnames) | 
|---|
| 61 | wewant = [int(x) - 1 for x in wewant] # need col numbers base 0 | 
|---|
| 62 | for n,l in enumerate(f): | 
|---|
| 63 | ll = l.strip().split() | 
|---|
| 64 | thisc = ll[1] | 
|---|
| 65 | thispos = int(ll[2]) | 
|---|
| 66 | if (thisc == c) and (thispos >= spos) and (thispos <= epos): | 
|---|
| 67 | if len(ll) == linelen: | 
|---|
| 68 | res.append([ll[x] for x in wewant]) # subset of columns! | 
|---|
| 69 | else: | 
|---|
| 70 | print >> sys.stdout, '##! looking for %d fields - found %d in ll=%s' % (linelen,len(ll),str(ll)) | 
|---|
| 71 | o = file(ofname,'w') | 
|---|
| 72 | res = ['%s\n' % '\t'.join(x) for x in res] # turn into tab delim string | 
|---|
| 73 | print >> sys.stdout, '##%s selected and returning %d data rows' % (myname,len(res)) | 
|---|
| 74 | head = [cnames[x] for x in wewant] # ah, list comprehensions - list of needed column names | 
|---|
| 75 | o.write('%s\n' % '\t'.join(head)) # header row for output | 
|---|
| 76 | o.write(''.join(res)) | 
|---|
| 77 | o.close() | 
|---|
| 78 | f.close() | 
|---|
| 79 |  | 
|---|
| 80 |  | 
|---|