1 | """ |
---|
2 | released under the terms of the LGPL |
---|
3 | copyright ross lazarus August 2007 |
---|
4 | for the rgenetics project |
---|
5 | |
---|
6 | Special galaxy tool for the camp2007 data |
---|
7 | Allows grabbing arbitrary columns from an arbitrary region |
---|
8 | |
---|
9 | Needs a mongo results file in the location hardwired below or could be passed in as |
---|
10 | a library parameter - but this file must have a very specific structure |
---|
11 | rs chrom offset float1...floatn |
---|
12 | |
---|
13 | called as |
---|
14 | <command interpreter="python"> |
---|
15 | rsRegion.py $infile '$cols' $r $tag $out_file1 |
---|
16 | </command> |
---|
17 | |
---|
18 | cols is a delimited list of chosen column names for the subset |
---|
19 | r is a ucsc location region pasted into the tool |
---|
20 | |
---|
21 | """ |
---|
22 | |
---|
23 | |
---|
24 | import sys,string |
---|
25 |
|
---|
26 | trantab = string.maketrans(string.punctuation,'_'*len(string.punctuation)) |
---|
27 | print >> sys.stdout, '##rgRegion.py started' |
---|
28 | if len(sys.argv) <> 6: |
---|
29 | print >> sys.stdout, '##!expected params in sys.argv, got %d - %s' % (len(sys.argv),sys.argv) |
---|
30 | sys.exit(1) |
---|
31 | print '##got %d - %s' % (len(sys.argv),sys.argv) |
---|
32 | # quick and dirty for galaxy - we always get something for each parameter |
---|
33 | fname = sys.argv[1] |
---|
34 | wewant = sys.argv[2].split(',') |
---|
35 | region = sys.argv[3].lower() |
---|
36 | tag = sys.argv[4].translate(trantab) |
---|
37 | ofname = sys.argv[5] |
---|
38 | myname = 'rgRegion' |
---|
39 | if len(wewant) == 0: # no columns selected? |
---|
40 | print >> sys.stdout, '##!%s: no columns selected - cannot run' % myname |
---|
41 | sys.exit(1) |
---|
42 | try: |
---|
43 | f = open(fname,'r') |
---|
44 | except: # bad input file name? |
---|
45 | print >> sys.stdout, '##!%s unable to open file %s' % (myname, fname) |
---|
46 | sys.exit(1) |
---|
47 | try: # TODO make a regexp? |
---|
48 | c,rest = region.split(':') |
---|
49 | c = c.replace('chr','') # leave although will break strict genome graphs |
---|
50 | rest = rest.replace(',','') # remove commas |
---|
51 | spos,epos = rest.split('-') |
---|
52 | spos = int(spos) |
---|
53 | epos = int(epos) |
---|
54 | except: |
---|
55 | print >> sys.stdout, '##!%s unable to parse region %s - MUST look like "chr8:10,000-100,000' % (myname,region) |
---|
56 | sys.exit(1) |
---|
57 | print >> sys.stdout, '##%s parsing chrom %s from %d to %d' % (myname, c,spos,epos) |
---|
58 | res = [] |
---|
59 | cnames = f.next().strip().split() # column titles for output |
---|
60 | linelen = len(cnames) |
---|
61 | wewant = [int(x) - 1 for x in wewant] # need col numbers base 0 |
---|
62 | for n,l in enumerate(f): |
---|
63 | ll = l.strip().split() |
---|
64 | thisc = ll[1] |
---|
65 | thispos = int(ll[2]) |
---|
66 | if (thisc == c) and (thispos >= spos) and (thispos <= epos): |
---|
67 | if len(ll) == linelen: |
---|
68 | res.append([ll[x] for x in wewant]) # subset of columns! |
---|
69 | else: |
---|
70 | print >> sys.stdout, '##! looking for %d fields - found %d in ll=%s' % (linelen,len(ll),str(ll)) |
---|
71 | o = file(ofname,'w') |
---|
72 | res = ['%s\n' % '\t'.join(x) for x in res] # turn into tab delim string |
---|
73 | print >> sys.stdout, '##%s selected and returning %d data rows' % (myname,len(res)) |
---|
74 | head = [cnames[x] for x in wewant] # ah, list comprehensions - list of needed column names |
---|
75 | o.write('%s\n' % '\t'.join(head)) # header row for output |
---|
76 | o.write(''.join(res)) |
---|
77 | o.close() |
---|
78 | f.close() |
---|
79 | |
---|
80 | |
---|