1 | #!/usr/bin/env python2.4 |
---|
2 | """ |
---|
3 | Returns all positions of a maf with any pwm score > threshold |
---|
4 | The positions are projected onto human coordinates |
---|
5 | """ |
---|
6 | |
---|
7 | import psyco_full |
---|
8 | from bx.align import maf as align_maf |
---|
9 | import position_weight_matrix as pwmx |
---|
10 | from bx.pwm.pwm_score_maf import MafMotifScorer |
---|
11 | import sys |
---|
12 | from bx import intervals |
---|
13 | import Numeric |
---|
14 | |
---|
15 | def isnan(x): |
---|
16 | return not x==x |
---|
17 | |
---|
18 | def main(): |
---|
19 | |
---|
20 | if len(sys.argv) < 5: |
---|
21 | print >>sys.stderr, "%s bedfile inmaf spec1,spec2,... string [string2,...]" % sys.argv[0] |
---|
22 | sys.exit(0) |
---|
23 | |
---|
24 | # read in intervals |
---|
25 | regions = {} |
---|
26 | for line in open( sys.argv[1] ): |
---|
27 | if line.startswith('#'): continue |
---|
28 | fields = line.strip().split() |
---|
29 | chrom, start, end = fields[0], int( fields[1] ), int( fields[2] ) |
---|
30 | try: |
---|
31 | name = fields[3] |
---|
32 | except: |
---|
33 | name = None |
---|
34 | if chrom not in regions: regions[chrom] = intervals.Intersecter() |
---|
35 | regions[chrom].add( start, end, name ) |
---|
36 | |
---|
37 | motif_strings = sys.argv[4:] |
---|
38 | if not isinstance(motif_strings, list): motif_strings = [motif_strings] |
---|
39 | inmaf = open(sys.argv[2]) |
---|
40 | threshold = 0.5 |
---|
41 | |
---|
42 | species = [] |
---|
43 | |
---|
44 | for sp in sys.argv[3].split(','): |
---|
45 | species.append( sp ) |
---|
46 | |
---|
47 | for maf in align_maf.Reader(inmaf): |
---|
48 | mafchrom = maf.components[0].src.split('.')[1] |
---|
49 | mafstart = maf.components[0].start |
---|
50 | mafend = maf.components[0].end |
---|
51 | reftext = maf.components[0].text |
---|
52 | r = regions[mafchrom].find( mafstart, mafend ) |
---|
53 | if mafchrom not in regions or len( r ) == 0: continue |
---|
54 | |
---|
55 | # maf block scores for each matrix |
---|
56 | for scoremax,width,headers in MafMotifScorer(species, maf, motif_strings): |
---|
57 | #print >>sys.stderr,headers |
---|
58 | blocklength = width |
---|
59 | mafsrc,mafstart,mafend = headers[0] |
---|
60 | mafchrom = mafsrc.split('.')[1] |
---|
61 | |
---|
62 | # lists of scores for each position in scoremax |
---|
63 | for mx_name,mx in scoremax.items(): |
---|
64 | #print >>sys.stderr, mx_name, len(pwm[mx_name]) |
---|
65 | |
---|
66 | for offset in range(blocklength): |
---|
67 | |
---|
68 | # scan all species with threshold |
---|
69 | for i in range(len(species)): |
---|
70 | if mx[i][offset] > threshold: |
---|
71 | refstart = mafstart + offset - reftext.count('-',0,offset) |
---|
72 | refend = refstart + len(mx_name) |
---|
73 | |
---|
74 | data = " ".join([ "%.2f" % mx[x][offset] for x in range(len(species))]) |
---|
75 | # quote the motif |
---|
76 | r = regions[mafchrom].find( refstart, refend ) |
---|
77 | if mafchrom in regions and len( r ) > 0: |
---|
78 | region_label = r[0].value |
---|
79 | else: |
---|
80 | #region_label = 0 |
---|
81 | continue |
---|
82 | v_name = mx_name.replace(' ','_') |
---|
83 | print mafchrom,refstart,refend,region_label,v_name,data |
---|
84 | break |
---|
85 | |
---|
86 | if __name__ == '__main__': main() |
---|