1 | #!/usr/bin/python2.6 |
---|
2 | |
---|
3 | """ |
---|
4 | |
---|
5 | Reads a maf file from stdin and applies the mapping file specified by |
---|
6 | `mapping_file` to produce a sequence of integers. Then for each possible word |
---|
7 | of length `motif_len` in this integer alphabet print the number of times |
---|
8 | that word occurs in the block. |
---|
9 | |
---|
10 | usage: %prog motif_len mapping_file < maf_file > counts |
---|
11 | """ |
---|
12 | |
---|
13 | from __future__ import division |
---|
14 | |
---|
15 | import psyco_full |
---|
16 | |
---|
17 | import bx.align.maf |
---|
18 | from bx import seqmapping |
---|
19 | import string |
---|
20 | import sys |
---|
21 | from Numeric import * |
---|
22 | |
---|
23 | def main(): |
---|
24 | |
---|
25 | word_length = int( sys.argv[1] ) |
---|
26 | align_count, alpha_map = seqmapping.alignment_mapping_from_file( file( sys.argv[2] ) ) |
---|
27 | |
---|
28 | for maf in bx.align.maf.Reader( sys.stdin ): |
---|
29 | assert len( maf.components ) == align_count |
---|
30 | # Translate alignment to ints |
---|
31 | ints = seqmapping.DNA.translate_list( [ c.text for c in maf.components ] ) |
---|
32 | # Apply mapping |
---|
33 | ints = alpha_map.translate( ints ) |
---|
34 | # Count words |
---|
35 | radix = alpha_map.get_out_size() |
---|
36 | counts = zeros( radix ** word_length, Int ) |
---|
37 | total = 0 |
---|
38 | for i in range( word_length, len( ints ) ): |
---|
39 | index = 0 |
---|
40 | factor = 1 |
---|
41 | skip = False |
---|
42 | for j in range( word_length ): |
---|
43 | assert 0 < i-j < len( ints ) |
---|
44 | letter = ints[i-j] |
---|
45 | if letter < 0: |
---|
46 | skip = True |
---|
47 | break |
---|
48 | index += letter * factor |
---|
49 | factor *= radix |
---|
50 | if skip: |
---|
51 | continue |
---|
52 | else: |
---|
53 | counts[ index ] += 1 |
---|
54 | total += 1 |
---|
55 | # Write ints separated by tabs |
---|
56 | print '\t'.join( [ str( total ) ] + map( str, counts ) ) |
---|
57 | |
---|
58 | if __name__ == "__main__": main() |
---|