1 | #!/usr/bin/python2.6 |
---|
2 | |
---|
3 | """ |
---|
4 | For every column that occurs in a multiple alignment print the column |
---|
5 | and the number of times it occurs (one column/count per line, tab |
---|
6 | separated), sorted by count descending. |
---|
7 | |
---|
8 | This version allows special handling of the 'wildcard' symbol in alignments. |
---|
9 | |
---|
10 | Note: all blocks must have exactly the same number of species. |
---|
11 | |
---|
12 | usage: %prog [options] < maf > column_counts |
---|
13 | -w, --wildcard: include wildcards |
---|
14 | -m, --maxwildcards=N: only allow N missing species |
---|
15 | """ |
---|
16 | |
---|
17 | import bx.align.maf |
---|
18 | import sys |
---|
19 | from bx.cookbook import doc_optparse, cross_lists |
---|
20 | |
---|
21 | from itertools import * |
---|
22 | |
---|
23 | counts = {} |
---|
24 | |
---|
25 | nspecies = None |
---|
26 | |
---|
27 | for block in bx.align.maf.Reader( sys.stdin ): |
---|
28 | # Ensure all blocks have the same number of rows |
---|
29 | if nspecies: assert len( block.components ) == nspecies |
---|
30 | else: nspecies = len( block.components ) |
---|
31 | # Increment count for each column |
---|
32 | for col in izip( * [ iter( comp.text.upper() ) for comp in block.components ] ): |
---|
33 | col = ''.join( col ) |
---|
34 | try: counts[ col ] += 1 |
---|
35 | except: counts[ col ] = 1 |
---|
36 | |
---|
37 | # counts = [ ( value, key ) for key, value in counts.iteritems() ] |
---|
38 | # counts.sort() |
---|
39 | # counts.reverse() |
---|
40 | |
---|
41 | ## for count, col in counts: |
---|
42 | ## print "".join(col), count |
---|
43 | |
---|
44 | options, args = doc_optparse.parse( __doc__ ) |
---|
45 | |
---|
46 | wildcard = False |
---|
47 | if options.wildcard: |
---|
48 | wildcard = True |
---|
49 | max_wildcard = nspecies - 1 |
---|
50 | if options.maxwildcards: |
---|
51 | wildcard = True |
---|
52 | max_wildcard = int( options.maxwildcards ) |
---|
53 | |
---|
54 | nucs = "ACGT-" |
---|
55 | if wildcard: |
---|
56 | nucs += "*" |
---|
57 | |
---|
58 | for col in cross_lists( *( [ nucs ] * nspecies ) ): |
---|
59 | col = ''.join( col ) |
---|
60 | if wildcard and col.count( "*" ) > max_wildcard: |
---|
61 | continue |
---|
62 | if col.count( "-" ) == nspecies: |
---|
63 | continue |
---|
64 | print col, counts.get( col, 0 ) |
---|