| 1 | #!/usr/bin/python2.6 |
|---|
| 2 | |
|---|
| 3 | """ |
|---|
| 4 | For every column that occurs in a multiple alignment print the column |
|---|
| 5 | and the number of times it occurs (one column/count per line, tab |
|---|
| 6 | separated), sorted by count descending. |
|---|
| 7 | |
|---|
| 8 | Note: all blocks must have exactly the same number of species. |
|---|
| 9 | |
|---|
| 10 | usage: %prog < maf > column_counts |
|---|
| 11 | """ |
|---|
| 12 | |
|---|
| 13 | import bx.align.maf |
|---|
| 14 | import sys |
|---|
| 15 | |
|---|
| 16 | from itertools import * |
|---|
| 17 | |
|---|
| 18 | counts = {} |
|---|
| 19 | |
|---|
| 20 | nspecies = None |
|---|
| 21 | |
|---|
| 22 | for block in bx.align.maf.Reader( sys.stdin ): |
|---|
| 23 | # Ensure all blocks have the same number of rows |
|---|
| 24 | if nspecies: assert len( block.components ) == nspecies |
|---|
| 25 | else: nspecies = len( block.components ) |
|---|
| 26 | # Increment count for each column |
|---|
| 27 | for col in izip( * [ iter( comp.text.upper() ) for comp in block.components ] ): |
|---|
| 28 | try: counts[ col ] += 1 |
|---|
| 29 | except: counts[ col ] = 1 |
|---|
| 30 | |
|---|
| 31 | counts = [ ( value, key ) for key, value in counts.iteritems() ] |
|---|
| 32 | counts.sort() |
|---|
| 33 | counts.reverse() |
|---|
| 34 | |
|---|
| 35 | # print len( counts ) |
|---|
| 36 | |
|---|
| 37 | for count, col in counts: |
|---|
| 38 | print "".join(col), count |
|---|