| 1 | #!/usr/bin/python2.6 |
|---|
| 2 | |
|---|
| 3 | """ |
|---|
| 4 | For every column that occurs in a multiple alignment print the column |
|---|
| 5 | and the number of times it occurs (one column/count per line, tab |
|---|
| 6 | separated), sorted by count descending. |
|---|
| 7 | |
|---|
| 8 | This version allows special handling of the 'wildcard' symbol in alignments. |
|---|
| 9 | |
|---|
| 10 | Note: all blocks must have exactly the same number of species. |
|---|
| 11 | |
|---|
| 12 | usage: %prog [options] < maf > column_counts |
|---|
| 13 | -w, --wildcard: include wildcards |
|---|
| 14 | -m, --maxwildcards=N: only allow N missing species |
|---|
| 15 | """ |
|---|
| 16 | |
|---|
| 17 | import bx.align.maf |
|---|
| 18 | import sys |
|---|
| 19 | from bx.cookbook import doc_optparse, cross_lists |
|---|
| 20 | |
|---|
| 21 | from itertools import * |
|---|
| 22 | |
|---|
| 23 | counts = {} |
|---|
| 24 | |
|---|
| 25 | nspecies = None |
|---|
| 26 | |
|---|
| 27 | for block in bx.align.maf.Reader( sys.stdin ): |
|---|
| 28 | # Ensure all blocks have the same number of rows |
|---|
| 29 | if nspecies: assert len( block.components ) == nspecies |
|---|
| 30 | else: nspecies = len( block.components ) |
|---|
| 31 | # Increment count for each column |
|---|
| 32 | for col in izip( * [ iter( comp.text.upper() ) for comp in block.components ] ): |
|---|
| 33 | col = ''.join( col ) |
|---|
| 34 | try: counts[ col ] += 1 |
|---|
| 35 | except: counts[ col ] = 1 |
|---|
| 36 | |
|---|
| 37 | # counts = [ ( value, key ) for key, value in counts.iteritems() ] |
|---|
| 38 | # counts.sort() |
|---|
| 39 | # counts.reverse() |
|---|
| 40 | |
|---|
| 41 | ## for count, col in counts: |
|---|
| 42 | ## print "".join(col), count |
|---|
| 43 | |
|---|
| 44 | options, args = doc_optparse.parse( __doc__ ) |
|---|
| 45 | |
|---|
| 46 | wildcard = False |
|---|
| 47 | if options.wildcard: |
|---|
| 48 | wildcard = True |
|---|
| 49 | max_wildcard = nspecies - 1 |
|---|
| 50 | if options.maxwildcards: |
|---|
| 51 | wildcard = True |
|---|
| 52 | max_wildcard = int( options.maxwildcards ) |
|---|
| 53 | |
|---|
| 54 | nucs = "ACGT-" |
|---|
| 55 | if wildcard: |
|---|
| 56 | nucs += "*" |
|---|
| 57 | |
|---|
| 58 | for col in cross_lists( *( [ nucs ] * nspecies ) ): |
|---|
| 59 | col = ''.join( col ) |
|---|
| 60 | if wildcard and col.count( "*" ) > max_wildcard: |
|---|
| 61 | continue |
|---|
| 62 | if col.count( "-" ) == nspecies: |
|---|
| 63 | continue |
|---|
| 64 | print col, counts.get( col, 0 ) |
|---|