| 1 | #!/usr/bin/python2.6 | 
|---|
| 2 |  | 
|---|
| 3 | """ | 
|---|
| 4 | Read a MAF from stdin and break into several new mafs containing no more than | 
|---|
| 5 | `chunk_size` columns. The new mafs will be written to `out_dir` along with a | 
|---|
| 6 | file "intervals.txt" specifying the range covered by each new maf file. A | 
|---|
| 7 | probability for writing each chunk can optionally be specified, resulting in | 
|---|
| 8 | a random fraction of chunks from the input MAF being produced. | 
|---|
| 9 |  | 
|---|
| 10 | usage: %prog [options] chunk_size out_dir < maf | 
|---|
| 11 |   --prob: probability of writing versus skipping each chunk. | 
|---|
| 12 | """ | 
|---|
| 13 |  | 
|---|
| 14 | usage = "usage: %prog chunk_size out_dir" | 
|---|
| 15 |  | 
|---|
| 16 | import sys | 
|---|
| 17 | from optparse import OptionParser | 
|---|
| 18 | import bx.align.maf | 
|---|
| 19 | import psyco_full | 
|---|
| 20 | import random | 
|---|
| 21 |  | 
|---|
| 22 | INF="inf" | 
|---|
| 23 |  | 
|---|
| 24 | def __main__(): | 
|---|
| 25 |  | 
|---|
| 26 |     # Parse command line arguments | 
|---|
| 27 |  | 
|---|
| 28 |     parser = OptionParser( "usage: %prog chunk_size out_dir" ) | 
|---|
| 29 |     parser.add_option( "--prob", action="store", default=None, type="float",  | 
|---|
| 30 |                        help="Probability of writing a given chunk" ) | 
|---|
| 31 |      | 
|---|
| 32 |     ( options, args ) = parser.parse_args() | 
|---|
| 33 |  | 
|---|
| 34 |     chunk_size = int( args[0] ) | 
|---|
| 35 |     out_dir = args[1] | 
|---|
| 36 |     prob = options.prob | 
|---|
| 37 |  | 
|---|
| 38 |     maf_reader = bx.align.maf.Reader( sys.stdin ) | 
|---|
| 39 |  | 
|---|
| 40 |     maf_writer = None | 
|---|
| 41 |  | 
|---|
| 42 |     count = 0 | 
|---|
| 43 |     current_chunk = -1 | 
|---|
| 44 |  | 
|---|
| 45 |     chunk_min = INF | 
|---|
| 46 |     chunk_max = 0 | 
|---|
| 47 |  | 
|---|
| 48 |     write_current_chunk = True | 
|---|
| 49 |  | 
|---|
| 50 |     interval_file = file( "%s/intervals.txt" % out_dir, "w" )    | 
|---|
| 51 |  | 
|---|
| 52 |     for m in maf_reader: | 
|---|
| 53 |         chunk_min = min( chunk_min, m.components[0].start ) | 
|---|
| 54 |         chunk_max = max( chunk_max, m.components[0].end ) | 
|---|
| 55 |         if not maf_writer or count + m.text_size > chunk_size: | 
|---|
| 56 |             current_chunk += 1 | 
|---|
| 57 |             # Finish the last chunk             | 
|---|
| 58 |             if maf_writer:  | 
|---|
| 59 |                 maf_writer.close() | 
|---|
| 60 |                 interval_file.write( "%s %s\n" % ( chunk_min, chunk_max ) ) | 
|---|
| 61 |                 chunk_min = INF | 
|---|
| 62 |                 chunk_max = 0 | 
|---|
| 63 |             # Decide if the new chunk will be written      | 
|---|
| 64 |             if prob: write_current_chunk = bool( random.random() <= prob ) | 
|---|
| 65 |             else: write_current_chunk = True | 
|---|
| 66 |             if write_current_chunk: | 
|---|
| 67 |                 maf_writer = bx.align.maf.Writer( file( "%s/%09d.maf" % ( out_dir, current_chunk ), "w" ) ) | 
|---|
| 68 |             else: | 
|---|
| 69 |                 maf_writer = None | 
|---|
| 70 |             count = 0 | 
|---|
| 71 |         if maf_writer: maf_writer.write( m ) | 
|---|
| 72 |         #count += m.text_size | 
|---|
| 73 |         count += m.components[0].size | 
|---|
| 74 |      | 
|---|
| 75 |     if maf_writer: | 
|---|
| 76 |         maf_writer.close() | 
|---|
| 77 |         interval_file.write( "%s %s\n" % ( chunk_min, chunk_max ) ) | 
|---|
| 78 |         interval_file.close() | 
|---|
| 79 |  | 
|---|
| 80 | if __name__ == "__main__": __main__() | 
|---|