#!/usr/bin/env python
# This code exists in 2 places: ~/datatypes/converters and ~/tools/fasta_tools
"""
Input: fasta
Output: tabular
Write one line per FASTA record: the title line, a tab, then the concatenated sequence.
"""

import os
import sys

# Records from the most recent run, keyed by (1-based position, title line).
# Kept at module level so existing importers that read it keep working.
seq_hash = {}


def _iter_fasta_records(lines):
    """Yield (title, sequence) for each record in *lines* that has sequence data.

    *title* is the full header line, including the leading '>'.  Records
    whose header is followed by no sequence lines are skipped.  Sequence
    data appearing before the first header is yielded with an empty title.
    """
    title = ''
    chunks = []
    for line in lines:
        line = line.rstrip('\r\n')
        if line.startswith('>'):
            if chunks:
                yield title, ''.join(chunks)
            title = line
            chunks = []
        elif line:
            chunks.append(line)
            # A line whose first token is numeric (presumably space-separated
            # values such as quality scores) gets a trailing space so values
            # on consecutive lines do not fuse when the lines are joined.
            fields = line.split()
            if fields and fields[0].isdigit():
                chunks.append(' ')
    if chunks:
        yield title, ''.join(chunks)


def __main__():
    """Convert the FASTA file at sys.argv[1] to tab-separated text at sys.argv[2].

    Each output line is "<title>\\t<sequence>", in the order the records
    appear in the input.  The module-level ``seq_hash`` is reset and then
    filled with the records of this run.

    Raises:
        IndexError: if fewer than two command-line arguments are supplied.
        IOError/OSError: if the input cannot be read or the output written.
    """
    infile = sys.argv[1]
    outfile = sys.argv[2]

    # Drop anything left over from an earlier call in the same process.
    seq_hash.clear()

    with open(infile) as handle:
        # Every record, including the last one, gets its own position so that
        # records with identical titles never collide and input order is kept.
        for position, (title, sequence) in enumerate(_iter_fasta_records(handle), 1):
            seq_hash[(position, title)] = sequence

    with open(outfile, 'w') as out:
        # Sorting the (position, title) keys restores input order.
        for position, title in sorted(seq_hash):
            out.write("%s\t%s\n" % (title, seq_hash[(position, title)]))


if __name__ == "__main__":
    __main__()