1 | #!/usr/bin/env python |
---|
2 | # This code exists in 2 places: ~/datatypes/converters and ~/tools/fasta_tools |
---|
3 | """ |
---|
4 | Input: fasta |
---|
5 | Output: tabular |
---|
6 | Write one line per sequence: the title line, a tab, then the sequence joined onto a single line. |
---|
7 | """ |
---|
8 | |
---|
9 | import sys, os |
---|
10 | |
---|
# Records parsed by the most recent __main__() run, keyed by
# (1-based record index, title line) and mapping to the joined sequence.
seq_hash = {}


def _iter_fasta_records(lines):
    """Yield (title, sequence) pairs parsed from an iterable of FASTA lines.

    The title is the header line exactly as read (leading '>' included,
    line ending removed).  The sequence is every following non-empty line
    joined together.  Lines whose first whitespace-separated token is all
    digits (e.g. quality scores) get a single space appended so that
    consecutive numeric lines do not run into each other.

    A header that is not followed by any sequence data yields nothing.
    Sequence lines that appear before the first header are yielded with an
    empty title.
    """
    title = ''
    parts = []
    for line in lines:
        line = line.rstrip('\r\n')
        if line.startswith('>'):
            # A new header ends the previous record, if it had any data.
            if parts:
                yield title, ''.join(parts)
            title = line
            parts = []
        elif line:
            parts.append(line)
            tokens = line.split()
            if tokens and tokens[0].isdigit():
                parts.append(' ')
    # Flush the record still being accumulated when the input ends.
    if parts:
        yield title, ''.join(parts)


def __main__():
    """Convert the FASTA file sys.argv[1] into the tabular file sys.argv[2].

    Each record is written as "<title>\\t<sequence>" on its own line, in
    the order the records appear in the input.  The parsed records are
    also left in the module-level ``seq_hash`` dictionary.
    """
    infile = sys.argv[1]
    outfile = sys.argv[2]

    # Start from a clean slate so repeated calls do not re-emit old records.
    seq_hash.clear()
    with open(infile) as fasta:
        # Every record, including the last one, gets its own index; this
        # keeps duplicate titles from overwriting each other and makes the
        # sorted key order identical to the input order.
        for index, (title, sequence) in enumerate(_iter_fasta_records(fasta), 1):
            seq_hash[(index, title)] = sequence

    with open(outfile, 'w') as out:
        for index, fasta_title in sorted(seq_hash):
            out.write("%s\t%s\n" % (fasta_title, seq_hash[(index, fasta_title)]))


if __name__ == "__main__":
    __main__()
---|