1 | """ |
---|
2 | Classes to support FASTA files. |
---|
3 | |
---|
4 | :Author: Bob Harris (rsharris@bx.psu.edu) |
---|
5 | |
---|
6 | A FASTA file contains multiple sequences. Each sequence is usually DNA. |
---|
7 | |
---|
8 | A typical FASTA file:: |
---|
9 | |
---|
10 | >mule |
---|
11 | TAATACCCCGGATATATGTCCTCACATAGTTCGAGGTCGAGAAAAATGAC |
---|
12 | TTCCCACCAAGTGGACTCAGCTCGAGTAAACGCCAACGATACGTCCATTA |
---|
13 | GGTGTGTGCCgcaactagtcggacccgttgtgacggaaacaggtccccgc |
---|
14 | caagtcacacgggcatgtcatggacTCTCGATCGTTCATCGCCTTCTTGG |
---|
15 | GTACCGCAGCCGCAATTAAGCCGTGTCTTCTTCCCCCTTCAAACGGGAAT |
---|
16 | CGTGTCGACTTCTTAGGAGCAGNNNNNNNNNNCTAACTCCAGAG |
---|
17 | >donkey |
---|
18 | TAATACCCCGGATATATGTCTTAACATAGTTCCAGGTCGAGAAGAATGAC |
---|
19 | TTGCCACCAAGTGGACTCAGATTCAGTCAACGCGAACGATAAGTCCATTA |
---|
20 | GGTGTGTACCgcaactagtgggacccgttgtgacggaaacaggtcaccgc |
---|
21 | caagtcacacgtgcatgtcatgtacTCTCGATCGTTTATCGCCTTCTTGG |
---|
22 | GTACCGCAGCCGAAATTAAGCCGTGTCTTCTTCCCACTTCAAACGGGAAT |
---|
23 | CGTGTCGACTTTACAGGAACAGNNNNNNNNNNATAACGCCAGAG |
---|
24 | ... more sequences |
---|
25 | |
---|
26 | Typical use: |
---|
27 | |
---|
28 | for seq in bx.seq.fasta.FastaReader(sys.stdin): |
---|
29 | print seq.name |
---|
30 | print seq.get(0,seq.length) |
---|
31 | |
---|
32 | """ |
---|
33 | |
---|
34 | from bx.seq.seq import SeqFile,SeqReader |
---|
35 | import sys, string |
---|
36 | |
---|
37 | class FastaFile(SeqFile): |
---|
38 | |
---|
39 | def __init__(self, file, revcomp=False, name="", gap=None, lookahead=None, contig=None): |
---|
40 | SeqFile.__init__(self,file,revcomp,name,gap) |
---|
41 | self.lookahead = lookahead |
---|
42 | if (contig == None): contig = 1 |
---|
43 | assert (contig >= 1), "contig %d is not legal" % contig |
---|
44 | |
---|
45 | # nota bene: certainly not the most efficient or elegant implementation |
---|
46 | |
---|
47 | currContig = 1 |
---|
48 | while (True): |
---|
49 | if (self.lookahead != None): |
---|
50 | (line,self.lookahead) = (self.lookahead,None) |
---|
51 | else: |
---|
52 | line = self.file.readline() |
---|
53 | if (line == ""): break |
---|
54 | if not line: |
---|
55 | break |
---|
56 | if (line.startswith(">")): |
---|
57 | if (self.text != None): |
---|
58 | if (currContig == contig): |
---|
59 | self.lookahead = line # (next sequence header) |
---|
60 | break |
---|
61 | currContig += 1 |
---|
62 | self.name = self.extract_name(line[1:]) |
---|
63 | self.text = [] |
---|
64 | continue |
---|
65 | line = line.split() # (remove whitespace) |
---|
66 | if (self.text == None): self.text = line # (allows headerless fasta) |
---|
67 | else: self.text.extend(line) |
---|
68 | assert (currContig == contig), \ |
---|
69 | "contig %d is not legal (file contains only %d)" % (contig,currContig) |
---|
70 | if (self.text != None): |
---|
71 | self.text = "".join(self.text) |
---|
72 | self.length = len(self.text) |
---|
73 | |
---|
74 | class FastaReader(SeqReader): |
---|
75 | |
---|
76 | def __init__(self, file, revcomp=False, name="", gap=None): |
---|
77 | SeqReader.__init__(self,file,revcomp,name,gap) |
---|
78 | self.lookahead = None |
---|
79 | |
---|
80 | def next(self): |
---|
81 | seq = FastaFile(self.file,self.revcomp,self.name,self.gap,self.lookahead) |
---|
82 | if (seq.text == None): return |
---|
83 | self.lookahead = seq.lookahead |
---|
84 | self.seqs_read += 1 |
---|
85 | return seq |
---|
86 | |
---|
87 | |
---|
88 | class FastaWriter(object): |
---|
89 | |
---|
90 | def __init__(self,file,columns=50): |
---|
91 | self.file = file |
---|
92 | self.columns = columns |
---|
93 | |
---|
94 | def write(self,seq): |
---|
95 | print >>self.file,">%s" % seq.name |
---|
96 | text = seq.text |
---|
97 | if (self.columns != None) and (self.columns > 0): |
---|
98 | text = "\n".join([text[ix:ix+self.columns] \ |
---|
99 | for ix in range(0,len(text),self.columns)]) |
---|
100 | print >>self.file,text |
---|
101 | |
---|
102 | def close(self): |
---|
103 | assert (self.file != None) |
---|
104 | self.file.close() |
---|
105 | self.file = None |
---|