| 1 | """ |
|---|
| 2 | Classes to support FASTA files. |
|---|
| 3 | |
|---|
| 4 | :Author: Bob Harris (rsharris@bx.psu.edu) |
|---|
| 5 | |
|---|
| 6 | A FASTA file contains multiple sequences. Each sequence is usually DNA. |
|---|
| 7 | |
|---|
| 8 | A typical FASTA file:: |
|---|
| 9 | |
|---|
| 10 | >mule |
|---|
| 11 | TAATACCCCGGATATATGTCCTCACATAGTTCGAGGTCGAGAAAAATGAC |
|---|
| 12 | TTCCCACCAAGTGGACTCAGCTCGAGTAAACGCCAACGATACGTCCATTA |
|---|
| 13 | GGTGTGTGCCgcaactagtcggacccgttgtgacggaaacaggtccccgc |
|---|
| 14 | caagtcacacgggcatgtcatggacTCTCGATCGTTCATCGCCTTCTTGG |
|---|
| 15 | GTACCGCAGCCGCAATTAAGCCGTGTCTTCTTCCCCCTTCAAACGGGAAT |
|---|
| 16 | CGTGTCGACTTCTTAGGAGCAGNNNNNNNNNNCTAACTCCAGAG |
|---|
| 17 | >donkey |
|---|
| 18 | TAATACCCCGGATATATGTCTTAACATAGTTCCAGGTCGAGAAGAATGAC |
|---|
| 19 | TTGCCACCAAGTGGACTCAGATTCAGTCAACGCGAACGATAAGTCCATTA |
|---|
| 20 | GGTGTGTACCgcaactagtgggacccgttgtgacggaaacaggtcaccgc |
|---|
| 21 | caagtcacacgtgcatgtcatgtacTCTCGATCGTTTATCGCCTTCTTGG |
|---|
| 22 | GTACCGCAGCCGAAATTAAGCCGTGTCTTCTTCCCACTTCAAACGGGAAT |
|---|
| 23 | CGTGTCGACTTTACAGGAACAGNNNNNNNNNNATAACGCCAGAG |
|---|
| 24 | ... more sequences |
|---|
| 25 | |
|---|
| 26 | Typical use: |
|---|
| 27 | |
|---|
| 28 | for seq in bx.seq.fasta.FastaReader(sys.stdin): |
|---|
| 29 | print seq.name |
|---|
| 30 | print seq.get(0,seq.length) |
|---|
| 31 | |
|---|
| 32 | """ |
|---|
| 33 | |
|---|
| 34 | from bx.seq.seq import SeqFile,SeqReader |
|---|
| 35 | import sys, string |
|---|
| 36 | |
|---|
| 37 | class FastaFile(SeqFile): |
|---|
| 38 | |
|---|
| 39 | def __init__(self, file, revcomp=False, name="", gap=None, lookahead=None, contig=None): |
|---|
| 40 | SeqFile.__init__(self,file,revcomp,name,gap) |
|---|
| 41 | self.lookahead = lookahead |
|---|
| 42 | if (contig == None): contig = 1 |
|---|
| 43 | assert (contig >= 1), "contig %d is not legal" % contig |
|---|
| 44 | |
|---|
| 45 | # nota bene: certainly not the most efficient or elegant implementation |
|---|
| 46 | |
|---|
| 47 | currContig = 1 |
|---|
| 48 | while (True): |
|---|
| 49 | if (self.lookahead != None): |
|---|
| 50 | (line,self.lookahead) = (self.lookahead,None) |
|---|
| 51 | else: |
|---|
| 52 | line = self.file.readline() |
|---|
| 53 | if (line == ""): break |
|---|
| 54 | if not line: |
|---|
| 55 | break |
|---|
| 56 | if (line.startswith(">")): |
|---|
| 57 | if (self.text != None): |
|---|
| 58 | if (currContig == contig): |
|---|
| 59 | self.lookahead = line # (next sequence header) |
|---|
| 60 | break |
|---|
| 61 | currContig += 1 |
|---|
| 62 | self.name = self.extract_name(line[1:]) |
|---|
| 63 | self.text = [] |
|---|
| 64 | continue |
|---|
| 65 | line = line.split() # (remove whitespace) |
|---|
| 66 | if (self.text == None): self.text = line # (allows headerless fasta) |
|---|
| 67 | else: self.text.extend(line) |
|---|
| 68 | assert (currContig == contig), \ |
|---|
| 69 | "contig %d is not legal (file contains only %d)" % (contig,currContig) |
|---|
| 70 | if (self.text != None): |
|---|
| 71 | self.text = "".join(self.text) |
|---|
| 72 | self.length = len(self.text) |
|---|
| 73 | |
|---|
| 74 | class FastaReader(SeqReader): |
|---|
| 75 | |
|---|
| 76 | def __init__(self, file, revcomp=False, name="", gap=None): |
|---|
| 77 | SeqReader.__init__(self,file,revcomp,name,gap) |
|---|
| 78 | self.lookahead = None |
|---|
| 79 | |
|---|
| 80 | def next(self): |
|---|
| 81 | seq = FastaFile(self.file,self.revcomp,self.name,self.gap,self.lookahead) |
|---|
| 82 | if (seq.text == None): return |
|---|
| 83 | self.lookahead = seq.lookahead |
|---|
| 84 | self.seqs_read += 1 |
|---|
| 85 | return seq |
|---|
| 86 | |
|---|
| 87 | |
|---|
| 88 | class FastaWriter(object): |
|---|
| 89 | |
|---|
| 90 | def __init__(self,file,columns=50): |
|---|
| 91 | self.file = file |
|---|
| 92 | self.columns = columns |
|---|
| 93 | |
|---|
| 94 | def write(self,seq): |
|---|
| 95 | print >>self.file,">%s" % seq.name |
|---|
| 96 | text = seq.text |
|---|
| 97 | if (self.columns != None) and (self.columns > 0): |
|---|
| 98 | text = "\n".join([text[ix:ix+self.columns] \ |
|---|
| 99 | for ix in range(0,len(text),self.columns)]) |
|---|
| 100 | print >>self.file,text |
|---|
| 101 | |
|---|
| 102 | def close(self): |
|---|
| 103 | assert (self.file != None) |
|---|
| 104 | self.file.close() |
|---|
| 105 | self.file = None |
|---|