root/galaxy-central/eggs/docutils-0.4-py2.6.egg/docutils/parsers/rst/tableparser.py @ 3

リビジョン 3, 19.9 KB (コミッタ: kohda, 14 年 前)

Install Unix tools  http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号 
1# Author: David Goodger
2# Contact: goodger@users.sourceforge.net
3# Revision: $Revision: 4152 $
4# Date: $Date: 2005-12-08 00:46:30 +0100 (Thu, 08 Dec 2005) $
5# Copyright: This module has been placed in the public domain.
6
7"""
8This module defines table parser classes, which parse plaintext-graphic tables
9and produce a well-formed data structure suitable for building a CALS table.
10
11:Classes:
12    - `GridTableParser`: Parse fully-formed tables represented with a grid.
13    - `SimpleTableParser`: Parse simple tables, delimited by top & bottom
14      borders.
15
16:Exception class: `TableMarkupError`
17
18:Function:
19    `update_dict_of_lists()`: Merge two dictionaries containing list values.
20"""
21
22__docformat__ = 'reStructuredText'
23
24
25import re
26import sys
27from docutils import DataError
28
29
class TableMarkupError(DataError):

    """Raised by the table parsers when table markup cannot be parsed."""
31
32
class TableParser:

    """
    Abstract superclass for the common parts of the syntax-specific parsers.

    `parse()` drives the work by calling `setup()`, `find_head_body_sep()`,
    `parse_table()` and `structure_from_cells()` in that order.  Only
    `find_head_body_sep()` is implemented here; the other three methods and
    `head_body_separator_pat` must be provided by subclasses.
    """

    head_body_separator_pat = None
    """Matches the row separator between head rows and body rows."""

    double_width_pad_char = '\x00'
    """Padding character for East Asian double-width text."""

    def parse(self, block):
        """
        Analyze the text `block` and return a table data structure.

        Given a plaintext-graphic table in `block` (list of lines of text; no
        whitespace padding), parse the table, construct and return the data
        necessary to construct a CALS table or equivalent.

        Raise `TableMarkupError` if there is any problem with the markup.
        """
        self.setup(block)
        self.find_head_body_sep()
        self.parse_table()
        return self.structure_from_cells()

    def find_head_body_sep(self):
        """
        Look for a head/body row separator line; store the line index.

        The index of the matching line is stored in `self.head_body_sep`,
        and the '=' characters of that line in `self.block` are replaced
        with '-'.

        Raise `TableMarkupError` if more than one separator line is found,
        or if the separator is the first or last line of the table.
        """
        for i in range(len(self.block)):
            line = self.block[i]
            if not self.head_body_separator_pat.match(line):
                continue
            # Compare against None rather than testing truthiness: a
            # separator found at line offset 0 must still count as "seen",
            # otherwise a later separator would silently replace it and the
            # first-line error below would never be reported.
            if self.head_body_sep is not None:
                raise TableMarkupError(
                    'Multiple head/body row separators in table (at line '
                    'offset %s and %s); only one allowed.'
                    % (self.head_body_sep, i))
            self.head_body_sep = i
            self.block[i] = line.replace('=', '-')
        last_line = len(self.block) - 1
        if self.head_body_sep == 0 or self.head_body_sep == last_line:
            raise TableMarkupError('The head/body row separator may not be '
                                   'the first or last line of the table.')
78
79
class GridTableParser(TableParser):

    """
    Parse a grid table using `parse()`.

    Here's an example of a grid table::

        +------------------------+------------+----------+----------+
        | Header row, column 1   | Header 2   | Header 3 | Header 4 |
        +========================+============+==========+==========+
        | body row 1, column 1   | column 2   | column 3 | column 4 |
        +------------------------+------------+----------+----------+
        | body row 2             | Cells may span columns.          |
        +------------------------+------------+---------------------+
        | body row 3             | Cells may  | - Table cells       |
        +------------------------+ span rows. | - contain           |
        | body row 4             |            | - body elements.    |
        +------------------------+------------+---------------------+

    Intersections use '+', row separators use '-' (except for one optional
    head/body row separator, which uses '='), and column separators use '|'.

    Passing the above table to the `parse()` method will result in the
    following data structure::

        ([24, 12, 10, 10],
         [[(0, 0, 1, ['Header row, column 1']),
           (0, 0, 1, ['Header 2']),
           (0, 0, 1, ['Header 3']),
           (0, 0, 1, ['Header 4'])]],
         [[(0, 0, 3, ['body row 1, column 1']),
           (0, 0, 3, ['column 2']),
           (0, 0, 3, ['column 3']),
           (0, 0, 3, ['column 4'])],
          [(0, 0, 5, ['body row 2']),
           (0, 2, 5, ['Cells may span columns.']),
           None,
           None],
          [(0, 0, 7, ['body row 3']),
           (1, 0, 7, ['Cells may', 'span rows.', '']),
           (1, 1, 7, ['- Table cells', '- contain', '- body elements.']),
           None],
          [(0, 0, 9, ['body row 4']), None, None, None]])

    The first item is a list containing column widths (colspecs). The second
    item is a list of head rows, and the third is a list of body rows. Each
    row contains a list of cells. Each cell is either None (for a cell unused
    because of another cell's span), or a tuple. A cell tuple contains four
    items: the number of extra rows used by the cell in a vertical span
    (morerows); the number of extra columns used by the cell in a horizontal
    span (morecols); the line offset of the first line of the cell contents;
    and the cell contents, a list of lines of text.
    """

    head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$')

    def setup(self, block):
        """
        Initialize the parser state for the table text in `block`.

        NOTE(review): `block` must support slicing into an object that
        provides `disconnect()` and `get_2D_block()` -- presumably a docutils
        `StringList`; confirm against the caller.
        """
        self.block = block[:]           # make a copy; it may be modified
        self.block.disconnect()         # don't propagate changes to parent
        self.bottom = len(block) - 1    # index of the last line
        self.right = len(block[0]) - 1  # index of the last text column
        self.head_body_sep = None
        # Per text column, the last line already covered by a scanned cell.
        self.done = [-1] * len(block[0])
        self.cells = []
        self.rowseps = {0: [0]}
        self.colseps = {0: [0]}

    def parse_table(self):
        """
        Start with a queue of upper-left corners, containing the upper-left
        corner of the table itself. Trace out one rectangular cell, remember
        it, and add its upper-right and lower-left corners to the queue of
        potential upper-left corners of further cells. Process the queue in
        top-to-bottom order, keeping track of how much of each text column has
        been seen.

        We'll end up knowing all the row and column boundaries, cell positions
        and their dimensions.

        Raise `TableMarkupError` if some text column was not completely
        covered by cells once the queue is exhausted.
        """
        corners = [(0, 0)]
        while corners:
            top, left = corners.pop(0)
            # Skip corners on the bottom/right edge and corners lying inside
            # an area that has already been scanned.
            if top == self.bottom or left == self.right \
                  or top <= self.done[left]:
                continue
            result = self.scan_cell(top, left)
            if not result:
                continue
            bottom, right, rowseps, colseps = result
            update_dict_of_lists(self.rowseps, rowseps)
            update_dict_of_lists(self.colseps, colseps)
            self.mark_done(top, left, bottom, right)
            cellblock = self.block.get_2D_block(top + 1, left + 1,
                                                bottom, right)
            cellblock.disconnect()      # lines in cell can't sync with parent
            cellblock.replace(self.double_width_pad_char, '')
            self.cells.append((top, left, bottom, right, cellblock))
            corners.extend([(top, right), (bottom, left)])
            corners.sort()              # keep top-to-bottom, left-to-right
        if not self.check_parse_complete():
            raise TableMarkupError('Malformed table; parse incomplete.')

    def mark_done(self, top, left, bottom, right):
        """
        For keeping track of how much of each text column has been seen.

        Record, for every text column from `left` up to (not including)
        `right`, that lines through `bottom` - 1 are covered.  Each of those
        columns is asserted to have been covered exactly through `top` - 1.
        """
        before = top - 1
        after = bottom - 1
        for col in range(left, right):
            assert self.done[col] == before
            self.done[col] = after

    def check_parse_complete(self):
        """
        Each text column should have been completely seen.

        Return 1 if every column is covered through the line above the
        bottom border, None otherwise.
        """
        last = self.bottom - 1
        for col in range(self.right):
            if self.done[col] != last:
                return None
        return 1

    def scan_cell(self, top, left):
        """
        Starting at the top-left corner, start tracing out a cell.

        Return ``(bottom, right, rowseps, colseps)`` for the traced cell, or
        None if no rectangular cell starts at this corner.
        """
        assert self.block[top][left] == '+'
        return self.scan_right(top, left)

    def scan_right(self, top, left):
        """
        Look for the top-right corner of the cell, and make note of all column
        boundaries ('+').

        Return ``(bottom, right, rowseps, colseps)`` for the first '+' from
        which the rest of the cell border can be traced, or None.
        """
        colseps = {}
        line = self.block[top]
        for i in range(left + 1, self.right + 1):
            if line[i] == '+':
                colseps[i] = [top]
                result = self.scan_down(top, left, i)
                if result:
                    bottom, rowseps, newcolseps = result
                    update_dict_of_lists(colseps, newcolseps)
                    return bottom, i, rowseps, colseps
            elif line[i] != '-':
                return None
        return None

    def scan_down(self, top, left, right):
        """
        Look for the bottom-right corner of the cell, making note of all row
        boundaries.

        Return ``(bottom, rowseps, colseps)`` for the first '+' from which
        the bottom and left borders can be traced, or None.
        """
        rowseps = {}
        for i in range(top + 1, self.bottom + 1):
            if self.block[i][right] == '+':
                rowseps[i] = [right]
                result = self.scan_left(top, left, i, right)
                if result:
                    newrowseps, colseps = result
                    update_dict_of_lists(rowseps, newrowseps)
                    return i, rowseps, colseps
            elif self.block[i][right] != '|':
                return None
        return None

    def scan_left(self, top, left, bottom, right):
        """
        Noting column boundaries, look for the bottom-left corner of the cell.
        It must line up with the starting point.

        Return ``(rowseps, colseps)`` if the bottom and left borders are
        intact, or None.
        """
        colseps = {}
        line = self.block[bottom]
        for i in range(right - 1, left, -1):
            if line[i] == '+':
                colseps[i] = [bottom]
            elif line[i] != '-':
                return None
        if line[left] != '+':
            return None
        result = self.scan_up(top, left, bottom, right)
        # An empty dict is a valid (successful) result, hence the None test.
        if result is not None:
            rowseps = result
            return rowseps, colseps
        return None

    def scan_up(self, top, left, bottom, right):
        """
        Noting row boundaries, see if we can return to the starting point.

        Return a dict of the row boundaries found on the left border (possibly
        empty), or None if the border is broken.
        """
        rowseps = {}
        for i in range(bottom - 1, top, -1):
            if self.block[i][left] == '+':
                rowseps[i] = [left]
            elif self.block[i][left] != '|':
                return None
        return rowseps

    def structure_from_cells(self):
        """
        From the data collected by `scan_cell()`, convert to the final data
        structure.

        Return ``(colspecs, headrows, bodyrows)`` as described in the class
        docstring.
        """
        # `sorted()` instead of `keys()` + in-place `sort()`: the latter
        # fails where `dict.keys()` returns a view rather than a list.
        rowseps = sorted(self.rowseps.keys())   # list of row boundaries
        rowindex = {}
        for i, boundary in enumerate(rowseps):
            rowindex[boundary] = i      # row boundary -> row number mapping
        colseps = sorted(self.colseps.keys())   # list of column boundaries
        colindex = {}
        for i, boundary in enumerate(colseps):
            colindex[boundary] = i      # column boundary -> col number map
        colspecs = [(colseps[i] - colseps[i - 1] - 1)
                    for i in range(1, len(colseps))] # list of column widths
        # prepare an empty table with the correct number of rows & columns
        onerow = [None for i in range(len(colseps) - 1)]
        rows = [onerow[:] for i in range(len(rowseps) - 1)]
        # keep track of # of cells remaining; should reduce to zero
        remaining = (len(rowseps) - 1) * (len(colseps) - 1)
        for top, left, bottom, right, block in self.cells:
            rownum = rowindex[top]
            colnum = colindex[left]
            assert rows[rownum][colnum] is None, (
                  'Cell (row %s, column %s) already used.'
                  % (rownum + 1, colnum + 1))
            morerows = rowindex[bottom] - rownum - 1
            morecols = colindex[right] - colnum - 1
            remaining -= (morerows + 1) * (morecols + 1)
            # write the cell into the table
            rows[rownum][colnum] = (morerows, morecols, top + 1, block)
        assert remaining == 0, 'Unused cells remaining.'
        if self.head_body_sep:          # separate head rows from body rows
            numheadrows = rowindex[self.head_body_sep]
            headrows = rows[:numheadrows]
            bodyrows = rows[numheadrows:]
        else:
            headrows = []
            bodyrows = rows
        return (colspecs, headrows, bodyrows)
315
316
class SimpleTableParser(TableParser):

    """
    Parse a simple table using `parse()`.

    Here's an example of a simple table::

        =====  =====
        col 1  col 2
        =====  =====
        1      Second column of row 1.
        2      Second column of row 2.
               Second line of paragraph.
        3      - Second column of row 3.

               - Second item in bullet
                 list (row 3, column 2).
        4 is a span
        ------------
        5
        =====  =====

    Top and bottom borders use '=', column span underlines use '-', column
    separation is indicated with spaces.

    Passing the above table to the `parse()` method will result in the
    following data structure, whose interpretation is the same as for
    `GridTableParser`::

        ([5, 25],
         [[(0, 0, 1, ['col 1']),
           (0, 0, 1, ['col 2'])]],
         [[(0, 0, 3, ['1']),
           (0, 0, 3, ['Second column of row 1.'])],
          [(0, 0, 4, ['2']),
           (0, 0, 4, ['Second column of row 2.',
                      'Second line of paragraph.'])],
          [(0, 0, 6, ['3']),
           (0, 0, 6, ['- Second column of row 3.',
                      '',
                      '- Second item in bullet',
                      '  list (row 3, column 2).'])],
          [(0, 1, 10, ['4 is a span'])],
          [(0, 0, 12, ['5']),
           (0, 0, 12, [''])]])
    """

    head_body_separator_pat = re.compile('=[ =]*$')
    span_pat = re.compile('-[ -]*$')

    def setup(self, block):
        """
        Initialize the parser state for the table text in `block`.

        The top and bottom borders of the working copy are rewritten with
        '-' so that they match `span_pat` like any column span underline.

        NOTE(review): `block` must support slicing into an object that
        provides `disconnect()` and `get_2D_block()` -- presumably a docutils
        `StringList`; confirm against the caller.
        """
        self.block = block[:]           # make a copy; it will be modified
        self.block.disconnect()         # don't propagate changes to parent
        # Convert top & bottom borders to column span underlines:
        self.block[0] = self.block[0].replace('=', '-')
        self.block[-1] = self.block[-1].replace('=', '-')
        self.head_body_sep = None
        self.columns = []
        self.border_end = None
        self.table = []
        self.done = [-1] * len(block[0])
        self.rowseps = {0: [0]}
        self.colseps = {0: [0]}

    def parse_table(self):
        """
        First determine the column boundaries from the top border, then
        process rows.  Each row may consist of multiple lines; accumulate
        lines until a row is complete.  Call `self.parse_row` to finish the
        job.
        """
        # Top border must fully describe all table columns.
        self.columns = self.parse_columns(self.block[0], 0)
        self.border_end = self.columns[-1][1]
        firststart, firstend = self.columns[0]
        offset = 1                      # skip top border
        start = 1                       # first line of the pending row
        text_found = False
        while offset < len(self.block):
            line = self.block[offset]
            if self.span_pat.match(line):
                # Column span underline or border; row is complete.
                self.parse_row(self.block[start:offset], start,
                               (line.rstrip(), offset))
                start = offset + 1
                text_found = False
            elif line[firststart:firstend].strip():
                # First column not blank, therefore it's a new row.
                if text_found and offset != start:
                    self.parse_row(self.block[start:offset], start)
                start = offset
                text_found = True
            elif not text_found:
                # Leading blank line(s): the row hasn't started yet.
                start = offset + 1
            offset += 1

    def parse_columns(self, line, offset):
        """
        Given a column span underline, return a list of (begin, end) pairs.

        `offset` is the line offset of `line`, used only in error messages.
        Once `self.columns` has been set from the top border, the last span
        must end where the top border ends (else `TableMarkupError` is
        raised), and its end is replaced by the current end of the table's
        last column.
        """
        cols = []
        end = 0
        while 1:
            begin = line.find('-', end)
            if begin < 0:
                break
            end = line.find(' ', begin)
            if end < 0:
                end = len(line)
            cols.append((begin, end))
        if self.columns:
            if cols[-1][1] != self.border_end:
                raise TableMarkupError('Column span incomplete at line '
                                       'offset %s.' % offset)
            # Allow for an unbounded rightmost column:
            cols[-1] = (cols[-1][0], self.columns[-1][1])
        return cols

    def init_row(self, colspec, offset):
        """
        Return a list of empty cells, one per (start, end) pair in `colspec`.

        Each cell is a list ``[0, morecols, offset, []]`` where `morecols` is
        the number of additional table columns the span covers.

        Raise `TableMarkupError` if a span does not start and end on column
        boundaries of the table.
        """
        i = 0
        cells = []
        for start, end in colspec:
            morecols = 0
            # Explicit checks rather than `assert`: assertions are stripped
            # under `python -O`, which would let misaligned spans through.
            misaligned = False
            try:
                if start != self.columns[i][0]:
                    misaligned = True
                else:
                    while end != self.columns[i][1]:
                        i += 1
                        morecols += 1
            except IndexError:
                # Ran past the last table column without finding the end.
                misaligned = True
            if misaligned:
                raise TableMarkupError('Column span alignment problem at '
                                       'line offset %s.' % (offset + 1))
            cells.append([0, morecols, offset, []])
            i += 1
        return cells

    def parse_row(self, lines, start, spanline=None):
        """
        Given the text `lines` of a row, parse it and append to `self.table`.

        The row is parsed according to the current column spec (either
        `spanline` if provided or `self.columns`).  For each column, extract
        text from each line, and check for text in column margins.  Finally,
        adjust for insignificant whitespace.

        `start` is the line offset of the first line of the row; `spanline`,
        if given, is a ``(text, line offset)`` pair.
        """
        if not (lines or spanline):
            # No new row, just blank lines.
            return
        if spanline:
            columns = self.parse_columns(*spanline)
        else:
            columns = self.columns[:]
        self.check_columns(lines, start, columns)
        row = self.init_row(columns, start)
        for i in range(len(columns)):
            col_start, col_end = columns[i]
            cellblock = lines.get_2D_block(0, col_start, len(lines), col_end)
            cellblock.disconnect()      # lines in cell can't sync with parent
            cellblock.replace(self.double_width_pad_char, '')
            row[i][3] = cellblock
        self.table.append(row)

    def check_columns(self, lines, first_line, columns):
        """
        Check for text in column margins and text overflow in the last column.
        Raise TableMarkupError if anything but whitespace is in column margins.
        Adjust the end value for the last column if there is text overflow.

        `columns` is modified in place; `self.columns[-1]` is widened as well
        when the overflowing text extends past its current end.
        """
        # "Infinite" value for a dummy last column's beginning, used to
        # check for text overflow (`sys.maxsize` rather than `sys.maxint`,
        # which no longer exists on Python 3):
        columns.append((sys.maxsize, None))
        lastcol = len(columns) - 2
        for i in range(len(columns) - 1):
            start, end = columns[i]
            nextstart = columns[i+1][0]
            for offset, line in enumerate(lines):
                if i == lastcol and line[end:].strip():
                    text = line[start:].rstrip()
                    new_end = start + len(text)
                    columns[i] = (start, new_end)
                    main_start, main_end = self.columns[-1]
                    if new_end > main_end:
                        self.columns[-1] = (main_start, new_end)
                elif line[end:nextstart].strip():
                    raise TableMarkupError('Text in column margin at line '
                                           'offset %s.' % (first_line + offset))
        columns.pop()                   # remove the dummy column again

    def structure_from_cells(self):
        """
        Return ``(colspecs, headrows, bodyrows)`` built from `self.table`.

        `colspecs` holds the width of each column.  When a head/body
        separator was found, the body starts at the first row whose line
        offset lies beyond the separator; otherwise all rows are body rows.
        """
        colspecs = [end - start for start, end in self.columns]
        first_body_row = 0
        if self.head_body_sep:
            for i in range(len(self.table)):
                if self.table[i][0][2] > self.head_body_sep:
                    first_body_row = i
                    break
        return (colspecs, self.table[:first_body_row],
                self.table[first_body_row:])
518
519
def update_dict_of_lists(master, newdata):
    """
    Merge the list values of `newdata` into `master`, in place.

    Both arguments are dictionaries whose values are lists.  For every key of
    `newdata`, its values are appended to the list stored under the same key
    in `master`; a new empty list is created first when `master` lacks the
    key.  Nothing is returned.
    """
    for key in newdata:
        if key not in master:
            master[key] = []
        master[key].extend(newdata[key])
Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。