1 | # Author: David Goodger |
---|
2 | # Contact: goodger@users.sourceforge.net |
---|
3 | # Revision: $Revision: 4152 $ |
---|
4 | # Date: $Date: 2005-12-08 00:46:30 +0100 (Thu, 08 Dec 2005) $ |
---|
5 | # Copyright: This module has been placed in the public domain. |
---|
6 | |
---|
7 | """ |
---|
8 | This module defines table parser classes,which parse plaintext-graphic tables |
---|
9 | and produce a well-formed data structure suitable for building a CALS table. |
---|
10 | |
---|
11 | :Classes: |
---|
12 | - `GridTableParser`: Parse fully-formed tables represented with a grid. |
---|
13 | - `SimpleTableParser`: Parse simple tables, delimited by top & bottom |
---|
14 | borders. |
---|
15 | |
---|
16 | :Exception class: `TableMarkupError` |
---|
17 | |
---|
18 | :Function: |
---|
19 | `update_dict_of_lists()`: Merge two dictionaries containing list values. |
---|
20 | """ |
---|
21 | |
---|
22 | __docformat__ = 'reStructuredText' |
---|
23 | |
---|
24 | |
---|
25 | import re |
---|
26 | import sys |
---|
27 | from docutils import DataError |
---|
28 | |
---|
29 | |
---|
30 | class TableMarkupError(DataError): pass |
---|
31 | |
---|
32 | |
---|
33 | class TableParser: |
---|
34 | |
---|
35 | """ |
---|
36 | Abstract superclass for the common parts of the syntax-specific parsers. |
---|
37 | """ |
---|
38 | |
---|
39 | head_body_separator_pat = None |
---|
40 | """Matches the row separator between head rows and body rows.""" |
---|
41 | |
---|
42 | double_width_pad_char = '\x00' |
---|
43 | """Padding character for East Asian double-width text.""" |
---|
44 | |
---|
45 | def parse(self, block): |
---|
46 | """ |
---|
47 | Analyze the text `block` and return a table data structure. |
---|
48 | |
---|
49 | Given a plaintext-graphic table in `block` (list of lines of text; no |
---|
50 | whitespace padding), parse the table, construct and return the data |
---|
51 | necessary to construct a CALS table or equivalent. |
---|
52 | |
---|
53 | Raise `TableMarkupError` if there is any problem with the markup. |
---|
54 | """ |
---|
55 | self.setup(block) |
---|
56 | self.find_head_body_sep() |
---|
57 | self.parse_table() |
---|
58 | structure = self.structure_from_cells() |
---|
59 | return structure |
---|
60 | |
---|
61 | def find_head_body_sep(self): |
---|
62 | """Look for a head/body row separator line; store the line index.""" |
---|
63 | for i in range(len(self.block)): |
---|
64 | line = self.block[i] |
---|
65 | if self.head_body_separator_pat.match(line): |
---|
66 | if self.head_body_sep: |
---|
67 | raise TableMarkupError( |
---|
68 | 'Multiple head/body row separators in table (at line ' |
---|
69 | 'offset %s and %s); only one allowed.' |
---|
70 | % (self.head_body_sep, i)) |
---|
71 | else: |
---|
72 | self.head_body_sep = i |
---|
73 | self.block[i] = line.replace('=', '-') |
---|
74 | if self.head_body_sep == 0 or self.head_body_sep == (len(self.block) |
---|
75 | - 1): |
---|
76 | raise TableMarkupError('The head/body row separator may not be ' |
---|
77 | 'the first or last line of the table.') |
---|
78 | |
---|
79 | |
---|
80 | class GridTableParser(TableParser): |
---|
81 | |
---|
82 | """ |
---|
83 | Parse a grid table using `parse()`. |
---|
84 | |
---|
85 | Here's an example of a grid table:: |
---|
86 | |
---|
87 | +------------------------+------------+----------+----------+ |
---|
88 | | Header row, column 1 | Header 2 | Header 3 | Header 4 | |
---|
89 | +========================+============+==========+==========+ |
---|
90 | | body row 1, column 1 | column 2 | column 3 | column 4 | |
---|
91 | +------------------------+------------+----------+----------+ |
---|
92 | | body row 2 | Cells may span columns. | |
---|
93 | +------------------------+------------+---------------------+ |
---|
94 | | body row 3 | Cells may | - Table cells | |
---|
95 | +------------------------+ span rows. | - contain | |
---|
96 | | body row 4 | | - body elements. | |
---|
97 | +------------------------+------------+---------------------+ |
---|
98 | |
---|
99 | Intersections use '+', row separators use '-' (except for one optional |
---|
100 | head/body row separator, which uses '='), and column separators use '|'. |
---|
101 | |
---|
102 | Passing the above table to the `parse()` method will result in the |
---|
103 | following data structure:: |
---|
104 | |
---|
105 | ([24, 12, 10, 10], |
---|
106 | [[(0, 0, 1, ['Header row, column 1']), |
---|
107 | (0, 0, 1, ['Header 2']), |
---|
108 | (0, 0, 1, ['Header 3']), |
---|
109 | (0, 0, 1, ['Header 4'])]], |
---|
110 | [[(0, 0, 3, ['body row 1, column 1']), |
---|
111 | (0, 0, 3, ['column 2']), |
---|
112 | (0, 0, 3, ['column 3']), |
---|
113 | (0, 0, 3, ['column 4'])], |
---|
114 | [(0, 0, 5, ['body row 2']), |
---|
115 | (0, 2, 5, ['Cells may span columns.']), |
---|
116 | None, |
---|
117 | None], |
---|
118 | [(0, 0, 7, ['body row 3']), |
---|
119 | (1, 0, 7, ['Cells may', 'span rows.', '']), |
---|
120 | (1, 1, 7, ['- Table cells', '- contain', '- body elements.']), |
---|
121 | None], |
---|
122 | [(0, 0, 9, ['body row 4']), None, None, None]]) |
---|
123 | |
---|
124 | The first item is a list containing column widths (colspecs). The second |
---|
125 | item is a list of head rows, and the third is a list of body rows. Each |
---|
126 | row contains a list of cells. Each cell is either None (for a cell unused |
---|
127 | because of another cell's span), or a tuple. A cell tuple contains four |
---|
128 | items: the number of extra rows used by the cell in a vertical span |
---|
129 | (morerows); the number of extra columns used by the cell in a horizontal |
---|
130 | span (morecols); the line offset of the first line of the cell contents; |
---|
131 | and the cell contents, a list of lines of text. |
---|
132 | """ |
---|
133 | |
---|
134 | head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$') |
---|
135 | |
---|
136 | def setup(self, block): |
---|
137 | self.block = block[:] # make a copy; it may be modified |
---|
138 | self.block.disconnect() # don't propagate changes to parent |
---|
139 | self.bottom = len(block) - 1 |
---|
140 | self.right = len(block[0]) - 1 |
---|
141 | self.head_body_sep = None |
---|
142 | self.done = [-1] * len(block[0]) |
---|
143 | self.cells = [] |
---|
144 | self.rowseps = {0: [0]} |
---|
145 | self.colseps = {0: [0]} |
---|
146 | |
---|
147 | def parse_table(self): |
---|
148 | """ |
---|
149 | Start with a queue of upper-left corners, containing the upper-left |
---|
150 | corner of the table itself. Trace out one rectangular cell, remember |
---|
151 | it, and add its upper-right and lower-left corners to the queue of |
---|
152 | potential upper-left corners of further cells. Process the queue in |
---|
153 | top-to-bottom order, keeping track of how much of each text column has |
---|
154 | been seen. |
---|
155 | |
---|
156 | We'll end up knowing all the row and column boundaries, cell positions |
---|
157 | and their dimensions. |
---|
158 | """ |
---|
159 | corners = [(0, 0)] |
---|
160 | while corners: |
---|
161 | top, left = corners.pop(0) |
---|
162 | if top == self.bottom or left == self.right \ |
---|
163 | or top <= self.done[left]: |
---|
164 | continue |
---|
165 | result = self.scan_cell(top, left) |
---|
166 | if not result: |
---|
167 | continue |
---|
168 | bottom, right, rowseps, colseps = result |
---|
169 | update_dict_of_lists(self.rowseps, rowseps) |
---|
170 | update_dict_of_lists(self.colseps, colseps) |
---|
171 | self.mark_done(top, left, bottom, right) |
---|
172 | cellblock = self.block.get_2D_block(top + 1, left + 1, |
---|
173 | bottom, right) |
---|
174 | cellblock.disconnect() # lines in cell can't sync with parent |
---|
175 | cellblock.replace(self.double_width_pad_char, '') |
---|
176 | self.cells.append((top, left, bottom, right, cellblock)) |
---|
177 | corners.extend([(top, right), (bottom, left)]) |
---|
178 | corners.sort() |
---|
179 | if not self.check_parse_complete(): |
---|
180 | raise TableMarkupError('Malformed table; parse incomplete.') |
---|
181 | |
---|
182 | def mark_done(self, top, left, bottom, right): |
---|
183 | """For keeping track of how much of each text column has been seen.""" |
---|
184 | before = top - 1 |
---|
185 | after = bottom - 1 |
---|
186 | for col in range(left, right): |
---|
187 | assert self.done[col] == before |
---|
188 | self.done[col] = after |
---|
189 | |
---|
190 | def check_parse_complete(self): |
---|
191 | """Each text column should have been completely seen.""" |
---|
192 | last = self.bottom - 1 |
---|
193 | for col in range(self.right): |
---|
194 | if self.done[col] != last: |
---|
195 | return None |
---|
196 | return 1 |
---|
197 | |
---|
198 | def scan_cell(self, top, left): |
---|
199 | """Starting at the top-left corner, start tracing out a cell.""" |
---|
200 | assert self.block[top][left] == '+' |
---|
201 | result = self.scan_right(top, left) |
---|
202 | return result |
---|
203 | |
---|
204 | def scan_right(self, top, left): |
---|
205 | """ |
---|
206 | Look for the top-right corner of the cell, and make note of all column |
---|
207 | boundaries ('+'). |
---|
208 | """ |
---|
209 | colseps = {} |
---|
210 | line = self.block[top] |
---|
211 | for i in range(left + 1, self.right + 1): |
---|
212 | if line[i] == '+': |
---|
213 | colseps[i] = [top] |
---|
214 | result = self.scan_down(top, left, i) |
---|
215 | if result: |
---|
216 | bottom, rowseps, newcolseps = result |
---|
217 | update_dict_of_lists(colseps, newcolseps) |
---|
218 | return bottom, i, rowseps, colseps |
---|
219 | elif line[i] != '-': |
---|
220 | return None |
---|
221 | return None |
---|
222 | |
---|
223 | def scan_down(self, top, left, right): |
---|
224 | """ |
---|
225 | Look for the bottom-right corner of the cell, making note of all row |
---|
226 | boundaries. |
---|
227 | """ |
---|
228 | rowseps = {} |
---|
229 | for i in range(top + 1, self.bottom + 1): |
---|
230 | if self.block[i][right] == '+': |
---|
231 | rowseps[i] = [right] |
---|
232 | result = self.scan_left(top, left, i, right) |
---|
233 | if result: |
---|
234 | newrowseps, colseps = result |
---|
235 | update_dict_of_lists(rowseps, newrowseps) |
---|
236 | return i, rowseps, colseps |
---|
237 | elif self.block[i][right] != '|': |
---|
238 | return None |
---|
239 | return None |
---|
240 | |
---|
241 | def scan_left(self, top, left, bottom, right): |
---|
242 | """ |
---|
243 | Noting column boundaries, look for the bottom-left corner of the cell. |
---|
244 | It must line up with the starting point. |
---|
245 | """ |
---|
246 | colseps = {} |
---|
247 | line = self.block[bottom] |
---|
248 | for i in range(right - 1, left, -1): |
---|
249 | if line[i] == '+': |
---|
250 | colseps[i] = [bottom] |
---|
251 | elif line[i] != '-': |
---|
252 | return None |
---|
253 | if line[left] != '+': |
---|
254 | return None |
---|
255 | result = self.scan_up(top, left, bottom, right) |
---|
256 | if result is not None: |
---|
257 | rowseps = result |
---|
258 | return rowseps, colseps |
---|
259 | return None |
---|
260 | |
---|
261 | def scan_up(self, top, left, bottom, right): |
---|
262 | """ |
---|
263 | Noting row boundaries, see if we can return to the starting point. |
---|
264 | """ |
---|
265 | rowseps = {} |
---|
266 | for i in range(bottom - 1, top, -1): |
---|
267 | if self.block[i][left] == '+': |
---|
268 | rowseps[i] = [left] |
---|
269 | elif self.block[i][left] != '|': |
---|
270 | return None |
---|
271 | return rowseps |
---|
272 | |
---|
273 | def structure_from_cells(self): |
---|
274 | """ |
---|
275 | From the data collected by `scan_cell()`, convert to the final data |
---|
276 | structure. |
---|
277 | """ |
---|
278 | rowseps = self.rowseps.keys() # list of row boundaries |
---|
279 | rowseps.sort() |
---|
280 | rowindex = {} |
---|
281 | for i in range(len(rowseps)): |
---|
282 | rowindex[rowseps[i]] = i # row boundary -> row number mapping |
---|
283 | colseps = self.colseps.keys() # list of column boundaries |
---|
284 | colseps.sort() |
---|
285 | colindex = {} |
---|
286 | for i in range(len(colseps)): |
---|
287 | colindex[colseps[i]] = i # column boundary -> col number map |
---|
288 | colspecs = [(colseps[i] - colseps[i - 1] - 1) |
---|
289 | for i in range(1, len(colseps))] # list of column widths |
---|
290 | # prepare an empty table with the correct number of rows & columns |
---|
291 | onerow = [None for i in range(len(colseps) - 1)] |
---|
292 | rows = [onerow[:] for i in range(len(rowseps) - 1)] |
---|
293 | # keep track of # of cells remaining; should reduce to zero |
---|
294 | remaining = (len(rowseps) - 1) * (len(colseps) - 1) |
---|
295 | for top, left, bottom, right, block in self.cells: |
---|
296 | rownum = rowindex[top] |
---|
297 | colnum = colindex[left] |
---|
298 | assert rows[rownum][colnum] is None, ( |
---|
299 | 'Cell (row %s, column %s) already used.' |
---|
300 | % (rownum + 1, colnum + 1)) |
---|
301 | morerows = rowindex[bottom] - rownum - 1 |
---|
302 | morecols = colindex[right] - colnum - 1 |
---|
303 | remaining -= (morerows + 1) * (morecols + 1) |
---|
304 | # write the cell into the table |
---|
305 | rows[rownum][colnum] = (morerows, morecols, top + 1, block) |
---|
306 | assert remaining == 0, 'Unused cells remaining.' |
---|
307 | if self.head_body_sep: # separate head rows from body rows |
---|
308 | numheadrows = rowindex[self.head_body_sep] |
---|
309 | headrows = rows[:numheadrows] |
---|
310 | bodyrows = rows[numheadrows:] |
---|
311 | else: |
---|
312 | headrows = [] |
---|
313 | bodyrows = rows |
---|
314 | return (colspecs, headrows, bodyrows) |
---|
315 | |
---|
316 | |
---|
317 | class SimpleTableParser(TableParser): |
---|
318 | |
---|
319 | """ |
---|
320 | Parse a simple table using `parse()`. |
---|
321 | |
---|
322 | Here's an example of a simple table:: |
---|
323 | |
---|
324 | ===== ===== |
---|
325 | col 1 col 2 |
---|
326 | ===== ===== |
---|
327 | 1 Second column of row 1. |
---|
328 | 2 Second column of row 2. |
---|
329 | Second line of paragraph. |
---|
330 | 3 - Second column of row 3. |
---|
331 | |
---|
332 | - Second item in bullet |
---|
333 | list (row 3, column 2). |
---|
334 | 4 is a span |
---|
335 | ------------ |
---|
336 | 5 |
---|
337 | ===== ===== |
---|
338 | |
---|
339 | Top and bottom borders use '=', column span underlines use '-', column |
---|
340 | separation is indicated with spaces. |
---|
341 | |
---|
342 | Passing the above table to the `parse()` method will result in the |
---|
343 | following data structure, whose interpretation is the same as for |
---|
344 | `GridTableParser`:: |
---|
345 | |
---|
346 | ([5, 25], |
---|
347 | [[(0, 0, 1, ['col 1']), |
---|
348 | (0, 0, 1, ['col 2'])]], |
---|
349 | [[(0, 0, 3, ['1']), |
---|
350 | (0, 0, 3, ['Second column of row 1.'])], |
---|
351 | [(0, 0, 4, ['2']), |
---|
352 | (0, 0, 4, ['Second column of row 2.', |
---|
353 | 'Second line of paragraph.'])], |
---|
354 | [(0, 0, 6, ['3']), |
---|
355 | (0, 0, 6, ['- Second column of row 3.', |
---|
356 | '', |
---|
357 | '- Second item in bullet', |
---|
358 | ' list (row 3, column 2).'])], |
---|
359 | [(0, 1, 10, ['4 is a span'])], |
---|
360 | [(0, 0, 12, ['5']), |
---|
361 | (0, 0, 12, [''])]]) |
---|
362 | """ |
---|
363 | |
---|
364 | head_body_separator_pat = re.compile('=[ =]*$') |
---|
365 | span_pat = re.compile('-[ -]*$') |
---|
366 | |
---|
367 | def setup(self, block): |
---|
368 | self.block = block[:] # make a copy; it will be modified |
---|
369 | self.block.disconnect() # don't propagate changes to parent |
---|
370 | # Convert top & bottom borders to column span underlines: |
---|
371 | self.block[0] = self.block[0].replace('=', '-') |
---|
372 | self.block[-1] = self.block[-1].replace('=', '-') |
---|
373 | self.head_body_sep = None |
---|
374 | self.columns = [] |
---|
375 | self.border_end = None |
---|
376 | self.table = [] |
---|
377 | self.done = [-1] * len(block[0]) |
---|
378 | self.rowseps = {0: [0]} |
---|
379 | self.colseps = {0: [0]} |
---|
380 | |
---|
381 | def parse_table(self): |
---|
382 | """ |
---|
383 | First determine the column boundaries from the top border, then |
---|
384 | process rows. Each row may consist of multiple lines; accumulate |
---|
385 | lines until a row is complete. Call `self.parse_row` to finish the |
---|
386 | job. |
---|
387 | """ |
---|
388 | # Top border must fully describe all table columns. |
---|
389 | self.columns = self.parse_columns(self.block[0], 0) |
---|
390 | self.border_end = self.columns[-1][1] |
---|
391 | firststart, firstend = self.columns[0] |
---|
392 | offset = 1 # skip top border |
---|
393 | start = 1 |
---|
394 | text_found = None |
---|
395 | while offset < len(self.block): |
---|
396 | line = self.block[offset] |
---|
397 | if self.span_pat.match(line): |
---|
398 | # Column span underline or border; row is complete. |
---|
399 | self.parse_row(self.block[start:offset], start, |
---|
400 | (line.rstrip(), offset)) |
---|
401 | start = offset + 1 |
---|
402 | text_found = None |
---|
403 | elif line[firststart:firstend].strip(): |
---|
404 | # First column not blank, therefore it's a new row. |
---|
405 | if text_found and offset != start: |
---|
406 | self.parse_row(self.block[start:offset], start) |
---|
407 | start = offset |
---|
408 | text_found = 1 |
---|
409 | elif not text_found: |
---|
410 | start = offset + 1 |
---|
411 | offset += 1 |
---|
412 | |
---|
413 | def parse_columns(self, line, offset): |
---|
414 | """ |
---|
415 | Given a column span underline, return a list of (begin, end) pairs. |
---|
416 | """ |
---|
417 | cols = [] |
---|
418 | end = 0 |
---|
419 | while 1: |
---|
420 | begin = line.find('-', end) |
---|
421 | end = line.find(' ', begin) |
---|
422 | if begin < 0: |
---|
423 | break |
---|
424 | if end < 0: |
---|
425 | end = len(line) |
---|
426 | cols.append((begin, end)) |
---|
427 | if self.columns: |
---|
428 | if cols[-1][1] != self.border_end: |
---|
429 | raise TableMarkupError('Column span incomplete at line ' |
---|
430 | 'offset %s.' % offset) |
---|
431 | # Allow for an unbounded rightmost column: |
---|
432 | cols[-1] = (cols[-1][0], self.columns[-1][1]) |
---|
433 | return cols |
---|
434 | |
---|
435 | def init_row(self, colspec, offset): |
---|
436 | i = 0 |
---|
437 | cells = [] |
---|
438 | for start, end in colspec: |
---|
439 | morecols = 0 |
---|
440 | try: |
---|
441 | assert start == self.columns[i][0] |
---|
442 | while end != self.columns[i][1]: |
---|
443 | i += 1 |
---|
444 | morecols += 1 |
---|
445 | except (AssertionError, IndexError): |
---|
446 | raise TableMarkupError('Column span alignment problem at ' |
---|
447 | 'line offset %s.' % (offset + 1)) |
---|
448 | cells.append([0, morecols, offset, []]) |
---|
449 | i += 1 |
---|
450 | return cells |
---|
451 | |
---|
452 | def parse_row(self, lines, start, spanline=None): |
---|
453 | """ |
---|
454 | Given the text `lines` of a row, parse it and append to `self.table`. |
---|
455 | |
---|
456 | The row is parsed according to the current column spec (either |
---|
457 | `spanline` if provided or `self.columns`). For each column, extract |
---|
458 | text from each line, and check for text in column margins. Finally, |
---|
459 | adjust for insigificant whitespace. |
---|
460 | """ |
---|
461 | if not (lines or spanline): |
---|
462 | # No new row, just blank lines. |
---|
463 | return |
---|
464 | if spanline: |
---|
465 | columns = self.parse_columns(*spanline) |
---|
466 | span_offset = spanline[1] |
---|
467 | else: |
---|
468 | columns = self.columns[:] |
---|
469 | span_offset = start |
---|
470 | self.check_columns(lines, start, columns) |
---|
471 | row = self.init_row(columns, start) |
---|
472 | for i in range(len(columns)): |
---|
473 | start, end = columns[i] |
---|
474 | cellblock = lines.get_2D_block(0, start, len(lines), end) |
---|
475 | cellblock.disconnect() # lines in cell can't sync with parent |
---|
476 | cellblock.replace(self.double_width_pad_char, '') |
---|
477 | row[i][3] = cellblock |
---|
478 | self.table.append(row) |
---|
479 | |
---|
480 | def check_columns(self, lines, first_line, columns): |
---|
481 | """ |
---|
482 | Check for text in column margins and text overflow in the last column. |
---|
483 | Raise TableMarkupError if anything but whitespace is in column margins. |
---|
484 | Adjust the end value for the last column if there is text overflow. |
---|
485 | """ |
---|
486 | # "Infinite" value for a dummy last column's beginning, used to |
---|
487 | # check for text overflow: |
---|
488 | columns.append((sys.maxint, None)) |
---|
489 | lastcol = len(columns) - 2 |
---|
490 | for i in range(len(columns) - 1): |
---|
491 | start, end = columns[i] |
---|
492 | nextstart = columns[i+1][0] |
---|
493 | offset = 0 |
---|
494 | for line in lines: |
---|
495 | if i == lastcol and line[end:].strip(): |
---|
496 | text = line[start:].rstrip() |
---|
497 | new_end = start + len(text) |
---|
498 | columns[i] = (start, new_end) |
---|
499 | main_start, main_end = self.columns[-1] |
---|
500 | if new_end > main_end: |
---|
501 | self.columns[-1] = (main_start, new_end) |
---|
502 | elif line[end:nextstart].strip(): |
---|
503 | raise TableMarkupError('Text in column margin at line ' |
---|
504 | 'offset %s.' % (first_line + offset)) |
---|
505 | offset += 1 |
---|
506 | columns.pop() |
---|
507 | |
---|
508 | def structure_from_cells(self): |
---|
509 | colspecs = [end - start for start, end in self.columns] |
---|
510 | first_body_row = 0 |
---|
511 | if self.head_body_sep: |
---|
512 | for i in range(len(self.table)): |
---|
513 | if self.table[i][0][2] > self.head_body_sep: |
---|
514 | first_body_row = i |
---|
515 | break |
---|
516 | return (colspecs, self.table[:first_body_row], |
---|
517 | self.table[first_body_row:]) |
---|
518 | |
---|
519 | |
---|
520 | def update_dict_of_lists(master, newdata): |
---|
521 | """ |
---|
522 | Extend the list values of `master` with those from `newdata`. |
---|
523 | |
---|
524 | Both parameters must be dictionaries containing list values. |
---|
525 | """ |
---|
526 | for key, values in newdata.items(): |
---|
527 | master.setdefault(key, []).extend(values) |
---|