Context Navigation

tableparser.py

リビジョン 3, 19.9 KB (コミッタ: kohda, 15 年前)
Install Unix tools http://hannonlab.cshl.edu/galaxy_unix_tools/galaxy.html

行番号
1	# Author: David Goodger
2	# Contact: goodger@users.sourceforge.net
3	# Revision: $Revision: 4152 $
4	# Date: $Date: 2005-12-08 00:46:30 +0100 (Thu, 08 Dec 2005) $
5	# Copyright: This module has been placed in the public domain.
6
7	"""
8	This module defines table parser classes,which parse plaintext-graphic tables
9	and produce a well-formed data structure suitable for building a CALS table.
10
11	:Classes:
12	- `GridTableParser`: Parse fully-formed tables represented with a grid.
13	- `SimpleTableParser`: Parse simple tables, delimited by top & bottom
14	borders.
15
16	:Exception class: `TableMarkupError`
17
18	:Function:
19	`update_dict_of_lists()`: Merge two dictionaries containing list values.
20	"""
21
22	__docformat__ = 'reStructuredText'
23
24
25	import re
26	import sys
27	from docutils import DataError
28
29
30	class TableMarkupError(DataError): pass
31
32
33	class TableParser:
34
35	"""
36	Abstract superclass for the common parts of the syntax-specific parsers.
37	"""
38
39	head_body_separator_pat = None
40	"""Matches the row separator between head rows and body rows."""
41
42	double_width_pad_char = '\x00'
43	"""Padding character for East Asian double-width text."""
44
45	def parse(self, block):
46	"""
47	Analyze the text `block` and return a table data structure.
48
49	Given a plaintext-graphic table in `block` (list of lines of text; no
50	whitespace padding), parse the table, construct and return the data
51	necessary to construct a CALS table or equivalent.
52
53	Raise `TableMarkupError` if there is any problem with the markup.
54	"""
55	self.setup(block)
56	self.find_head_body_sep()
57	self.parse_table()
58	structure = self.structure_from_cells()
59	return structure
60
61	def find_head_body_sep(self):
62	"""Look for a head/body row separator line; store the line index."""
63	for i in range(len(self.block)):
64	line = self.block[i]
65	if self.head_body_separator_pat.match(line):
66	if self.head_body_sep:
67	raise TableMarkupError(
68	'Multiple head/body row separators in table (at line '
69	'offset %s and %s); only one allowed.'
70	% (self.head_body_sep, i))
71	else:
72	self.head_body_sep = i
73	self.block[i] = line.replace('=', '-')
74	if self.head_body_sep == 0 or self.head_body_sep == (len(self.block)
75	- 1):
76	raise TableMarkupError('The head/body row separator may not be '
77	'the first or last line of the table.')
78
79
80	class GridTableParser(TableParser):
81
82	"""
83	Parse a grid table using `parse()`.
84
85	Here's an example of a grid table::
86
87	+------------------------+------------+----------+----------+
88	\| Header row, column 1 \| Header 2 \| Header 3 \| Header 4 \|
89	+========================+============+==========+==========+
90	\| body row 1, column 1 \| column 2 \| column 3 \| column 4 \|
91	+------------------------+------------+----------+----------+
92	\| body row 2 \| Cells may span columns. \|
93	+------------------------+------------+---------------------+
94	\| body row 3 \| Cells may \| - Table cells \|
95	+------------------------+ span rows. \| - contain \|
96	\| body row 4 \| \| - body elements. \|
97	+------------------------+------------+---------------------+
98
99	Intersections use '+', row separators use '-' (except for one optional
100	head/body row separator, which uses '='), and column separators use '\|'.
101
102	Passing the above table to the `parse()` method will result in the
103	following data structure::
104
105	([24, 12, 10, 10],
106	[[(0, 0, 1, ['Header row, column 1']),
107	(0, 0, 1, ['Header 2']),
108	(0, 0, 1, ['Header 3']),
109	(0, 0, 1, ['Header 4'])]],
110	[[(0, 0, 3, ['body row 1, column 1']),
111	(0, 0, 3, ['column 2']),
112	(0, 0, 3, ['column 3']),
113	(0, 0, 3, ['column 4'])],
114	[(0, 0, 5, ['body row 2']),
115	(0, 2, 5, ['Cells may span columns.']),
116	None,
117	None],
118	[(0, 0, 7, ['body row 3']),
119	(1, 0, 7, ['Cells may', 'span rows.', '']),
120	(1, 1, 7, ['- Table cells', '- contain', '- body elements.']),
121	None],
122	[(0, 0, 9, ['body row 4']), None, None, None]])
123
124	The first item is a list containing column widths (colspecs). The second
125	item is a list of head rows, and the third is a list of body rows. Each
126	row contains a list of cells. Each cell is either None (for a cell unused
127	because of another cell's span), or a tuple. A cell tuple contains four
128	items: the number of extra rows used by the cell in a vertical span
129	(morerows); the number of extra columns used by the cell in a horizontal
130	span (morecols); the line offset of the first line of the cell contents;
131	and the cell contents, a list of lines of text.
132	"""
133
134	head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$')
135
136	def setup(self, block):
137	self.block = block[:] # make a copy; it may be modified
138	self.block.disconnect() # don't propagate changes to parent
139	self.bottom = len(block) - 1
140	self.right = len(block[0]) - 1
141	self.head_body_sep = None
142	self.done = [-1] * len(block[0])
143	self.cells = []
144	self.rowseps = {0: [0]}
145	self.colseps = {0: [0]}
146
147	def parse_table(self):
148	"""
149	Start with a queue of upper-left corners, containing the upper-left
150	corner of the table itself. Trace out one rectangular cell, remember
151	it, and add its upper-right and lower-left corners to the queue of
152	potential upper-left corners of further cells. Process the queue in
153	top-to-bottom order, keeping track of how much of each text column has
154	been seen.
155
156	We'll end up knowing all the row and column boundaries, cell positions
157	and their dimensions.
158	"""
159	corners = [(0, 0)]
160	while corners:
161	top, left = corners.pop(0)
162	if top == self.bottom or left == self.right \
163	or top <= self.done[left]:
164	continue
165	result = self.scan_cell(top, left)
166	if not result:
167	continue
168	bottom, right, rowseps, colseps = result
169	update_dict_of_lists(self.rowseps, rowseps)
170	update_dict_of_lists(self.colseps, colseps)
171	self.mark_done(top, left, bottom, right)
172	cellblock = self.block.get_2D_block(top + 1, left + 1,
173	bottom, right)
174	cellblock.disconnect() # lines in cell can't sync with parent
175	cellblock.replace(self.double_width_pad_char, '')
176	self.cells.append((top, left, bottom, right, cellblock))
177	corners.extend([(top, right), (bottom, left)])
178	corners.sort()
179	if not self.check_parse_complete():
180	raise TableMarkupError('Malformed table; parse incomplete.')
181
182	def mark_done(self, top, left, bottom, right):
183	"""For keeping track of how much of each text column has been seen."""
184	before = top - 1
185	after = bottom - 1
186	for col in range(left, right):
187	assert self.done[col] == before
188	self.done[col] = after
189
190	def check_parse_complete(self):
191	"""Each text column should have been completely seen."""
192	last = self.bottom - 1
193	for col in range(self.right):
194	if self.done[col] != last:
195	return None
196	return 1
197
198	def scan_cell(self, top, left):
199	"""Starting at the top-left corner, start tracing out a cell."""
200	assert self.block[top][left] == '+'
201	result = self.scan_right(top, left)
202	return result
203
204	def scan_right(self, top, left):
205	"""
206	Look for the top-right corner of the cell, and make note of all column
207	boundaries ('+').
208	"""
209	colseps = {}
210	line = self.block[top]
211	for i in range(left + 1, self.right + 1):
212	if line[i] == '+':
213	colseps[i] = [top]
214	result = self.scan_down(top, left, i)
215	if result:
216	bottom, rowseps, newcolseps = result
217	update_dict_of_lists(colseps, newcolseps)
218	return bottom, i, rowseps, colseps
219	elif line[i] != '-':
220	return None
221	return None
222
223	def scan_down(self, top, left, right):
224	"""
225	Look for the bottom-right corner of the cell, making note of all row
226	boundaries.
227	"""
228	rowseps = {}
229	for i in range(top + 1, self.bottom + 1):
230	if self.block[i][right] == '+':
231	rowseps[i] = [right]
232	result = self.scan_left(top, left, i, right)
233	if result:
234	newrowseps, colseps = result
235	update_dict_of_lists(rowseps, newrowseps)
236	return i, rowseps, colseps
237	elif self.block[i][right] != '\|':
238	return None
239	return None
240
241	def scan_left(self, top, left, bottom, right):
242	"""
243	Noting column boundaries, look for the bottom-left corner of the cell.
244	It must line up with the starting point.
245	"""
246	colseps = {}
247	line = self.block[bottom]
248	for i in range(right - 1, left, -1):
249	if line[i] == '+':
250	colseps[i] = [bottom]
251	elif line[i] != '-':
252	return None
253	if line[left] != '+':
254	return None
255	result = self.scan_up(top, left, bottom, right)
256	if result is not None:
257	rowseps = result
258	return rowseps, colseps
259	return None
260
261	def scan_up(self, top, left, bottom, right):
262	"""
263	Noting row boundaries, see if we can return to the starting point.
264	"""
265	rowseps = {}
266	for i in range(bottom - 1, top, -1):
267	if self.block[i][left] == '+':
268	rowseps[i] = [left]
269	elif self.block[i][left] != '\|':
270	return None
271	return rowseps
272
273	def structure_from_cells(self):
274	"""
275	From the data collected by `scan_cell()`, convert to the final data
276	structure.
277	"""
278	rowseps = self.rowseps.keys() # list of row boundaries
279	rowseps.sort()
280	rowindex = {}
281	for i in range(len(rowseps)):
282	rowindex[rowseps[i]] = i # row boundary -> row number mapping
283	colseps = self.colseps.keys() # list of column boundaries
284	colseps.sort()
285	colindex = {}
286	for i in range(len(colseps)):
287	colindex[colseps[i]] = i # column boundary -> col number map
288	colspecs = [(colseps[i] - colseps[i - 1] - 1)
289	for i in range(1, len(colseps))] # list of column widths
290	# prepare an empty table with the correct number of rows & columns
291	onerow = [None for i in range(len(colseps) - 1)]
292	rows = [onerow[:] for i in range(len(rowseps) - 1)]
293	# keep track of # of cells remaining; should reduce to zero
294	remaining = (len(rowseps) - 1) * (len(colseps) - 1)
295	for top, left, bottom, right, block in self.cells:
296	rownum = rowindex[top]
297	colnum = colindex[left]
298	assert rows[rownum][colnum] is None, (
299	'Cell (row %s, column %s) already used.'
300	% (rownum + 1, colnum + 1))
301	morerows = rowindex[bottom] - rownum - 1
302	morecols = colindex[right] - colnum - 1
303	remaining -= (morerows + 1) * (morecols + 1)
304	# write the cell into the table
305	rows[rownum][colnum] = (morerows, morecols, top + 1, block)
306	assert remaining == 0, 'Unused cells remaining.'
307	if self.head_body_sep: # separate head rows from body rows
308	numheadrows = rowindex[self.head_body_sep]
309	headrows = rows[:numheadrows]
310	bodyrows = rows[numheadrows:]
311	else:
312	headrows = []
313	bodyrows = rows
314	return (colspecs, headrows, bodyrows)
315
316
317	class SimpleTableParser(TableParser):
318
319	"""
320	Parse a simple table using `parse()`.
321
322	Here's an example of a simple table::
323
324	===== =====
325	col 1 col 2
326	===== =====
327	1 Second column of row 1.
328	2 Second column of row 2.
329	Second line of paragraph.
330	3 - Second column of row 3.
331
332	- Second item in bullet
333	list (row 3, column 2).
334	4 is a span
335	------------
336	5
337	===== =====
338
339	Top and bottom borders use '=', column span underlines use '-', column
340	separation is indicated with spaces.
341
342	Passing the above table to the `parse()` method will result in the
343	following data structure, whose interpretation is the same as for
344	`GridTableParser`::
345
346	([5, 25],
347	[[(0, 0, 1, ['col 1']),
348	(0, 0, 1, ['col 2'])]],
349	[[(0, 0, 3, ['1']),
350	(0, 0, 3, ['Second column of row 1.'])],
351	[(0, 0, 4, ['2']),
352	(0, 0, 4, ['Second column of row 2.',
353	'Second line of paragraph.'])],
354	[(0, 0, 6, ['3']),
355	(0, 0, 6, ['- Second column of row 3.',
356	'',
357	'- Second item in bullet',
358	' list (row 3, column 2).'])],
359	[(0, 1, 10, ['4 is a span'])],
360	[(0, 0, 12, ['5']),
361	(0, 0, 12, [''])]])
362	"""
363
364	head_body_separator_pat = re.compile('=[ =]*$')
365	span_pat = re.compile('-[ -]*$')
366
367	def setup(self, block):
368	self.block = block[:] # make a copy; it will be modified
369	self.block.disconnect() # don't propagate changes to parent
370	# Convert top & bottom borders to column span underlines:
371	self.block[0] = self.block[0].replace('=', '-')
372	self.block[-1] = self.block[-1].replace('=', '-')
373	self.head_body_sep = None
374	self.columns = []
375	self.border_end = None
376	self.table = []
377	self.done = [-1] * len(block[0])
378	self.rowseps = {0: [0]}
379	self.colseps = {0: [0]}
380
381	def parse_table(self):
382	"""
383	First determine the column boundaries from the top border, then
384	process rows. Each row may consist of multiple lines; accumulate
385	lines until a row is complete. Call `self.parse_row` to finish the
386	job.
387	"""
388	# Top border must fully describe all table columns.
389	self.columns = self.parse_columns(self.block[0], 0)
390	self.border_end = self.columns[-1][1]
391	firststart, firstend = self.columns[0]
392	offset = 1 # skip top border
393	start = 1
394	text_found = None
395	while offset < len(self.block):
396	line = self.block[offset]
397	if self.span_pat.match(line):
398	# Column span underline or border; row is complete.
399	self.parse_row(self.block[start:offset], start,
400	(line.rstrip(), offset))
401	start = offset + 1
402	text_found = None
403	elif line[firststart:firstend].strip():
404	# First column not blank, therefore it's a new row.
405	if text_found and offset != start:
406	self.parse_row(self.block[start:offset], start)
407	start = offset
408	text_found = 1
409	elif not text_found:
410	start = offset + 1
411	offset += 1
412
413	def parse_columns(self, line, offset):
414	"""
415	Given a column span underline, return a list of (begin, end) pairs.
416	"""
417	cols = []
418	end = 0
419	while 1:
420	begin = line.find('-', end)
421	end = line.find(' ', begin)
422	if begin < 0:
423	break
424	if end < 0:
425	end = len(line)
426	cols.append((begin, end))
427	if self.columns:
428	if cols[-1][1] != self.border_end:
429	raise TableMarkupError('Column span incomplete at line '
430	'offset %s.' % offset)
431	# Allow for an unbounded rightmost column:
432	cols[-1] = (cols[-1][0], self.columns[-1][1])
433	return cols
434
435	def init_row(self, colspec, offset):
436	i = 0
437	cells = []
438	for start, end in colspec:
439	morecols = 0
440	try:
441	assert start == self.columns[i][0]
442	while end != self.columns[i][1]:
443	i += 1
444	morecols += 1
445	except (AssertionError, IndexError):
446	raise TableMarkupError('Column span alignment problem at '
447	'line offset %s.' % (offset + 1))
448	cells.append([0, morecols, offset, []])
449	i += 1
450	return cells
451
452	def parse_row(self, lines, start, spanline=None):
453	"""
454	Given the text `lines` of a row, parse it and append to `self.table`.
455
456	The row is parsed according to the current column spec (either
457	`spanline` if provided or `self.columns`). For each column, extract
458	text from each line, and check for text in column margins. Finally,
459	adjust for insigificant whitespace.
460	"""
461	if not (lines or spanline):
462	# No new row, just blank lines.
463	return
464	if spanline:
465	columns = self.parse_columns(*spanline)
466	span_offset = spanline[1]
467	else:
468	columns = self.columns[:]
469	span_offset = start
470	self.check_columns(lines, start, columns)
471	row = self.init_row(columns, start)
472	for i in range(len(columns)):
473	start, end = columns[i]
474	cellblock = lines.get_2D_block(0, start, len(lines), end)
475	cellblock.disconnect() # lines in cell can't sync with parent
476	cellblock.replace(self.double_width_pad_char, '')
477	row[i][3] = cellblock
478	self.table.append(row)
479
480	def check_columns(self, lines, first_line, columns):
481	"""
482	Check for text in column margins and text overflow in the last column.
483	Raise TableMarkupError if anything but whitespace is in column margins.
484	Adjust the end value for the last column if there is text overflow.
485	"""
486	# "Infinite" value for a dummy last column's beginning, used to
487	# check for text overflow:
488	columns.append((sys.maxint, None))
489	lastcol = len(columns) - 2
490	for i in range(len(columns) - 1):
491	start, end = columns[i]
492	nextstart = columns[i+1][0]
493	offset = 0
494	for line in lines:
495	if i == lastcol and line[end:].strip():
496	text = line[start:].rstrip()
497	new_end = start + len(text)
498	columns[i] = (start, new_end)
499	main_start, main_end = self.columns[-1]
500	if new_end > main_end:
501	self.columns[-1] = (main_start, new_end)
502	elif line[end:nextstart].strip():
503	raise TableMarkupError('Text in column margin at line '
504	'offset %s.' % (first_line + offset))
505	offset += 1
506	columns.pop()
507
508	def structure_from_cells(self):
509	colspecs = [end - start for start, end in self.columns]
510	first_body_row = 0
511	if self.head_body_sep:
512	for i in range(len(self.table)):
513	if self.table[i][0][2] > self.head_body_sep:
514	first_body_row = i
515	break
516	return (colspecs, self.table[:first_body_row],
517	self.table[first_body_row:])
518
519
520	def update_dict_of_lists(master, newdata):
521	"""
522	Extend the list values of `master` with those from `newdata`.
523
524	Both parameters must be dictionaries containing list values.
525	"""
526	for key, values in newdata.items():
527	master.setdefault(key, []).extend(values)

Note: リポジトリブラウザについてのヘルプは TracBrowser を参照してください。

Context Navigation

root/galaxy-central/eggs/docutils-0.4-py2.6.egg/docutils/parsers/rst/tableparser.py

異なるフォーマットでダウンロード: