#! /usr/bin/env python3
# :Copyright: © 2024 Günter Milde.
# :License: Released under the terms of the `2-Clause BSD license`_, in short:
#
#    Copying and distribution of this file, with or without modification,
#    are permitted in any medium without royalty provided the copyright
#    notice and this notice are preserved.
#    This file is offered as-is, without any warranty.
#
# .. _2-Clause BSD license: https://opensource.org/licenses/BSD-2-Clause
#
# Revision: $Revision: 9981 $
# Date: $Date: 2024-11-14 17:01:33 +0100 (Do, 14. Nov 2024) $

"""A Docutils-XML parser.

   Provisional: The API is not fixed yet.
   Defined objects may be renamed or changed
   in any Docutils release without prior notice.
"""

import re
import xml.etree.ElementTree as ET

from docutils import frontend, nodes, parsers, utils


class Parser(parsers.Parser):

    """A Docutils-XML parser."""

    supported = ('xml', 'docutils-xml')
    """Aliases this parser supports."""

    config_section = 'xml parser'
    config_section_dependencies = ('parsers',)
    settings_default_overrides = {'doctitle_xform': False,
                                  'validate': True,
                                  }

    def parse(self, inputstring, document) -> None:
        """
        Parse `inputstring` and populate `document`, a "document tree".

        If the root element of the input is not a <document>, the
        resulting node is appended to `document`.

        Provisional.
        """
        self.setup_parse(inputstring, document)
        node = parse_element(inputstring, document)
        if not isinstance(node, nodes.document):
            document.append(node)
        self.finish_parse()


class Unknown(nodes.Special, nodes.Inline, nodes.Element):
    """An unknown element found by the XML parser."""
    content_model = (((nodes.Element, nodes.Text), '*'),)  # no restrictions


def parse_element(inputstring, document=None):
    """
    Parse `inputstring` as "Docutils XML", return `nodes.Element` instance.

    :inputstring: XML source.
    :document: `nodes.document` instance (default: a new dummy instance).
               Provides settings and reporter.
               Populated and returned, if the inputstring's root element
               is <document>.

    Raises `xml.etree.ElementTree.ParseError` on malformed XML if no
    `document` is given; otherwise the error is reported via
    `document.reporter` and the elements parsed so far are converted.

    Caution:
      The function does not detect invalid XML.

      To check the validity of the returned node,
      you may use its `validate()` method::

        node = parse_element('<paragraph>text</paragraph>')
        node.validate()

    Provisional.
    """
    root = None
    parser = ET.XMLPullParser(events=('start',))
    # Feed the source line by line so that every element can be tagged
    # with the number of the line on which its start tag was completed.
    for i, line in enumerate(inputstring.splitlines(keepends=True)):
        try:
            parser.feed(line)
            for event, element in parser.read_events():
                if root is None:
                    root = element
                # Bookkeeping attribute, consumed by `element2node()`.
                element.attrib['source line'] = str(i+1)
        except ET.ParseError as e:
            if document is None:
                raise
            document.reporter.error(f'XML parse error: {e}.',
                                    source=document.settings._source,
                                    line=e.position[0])
            break
    return element2node(root, document)


def element2node(element, document=None, unindent=True):
    """
    Convert an `etree` element and its children to Docutils doctree nodes.

    :element:  `xml.etree` element
    :document: see `parse_element()`
    :unindent: Remove formatting indentation of follow-up text lines?
               Cf. `append_text()`.
               TODO: do we need an "unindent" configuration setting?

    Return a `docutils.nodes.Element` instance.
    If `element` is None, return a paragraph with a "problematic" node.

    Internal.
    """
    if document is None:
        document = utils.new_document('xml input',
                                      frontend.get_default_settings(Parser))
        document.source = 'xml input'
    if element is None:
        problem = nodes.problematic('', 'No XML element found.')
        return nodes.paragraph('', '', problem)
    # Get the corresponding `nodes.Element` instance:
    try:
        nodeclass = getattr(nodes, element.tag)
        if not issubclass(nodeclass, nodes.Element):
            nodeclass = Unknown
    except (AttributeError, TypeError):
        # AttributeError: no object of that name in `docutils.nodes`.
        # TypeError: the name refers to a non-class object (function,
        # imported module, ...), which `issubclass()` rejects.
        nodeclass = Unknown
    if nodeclass == nodes.document:
        node = document
        document.source = document.source or document.settings._source
    else:
        node = nodeclass()
    # The "source line" attribute is added by `parse_element()`;
    # elements from other sources may not have it.
    source_line = element.get('source line')
    if source_line is not None:
        node.line = int(source_line)
    if isinstance(node, Unknown):
        node.tagname = element.tag
        document.reporter.warning(
            f'Unknown element type <{element.tag}>.',
            base_node=node)

    # Attributes: convert and add to `node.attributes`.
    for key, value in element.items():
        if key.startswith('{') or key == 'source line':
            continue  # skip duplicate attributes with namespace URL
        try:
            node.attributes[key] = nodes.ATTRIBUTE_VALIDATORS[key](value)
        except (ValueError, KeyError):
            # Unknown attribute or value that fails conversion:
            # store the raw value (split into a list for list attributes).
            if key in node.list_attributes:
                value = value.split()
            node.attributes[key] = value  # node becomes invalid!

    # Bookkeeping (register some elements/attributes in document-wide lists)
    if isinstance(node, nodes.decoration):
        document.decoration = node
    elif isinstance(node, nodes.substitution_definition):
        document.note_substitution_def(node, ' '.join(node['names']),
                                       document)
    if node['ids']:  # register, check for duplicates
        document.set_id(node)
    # TODO: anything missing?

    # Append content:
    # update "unindent" flag: change line indentation?
    unindent = unindent and not isinstance(
        node, (nodes.FixedTextElement, nodes.literal, Unknown))
    # (leading) text
    append_text(node, element.text, unindent)
    # children and their tailing text
    for child in element:
        node.append(element2node(child, document, unindent))
        # Text after a child node
        append_text(node, child.tail, unindent)

    return node


def append_text(node, text, unindent) -> None:
    """
    Format `text`, wrap it in a `nodes.Text` and append to `node`.

    :node:     `nodes.Element` instance that receives the text.
    :text:     character data (may be None or empty: nothing is appended).
    :unindent: if true, remove spaces following a newline.

    For nodes that are no `nodes.TextElement`, whitespace-only text is
    skipped and other text is appended stripped.
    """
    if not text:
        return
    if unindent:
        text = re.sub('\n +', '\n', text)
    if isinstance(node, nodes.TextElement):
        node.append(nodes.Text(text))
    elif text.strip():
        # no TextElement: ignore formatting whitespace
        # but append other text (node becomes invalid!)
        node.append(nodes.Text(text.strip()))