From 27199452ba5c232e0d8c43ab22aa1bf83aad6c64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Noelia=20Ruiz=20Mart=C3=ADnez?= Date: Thu, 2 Nov 2023 06:38:38 +0100 Subject: [PATCH] Use xml from Python 3.11 --- .../readFeeds/xml/dom/expatbuilder.py | 8 +- .../readFeeds/xml/dom/minidom.py | 58 +- .../readFeeds/xml/dom/pulldom.py | 6 - .../readFeeds/xml/dom/xmlbuilder.py | 1 - .../readFeeds/xml/etree/ElementInclude.py | 58 +- .../readFeeds/xml/etree/ElementPath.py | 160 ++++- .../readFeeds/xml/etree/ElementTree.py | 677 ++++++++++++++---- .../readFeeds/xml/etree/__init__.py | 2 +- .../readFeeds/xml/sax/__init__.py | 10 +- .../readFeeds/xml/sax/expatreader.py | 14 +- .../readFeeds/xml/sax/handler.py | 45 ++ .../readFeeds/xml/sax/saxutils.py | 5 +- 12 files changed, 837 insertions(+), 207 deletions(-) diff --git a/addon/globalPlugins/readFeeds/xml/dom/expatbuilder.py b/addon/globalPlugins/readFeeds/xml/dom/expatbuilder.py index 2bd835b0..199c22d0 100644 --- a/addon/globalPlugins/readFeeds/xml/dom/expatbuilder.py +++ b/addon/globalPlugins/readFeeds/xml/dom/expatbuilder.py @@ -204,11 +204,11 @@ def parseFile(self, file): buffer = file.read(16*1024) if not buffer: break - parser.Parse(buffer, 0) + parser.Parse(buffer, False) if first_buffer and self.document.documentElement: self._setup_subset(buffer) first_buffer = False - parser.Parse("", True) + parser.Parse(b"", True) except ParseEscape: pass doc = self.document @@ -637,7 +637,7 @@ def parseString(self, string): nsattrs = self._getNSattrs() # get ns decls from node's ancestors document = _FRAGMENT_BUILDER_TEMPLATE % (ident, subset, nsattrs) try: - parser.Parse(document, 1) + parser.Parse(document, True) except: self.reset() raise @@ -697,7 +697,7 @@ def external_entity_ref_handler(self, context, base, systemId, publicId): self.fragment = self.document.createDocumentFragment() self.curNode = self.fragment try: - parser.Parse(self._source, 1) + parser.Parse(self._source, True) finally: self.curNode = old_cur_node self.document = 
old_document diff --git a/addon/globalPlugins/readFeeds/xml/dom/minidom.py b/addon/globalPlugins/readFeeds/xml/dom/minidom.py index 24957ea1..ef8a1598 100644 --- a/addon/globalPlugins/readFeeds/xml/dom/minidom.py +++ b/addon/globalPlugins/readFeeds/xml/dom/minidom.py @@ -43,10 +43,11 @@ class Node(xml.dom.Node): def __bool__(self): return True - def toxml(self, encoding=None): - return self.toprettyxml("", "", encoding) + def toxml(self, encoding=None, standalone=None): + return self.toprettyxml("", "", encoding, standalone) - def toprettyxml(self, indent="\t", newl="\n", encoding=None): + def toprettyxml(self, indent="\t", newl="\n", encoding=None, + standalone=None): if encoding is None: writer = io.StringIO() else: @@ -56,7 +57,7 @@ def toprettyxml(self, indent="\t", newl="\n", encoding=None): newline='\n') if self.nodeType == Node.DOCUMENT_NODE: # Can pass encoding only to document, to put it into XML header - self.writexml(writer, "", indent, newl, encoding) + self.writexml(writer, "", indent, newl, encoding, standalone) else: self.writexml(writer, "", indent, newl) if encoding is None: @@ -357,6 +358,8 @@ def __init__(self, qName, namespaceURI=EMPTY_NAMESPACE, localName=None, self._name = qName self.namespaceURI = namespaceURI self._prefix = prefix + if localName is not None: + self._localName = localName self.childNodes = NodeList() # Add the single child node that represents the value of the attr @@ -718,6 +721,14 @@ def unlink(self): Node.unlink(self) def getAttribute(self, attname): + """Returns the value of the specified attribute. + + Returns the value of the element's attribute named attname as + a string. An empty string is returned if the element does not + have such an attribute. Note that an empty string may also be + returned as an explicitly given attribute value, use the + hasAttribute method to distinguish these two cases. 
+ """ if self._attrs is None: return "" try: @@ -823,10 +834,16 @@ def removeAttributeNode(self, node): # Restore this since the node is still useful and otherwise # unlinked node.ownerDocument = self.ownerDocument + return node removeAttributeNodeNS = removeAttributeNode def hasAttribute(self, name): + """Checks whether the element has an attribute with the specified name. + + Returns True if the element has an attribute with the specified name. + Otherwise, returns False. + """ if self._attrs is None: return False return name in self._attrs @@ -837,6 +854,11 @@ def hasAttributeNS(self, namespaceURI, localName): return (namespaceURI, localName) in self._attrsNS def getElementsByTagName(self, name): + """Returns all descendant elements with the given tag name. + + Returns the list of all descendant elements (not direct children + only) with the specified tag name. + """ return _get_elements_by_tagName_helper(self, name, NodeList()) def getElementsByTagNameNS(self, namespaceURI, localName): @@ -847,22 +869,27 @@ def __repr__(self): return "" % (self.tagName, id(self)) def writexml(self, writer, indent="", addindent="", newl=""): + """Write an XML element to a file-like object + + Write the element to the writer object that must provide + a write method (e.g. a file or StringIO object). 
+ """ # indent = current indentation # addindent = indentation to add to higher levels # newl = newline string writer.write(indent+"<" + self.tagName) attrs = self._get_attributes() - a_names = sorted(attrs.keys()) - for a_name in a_names: + for a_name in attrs.keys(): writer.write(" %s=\"" % a_name) _write_data(writer, attrs[a_name].value) writer.write("\"") if self.childNodes: writer.write(">") if (len(self.childNodes) == 1 and - self.childNodes[0].nodeType == Node.TEXT_NODE): + self.childNodes[0].nodeType in ( + Node.TEXT_NODE, Node.CDATA_SECTION_NODE)): self.childNodes[0].writexml(writer, '', '', '') else: writer.write(newl) @@ -1786,12 +1813,17 @@ def importNode(self, node, deep): raise xml.dom.NotSupportedErr("cannot import document type nodes") return _clone_node(node, deep, self) - def writexml(self, writer, indent="", addindent="", newl="", encoding=None): - if encoding is None: - writer.write(''+newl) - else: - writer.write('%s' % ( - encoding, newl)) + def writexml(self, writer, indent="", addindent="", newl="", encoding=None, + standalone=None): + declarations = [] + + if encoding: + declarations.append(f'encoding="{encoding}"') + if standalone is not None: + declarations.append(f'standalone="{"yes" if standalone else "no"}"') + + writer.write(f'{newl}') + for node in self.childNodes: node.writexml(writer, indent, addindent, newl) diff --git a/addon/globalPlugins/readFeeds/xml/dom/pulldom.py b/addon/globalPlugins/readFeeds/xml/dom/pulldom.py index 43504f76..913141cd 100644 --- a/addon/globalPlugins/readFeeds/xml/dom/pulldom.py +++ b/addon/globalPlugins/readFeeds/xml/dom/pulldom.py @@ -216,12 +216,6 @@ def reset(self): self.parser.setFeature(xml.sax.handler.feature_namespaces, 1) self.parser.setContentHandler(self.pulldom) - def __getitem__(self, pos): - rc = self.getEvent() - if rc: - return rc - raise IndexError - def __next__(self): rc = self.getEvent() if rc: diff --git a/addon/globalPlugins/readFeeds/xml/dom/xmlbuilder.py 
b/addon/globalPlugins/readFeeds/xml/dom/xmlbuilder.py index 213ab145..8a200263 100644 --- a/addon/globalPlugins/readFeeds/xml/dom/xmlbuilder.py +++ b/addon/globalPlugins/readFeeds/xml/dom/xmlbuilder.py @@ -1,7 +1,6 @@ """Implementation of the DOM Level 3 'LS-Load' feature.""" import copy -import warnings import xml.dom from xml.dom.NodeFilter import NodeFilter diff --git a/addon/globalPlugins/readFeeds/xml/etree/ElementInclude.py b/addon/globalPlugins/readFeeds/xml/etree/ElementInclude.py index 963470e3..40a9b222 100644 --- a/addon/globalPlugins/readFeeds/xml/etree/ElementInclude.py +++ b/addon/globalPlugins/readFeeds/xml/etree/ElementInclude.py @@ -42,7 +42,7 @@ # -------------------------------------------------------------------- # Licensed to PSF under a Contributor Agreement. -# See http://www.python.org/psf/license for licensing details. +# See https://www.python.org/psf/license for licensing details. ## # Limited XInclude support for the ElementTree package. @@ -50,18 +50,28 @@ import copy from . import ElementTree +from urllib.parse import urljoin XINCLUDE = "{http://www.w3.org/2001/XInclude}" XINCLUDE_INCLUDE = XINCLUDE + "include" XINCLUDE_FALLBACK = XINCLUDE + "fallback" +# For security reasons, the inclusion depth is limited to this read-only value by default. +DEFAULT_MAX_INCLUSION_DEPTH = 6 + + ## # Fatal include error. class FatalIncludeError(SyntaxError): pass + +class LimitedRecursiveIncludeError(FatalIncludeError): + pass + + ## # Default loader. This loader reads an included resource from disk. # @@ -92,13 +102,33 @@ def default_loader(href, parse, encoding=None): # @param loader Optional resource loader. If omitted, it defaults # to {@link default_loader}. If given, it should be a callable # that implements the same interface as default_loader. +# @param base_url The base URL of the original file, to resolve +# relative include file references. +# @param max_depth The maximum number of recursive inclusions. 
+# Limited to reduce the risk of malicious content explosion. +# Pass a negative value to disable the limitation. +# @throws LimitedRecursiveIncludeError If the {@link max_depth} was exceeded. # @throws FatalIncludeError If the function fails to include a given # resource, or if the tree contains malformed XInclude elements. -# @throws OSError If the function fails to load a given resource. +# @throws IOError If the function fails to load a given resource. +# @returns the node or its replacement if it was an XInclude node -def include(elem, loader=None): +def include(elem, loader=None, base_url=None, + max_depth=DEFAULT_MAX_INCLUSION_DEPTH): + if max_depth is None: + max_depth = -1 + elif max_depth < 0: + raise ValueError("expected non-negative depth or None for 'max_depth', got %r" % max_depth) + + if hasattr(elem, 'getroot'): + elem = elem.getroot() if loader is None: loader = default_loader + + _include(elem, loader, base_url, max_depth, set()) + + +def _include(elem, loader, base_url, max_depth, _parent_hrefs): # look for xinclude elements i = 0 while i < len(elem): @@ -106,14 +136,24 @@ def include(elem, loader=None): if e.tag == XINCLUDE_INCLUDE: # process xinclude directive href = e.get("href") + if base_url: + href = urljoin(base_url, href) parse = e.get("parse", "xml") if parse == "xml": + if href in _parent_hrefs: + raise FatalIncludeError("recursive include of %s" % href) + if max_depth == 0: + raise LimitedRecursiveIncludeError( + "maximum xinclude depth reached when including file %s" % href) + _parent_hrefs.add(href) node = loader(href, parse) if node is None: raise FatalIncludeError( "cannot load %r as %r" % (href, parse) ) - node = copy.copy(node) + node = copy.copy(node) # FIXME: this makes little sense with recursive includes + _include(node, loader, href, max_depth - 1, _parent_hrefs) + _parent_hrefs.remove(href) if e.tail: node.tail = (node.tail or "") + e.tail elem[i] = node @@ -123,11 +163,13 @@ def include(elem, loader=None): raise 
FatalIncludeError( "cannot load %r as %r" % (href, parse) ) + if e.tail: + text += e.tail if i: node = elem[i-1] - node.tail = (node.tail or "") + text + (e.tail or "") + node.tail = (node.tail or "") + text else: - elem.text = (elem.text or "") + text + (e.tail or "") + elem.text = (elem.text or "") + text del elem[i] continue else: @@ -139,5 +181,5 @@ def include(elem, loader=None): "xi:fallback tag must be child of xi:include (%r)" % e.tag ) else: - include(e, loader) - i = i + 1 + _include(e, loader, base_url, max_depth, _parent_hrefs) + i += 1 diff --git a/addon/globalPlugins/readFeeds/xml/etree/ElementPath.py b/addon/globalPlugins/readFeeds/xml/etree/ElementPath.py index ef32917b..dc6bd28c 100644 --- a/addon/globalPlugins/readFeeds/xml/etree/ElementPath.py +++ b/addon/globalPlugins/readFeeds/xml/etree/ElementPath.py @@ -48,7 +48,7 @@ # -------------------------------------------------------------------- # Licensed to PSF under a Contributor Agreement. -# See http://www.python.org/psf/license for licensing details. +# See https://www.python.org/psf/license for licensing details. ## # Implementation module for XPath support. 
There's usually no reason @@ -65,24 +65,35 @@ r"//?|" r"\.\.|" r"\(\)|" + r"!=|" r"[/.*:\[\]\(\)@=])|" - r"((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|" + r"((?:\{[^}]+\})?[^/\[\]\(\)@!=\s]+)|" r"\s+" ) def xpath_tokenizer(pattern, namespaces=None): + default_namespace = namespaces.get('') if namespaces else None + parsing_attribute = False for token in xpath_tokenizer_re.findall(pattern): - tag = token[1] - if tag and tag[0] != "{" and ":" in tag: - try: + ttype, tag = token + if tag and tag[0] != "{": + if ":" in tag: prefix, uri = tag.split(":", 1) - if not namespaces: - raise KeyError - yield token[0], "{%s}%s" % (namespaces[prefix], uri) - except KeyError: - raise SyntaxError("prefix %r not found in prefix map" % prefix) from None + try: + if not namespaces: + raise KeyError + yield ttype, "{%s}%s" % (namespaces[prefix], uri) + except KeyError: + raise SyntaxError("prefix %r not found in prefix map" % prefix) from None + elif default_namespace and not parsing_attribute: + yield ttype, "{%s}%s" % (default_namespace, tag) + else: + yield token + parsing_attribute = False else: yield token + parsing_attribute = ttype == '@' + def get_parent_map(context): parent_map = context.parent_map @@ -93,13 +104,69 @@ def get_parent_map(context): parent_map[e] = p return parent_map + +def _is_wildcard_tag(tag): + return tag[:3] == '{*}' or tag[-2:] == '}*' + + +def _prepare_tag(tag): + _isinstance, _str = isinstance, str + if tag == '{*}*': + # Same as '*', but no comments or processing instructions. + # It can be a surprise that '*' includes those, but there is no + # justification for '{*}*' doing the same. + def select(context, result): + for elem in result: + if _isinstance(elem.tag, _str): + yield elem + elif tag == '{}*': + # Any tag that is not in a namespace. + def select(context, result): + for elem in result: + el_tag = elem.tag + if _isinstance(el_tag, _str) and el_tag[0] != '{': + yield elem + elif tag[:3] == '{*}': + # The tag in any (or no) namespace. 
+ suffix = tag[2:] # '}name' + no_ns = slice(-len(suffix), None) + tag = tag[3:] + def select(context, result): + for elem in result: + el_tag = elem.tag + if el_tag == tag or _isinstance(el_tag, _str) and el_tag[no_ns] == suffix: + yield elem + elif tag[-2:] == '}*': + # Any tag in the given namespace. + ns = tag[:-1] + ns_only = slice(None, len(ns)) + def select(context, result): + for elem in result: + el_tag = elem.tag + if _isinstance(el_tag, _str) and el_tag[ns_only] == ns: + yield elem + else: + raise RuntimeError(f"internal parser error, got {tag}") + return select + + def prepare_child(next, token): tag = token[1] - def select(context, result): - for elem in result: - for e in elem: - if e.tag == tag: - yield e + if _is_wildcard_tag(tag): + select_tag = _prepare_tag(tag) + def select(context, result): + def select_child(result): + for elem in result: + yield from elem + return select_tag(context, select_child(result)) + else: + if tag[:2] == '{}': + tag = tag[2:] # '{}tag' == 'tag' + def select(context, result): + for elem in result: + for e in elem: + if e.tag == tag: + yield e return select def prepare_star(next, token): @@ -124,11 +191,24 @@ def prepare_descendant(next, token): tag = token[1] else: raise SyntaxError("invalid descendant") - def select(context, result): - for elem in result: - for e in elem.iter(tag): - if e is not elem: - yield e + + if _is_wildcard_tag(tag): + select_tag = _prepare_tag(tag) + def select(context, result): + def select_child(result): + for elem in result: + for e in elem.iter(): + if e is not elem: + yield e + return select_tag(context, select_child(result)) + else: + if tag[:2] == '{}': + tag = tag[2:] # '{}tag' == 'tag' + def select(context, result): + for elem in result: + for e in elem.iter(tag): + if e is not elem: + yield e return select def prepare_parent(next, token): @@ -146,7 +226,6 @@ def select(context, result): def prepare_predicate(next, token): # FIXME: replace with real parser!!! 
refs: - # http://effbot.org/zone/simple-iterator-parser.htm # http://javascript.crockford.com/tdop/tdop.html signature = [] predicate = [] @@ -174,15 +253,19 @@ def select(context, result): if elem.get(key) is not None: yield elem return select - if signature == "@-='": - # [@attribute='value'] + if signature == "@-='" or signature == "@-!='": + # [@attribute='value'] or [@attribute!='value'] key = predicate[1] value = predicate[-1] def select(context, result): for elem in result: if elem.get(key) == value: yield elem - return select + def select_negated(context, result): + for elem in result: + if (attr_value := elem.get(key)) is not None and attr_value != value: + yield elem + return select_negated if '!=' in signature else select if signature == "-" and not re.match(r"\-?\d+$", predicate[0]): # [tag] tag = predicate[0] @@ -191,8 +274,10 @@ def select(context, result): if elem.find(tag) is not None: yield elem return select - if signature == ".='" or (signature == "-='" and not re.match(r"\-?\d+$", predicate[0])): - # [.='value'] or [tag='value'] + if signature == ".='" or signature == ".!='" or ( + (signature == "-='" or signature == "-!='") + and not re.match(r"\-?\d+$", predicate[0])): + # [.='value'] or [tag='value'] or [.!='value'] or [tag!='value'] tag = predicate[0] value = predicate[-1] if tag: @@ -202,12 +287,22 @@ def select(context, result): if "".join(e.itertext()) == value: yield elem break + def select_negated(context, result): + for elem in result: + for e in elem.iterfind(tag): + if "".join(e.itertext()) != value: + yield elem + break else: def select(context, result): for elem in result: if "".join(elem.itertext()) == value: yield elem - return select + def select_negated(context, result): + for elem in result: + if "".join(elem.itertext()) != value: + yield elem + return select_negated if '!=' in signature else select if signature == "-" or signature == "-()" or signature == "-()-": # [index] or [last()] or [last()-index] if signature == "-": @@ 
-264,10 +359,13 @@ def __init__(self, root): def iterfind(elem, path, namespaces=None): # compile selector pattern - cache_key = (path, None if namespaces is None - else tuple(sorted(namespaces.items()))) if path[-1:] == "/": path = path + "*" # implicit all (FIXME: keep this?) + + cache_key = (path,) + if namespaces: + cache_key += tuple(sorted(namespaces.items())) + try: selector = _cache[cache_key] except KeyError: @@ -318,6 +416,8 @@ def findall(elem, path, namespaces=None): def findtext(elem, path, default=None, namespaces=None): try: elem = next(iterfind(elem, path, namespaces)) - return elem.text or "" + if elem.text is None: + return "" + return elem.text except StopIteration: return default diff --git a/addon/globalPlugins/readFeeds/xml/etree/ElementTree.py b/addon/globalPlugins/readFeeds/xml/etree/ElementTree.py index 87277045..1dc80351 100644 --- a/addon/globalPlugins/readFeeds/xml/etree/ElementTree.py +++ b/addon/globalPlugins/readFeeds/xml/etree/ElementTree.py @@ -35,7 +35,7 @@ #--------------------------------------------------------------------- # Licensed to PSF under a Contributor Agreement. -# See http://www.python.org/psf/license for licensing details. +# See https://www.python.org/psf/license for licensing details. # # ElementTree # Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved. 
@@ -76,7 +76,7 @@ "dump", "Element", "ElementTree", "fromstring", "fromstringlist", - "iselement", "iterparse", + "indent", "iselement", "iterparse", "parse", "ParseError", "PI", "ProcessingInstruction", "QName", @@ -87,6 +87,7 @@ "XML", "XMLID", "XMLParser", "XMLPullParser", "register_namespace", + "canonicalize", "C14NWriterTarget", ] VERSION = "1.3.0" @@ -169,10 +170,8 @@ def __init__(self, tag, attrib={}, **extra): if not isinstance(attrib, dict): raise TypeError("attrib must be dict, not %s" % ( attrib.__class__.__name__,)) - attrib = attrib.copy() - attrib.update(extra) self.tag = tag - self.attrib = attrib + self.attrib = {**attrib, **extra} self._children = [] def __repr__(self): @@ -196,6 +195,13 @@ def copy(self): original tree. """ + warnings.warn( + "elem.copy() is deprecated. Use copy.copy(elem) instead.", + DeprecationWarning + ) + return self.__copy__() + + def __copy__(self): elem = self.makeelement(self.tag, self.attrib) elem.text = self.text elem.tail = self.tail @@ -217,11 +223,11 @@ def __getitem__(self, index): return self._children[index] def __setitem__(self, index, element): - # if isinstance(index, slice): - # for elt in element: - # assert iselement(elt) - # else: - # assert iselement(element) + if isinstance(index, slice): + for elt in element: + self._assert_is_element(elt) + else: + self._assert_is_element(element) self._children[index] = element def __delitem__(self, index): @@ -246,7 +252,7 @@ def extend(self, elements): """ for element in elements: self._assert_is_element(element) - self._children.extend(elements) + self._children.append(element) def insert(self, index, subelement): """Insert *subelement* at position *index*.""" @@ -274,19 +280,6 @@ def remove(self, subelement): # assert iselement(element) self._children.remove(subelement) - def getchildren(self): - """(Deprecated) Return all subelements. - - Elements are returned in document order. - - """ - warnings.warn( - "This method will be removed in future versions. 
" - "Use 'list(elem)' or iteration over elem instead.", - DeprecationWarning, stacklevel=2 - ) - return self._children - def find(self, path, namespaces=None): """Find first matching element by tag name or path. @@ -410,16 +403,6 @@ def iter(self, tag=None): for e in self._children: yield from e.iter(tag) - # compatibility - def getiterator(self, tag=None): - # Change for a DeprecationWarning in 1.4 - warnings.warn( - "This method will be removed in future versions. " - "Use 'elem.iter()' or 'list(elem.iter())' instead.", - PendingDeprecationWarning, stacklevel=2 - ) - return list(self.iter(tag)) - def itertext(self): """Create text iterator. @@ -452,8 +435,7 @@ def SubElement(parent, tag, attrib={}, **extra): additional attributes given as keyword arguments. """ - attrib = attrib.copy() - attrib.update(extra) + attrib = {**attrib, **extra} element = parent.makeelement(tag, attrib) parent.append(element) return element @@ -620,16 +602,6 @@ def iter(self, tag=None): # assert self._root is not None return self._root.iter(tag) - # compatibility - def getiterator(self, tag=None): - # Change for a DeprecationWarning in 1.4 - warnings.warn( - "This method will be removed in future versions. " - "Use 'tree.iter()' or 'list(tree.iter())' instead.", - PendingDeprecationWarning, stacklevel=2 - ) - return list(self.iter(tag)) - def find(self, path, namespaces=None): """Find first matching element by tag name or path. 
@@ -756,16 +728,11 @@ def write(self, file_or_filename, encoding = "utf-8" else: encoding = "us-ascii" - enc_lower = encoding.lower() - with _get_writer(file_or_filename, enc_lower) as write: + with _get_writer(file_or_filename, encoding) as (write, declared_encoding): if method == "xml" and (xml_declaration or (xml_declaration is None and - enc_lower not in ("utf-8", "us-ascii", "unicode"))): - declared_encoding = encoding - if enc_lower == "unicode": - # Retrieve the default encoding for the xml declaration - import locale - declared_encoding = locale.getpreferredencoding() + encoding.lower() != "unicode" and + declared_encoding.lower() not in ("utf-8", "us-ascii"))): write("\n" % ( declared_encoding,)) if method == "text": @@ -790,19 +757,17 @@ def _get_writer(file_or_filename, encoding): write = file_or_filename.write except AttributeError: # file_or_filename is a file name - if encoding == "unicode": - file = open(file_or_filename, "w") - else: - file = open(file_or_filename, "w", encoding=encoding, - errors="xmlcharrefreplace") - with file: - yield file.write + if encoding.lower() == "unicode": + encoding="utf-8" + with open(file_or_filename, "w", encoding=encoding, + errors="xmlcharrefreplace") as file: + yield file.write, encoding else: # file_or_filename is a file-like object # encoding determines if it is a text or binary writer - if encoding == "unicode": + if encoding.lower() == "unicode": # use a text writer as is - yield write + yield write, getattr(file_or_filename, "encoding", None) or "utf-8" else: # wrap a binary writer with TextIOWrapper with contextlib.ExitStack() as stack: @@ -833,7 +798,7 @@ def _get_writer(file_or_filename, encoding): # Keep the original file open when the TextIOWrapper is # destroyed stack.callback(file.detach) - yield file.write + yield file.write, encoding def _namespaces(elem, default_namespace=None): # identify namespaces used in this tree @@ -925,7 +890,7 @@ def _serialize_xml(write, elem, qnames, namespaces, k, 
_escape_attrib(v) )) - for k, v in sorted(items): # lexical order + for k, v in items: if isinstance(k, QName): k = k.text if isinstance(v, QName): @@ -946,13 +911,9 @@ def _serialize_xml(write, elem, qnames, namespaces, if elem.tail: write(_escape_cdata(elem.tail)) -HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr", - "img", "input", "isindex", "link", "meta", "param") - -try: - HTML_EMPTY = set(HTML_EMPTY) -except NameError: - pass +HTML_EMPTY = {"area", "base", "basefont", "br", "col", "embed", "frame", "hr", + "img", "input", "isindex", "link", "meta", "param", "source", + "track", "wbr"} def _serialize_html(write, elem, qnames, namespaces, **kwargs): tag = elem.tag @@ -981,7 +942,7 @@ def _serialize_html(write, elem, qnames, namespaces, **kwargs): k, _escape_attrib(v) )) - for k, v in sorted(items): # lexical order + for k, v in items: if isinstance(k, QName): k = k.text if isinstance(v, QName): @@ -1085,15 +1046,15 @@ def _escape_attrib(text): text = text.replace(">", ">") if "\"" in text: text = text.replace("\"", """) - # The following business with carriage returns is to satisfy - # Section 2.11 of the XML specification, stating that - # CR or CR LN should be replaced with just LN + # Although section 2.11 of the XML specification states that CR or + # CR LN should be replaced with just LN, it applies only to EOLNs + # which take part of organizing file into lines. Within attributes, + # we are replacing these with entity numbers, so they do not count. # http://www.w3.org/TR/REC-xml/#sec-line-ends - if "\r\n" in text: - text = text.replace("\r\n", "\n") + # The current solution, contained in following six lines, was + # discussed in issue 17582 and 39011. 
if "\r" in text: - text = text.replace("\r", "\n") - #The following four lines are issue 17582 + text = text.replace("\r", " ") if "\n" in text: text = text.replace("\n", " ") if "\t" in text: @@ -1118,6 +1079,7 @@ def _escape_attrib_html(text): # -------------------------------------------------------------------- def tostring(element, encoding=None, method=None, *, + xml_declaration=None, default_namespace=None, short_empty_elements=True): """Generate string representation of XML element. @@ -1126,13 +1088,17 @@ def tostring(element, encoding=None, method=None, *, *element* is an Element instance, *encoding* is an optional output encoding defaulting to US-ASCII, *method* is an optional output which can - be one of "xml" (default), "html", "text" or "c14n". + be one of "xml" (default), "html", "text" or "c14n", *default_namespace* + sets the default XML namespace (for "xmlns"). Returns an (optionally) encoded string containing the XML data. """ stream = io.StringIO() if encoding == 'unicode' else io.BytesIO() - ElementTree(element).write(stream, encoding, method=method, + ElementTree(element).write(stream, encoding, + xml_declaration=xml_declaration, + default_namespace=default_namespace, + method=method, short_empty_elements=short_empty_elements) return stream.getvalue() @@ -1154,10 +1120,14 @@ def tell(self): return len(self.lst) def tostringlist(element, encoding=None, method=None, *, + xml_declaration=None, default_namespace=None, short_empty_elements=True): lst = [] stream = _ListDataStream(lst) - ElementTree(element).write(stream, encoding, method=method, + ElementTree(element).write(stream, encoding, + xml_declaration=xml_declaration, + default_namespace=default_namespace, + method=method, short_empty_elements=short_empty_elements) return lst @@ -1180,6 +1150,57 @@ def dump(elem): if not tail or tail[-1] != "\n": sys.stdout.write("\n") + +def indent(tree, space=" ", level=0): + """Indent an XML document by inserting newlines and indentation space + after 
elements. + + *tree* is the ElementTree or Element to modify. The (root) element + itself will not be changed, but the tail text of all elements in its + subtree will be adapted. + + *space* is the whitespace to insert for each indentation level, two + space characters by default. + + *level* is the initial indentation level. Setting this to a higher + value than 0 can be used for indenting subtrees that are more deeply + nested inside of a document. + """ + if isinstance(tree, ElementTree): + tree = tree.getroot() + if level < 0: + raise ValueError(f"Initial indentation level must be >= 0, got {level}") + if not len(tree): + return + + # Reduce the memory consumption by reusing indentation strings. + indentations = ["\n" + level * space] + + def _indent_children(elem, level): + # Start a new indentation level for the first child. + child_level = level + 1 + try: + child_indentation = indentations[child_level] + except IndexError: + child_indentation = indentations[level] + space + indentations.append(child_indentation) + + if not elem.text or not elem.text.strip(): + elem.text = child_indentation + + for child in elem: + if len(child): + _indent_children(child, child_level) + if not child.tail or not child.tail.strip(): + child.tail = child_indentation + + # Dedent after the last child by overwriting the previous indentation. + if not child.tail.strip(): + child.tail = indentations[level] + + _indent_children(tree, 0) + + # -------------------------------------------------------------------- # parsing @@ -1216,8 +1237,14 @@ def iterparse(source, events=None, parser=None): # Use the internal, undocumented _parser argument for now; When the # parser argument of iterparse is removed, this can be killed. 
pullparser = XMLPullParser(events=events, _parser=parser) - def iterator(): + + def iterator(source): + close_source = False try: + if not hasattr(source, "read"): + source = open(source, "rb") + close_source = True + yield None while True: yield from pullparser.read_events() # load event buffer @@ -1233,16 +1260,12 @@ def iterator(): source.close() class IterParseIterator(collections.abc.Iterator): - __next__ = iterator().__next__ + __next__ = iterator(source).__next__ it = IterParseIterator() it.root = None del iterator, IterParseIterator - close_source = False - if not hasattr(source, "read"): - source = open(source, "rb") - close_source = True - + next(it) return it @@ -1251,7 +1274,7 @@ class XMLPullParser: def __init__(self, events=None, *, _parser=None): # The _parser argument is for internal use only and must not be relied # upon in user code. It will be removed in a future release. - # See http://bugs.python.org/issue17741 for more details. + # See https://bugs.python.org/issue17741 for more details. self._events_queue = collections.deque() self._parser = _parser or XMLParser(target=TreeBuilder()) @@ -1370,12 +1393,30 @@ class TreeBuilder: *element_factory* is an optional element factory which is called to create new Element instances, as necessary. + *comment_factory* is a factory to create comments to be used instead of + the standard factory. If *insert_comments* is false (the default), + comments will not be inserted into the tree. + + *pi_factory* is a factory to create processing instructions to be used + instead of the standard factory. If *insert_pis* is false (the default), + processing instructions will not be inserted into the tree. 
""" - def __init__(self, element_factory=None): + def __init__(self, element_factory=None, *, + comment_factory=None, pi_factory=None, + insert_comments=False, insert_pis=False): self._data = [] # data collector self._elem = [] # element stack self._last = None # last element + self._root = None # root element self._tail = None # true if we're after an end tag + if comment_factory is None: + comment_factory = Comment + self._comment_factory = comment_factory + self.insert_comments = insert_comments + if pi_factory is None: + pi_factory = ProcessingInstruction + self._pi_factory = pi_factory + self.insert_pis = insert_pis if element_factory is None: element_factory = Element self._factory = element_factory @@ -1383,8 +1424,8 @@ def __init__(self, element_factory=None): def close(self): """Flush builder buffers and return toplevel document Element.""" assert len(self._elem) == 0, "missing end tags" - assert self._last is not None, "missing toplevel element" - return self._last + assert self._root is not None, "missing toplevel element" + return self._root def _flush(self): if self._data: @@ -1413,6 +1454,8 @@ def start(self, tag, attrs): self._last = elem = self._factory(tag, attrs) if self._elem: self._elem[-1].append(elem) + elif self._root is None: + self._root = elem self._elem.append(elem) self._tail = 0 return elem @@ -1431,13 +1474,38 @@ def end(self, tag): self._tail = 1 return self._last -_sentinel = ['sentinel'] + def comment(self, text): + """Create a comment using the comment_factory. + + *text* is the text of the comment. + """ + return self._handle_single( + self._comment_factory, self.insert_comments, text) + + def pi(self, target, text=None): + """Create a processing instruction using the pi_factory. + + *target* is the target name of the processing instruction. + *text* is the data of the processing instruction, or ''. 
+ """ + return self._handle_single( + self._pi_factory, self.insert_pis, target, text) + + def _handle_single(self, factory, insert, *args): + elem = factory(*args) + if insert: + self._flush() + self._last = elem + if self._elem: + self._elem[-1].append(elem) + self._tail = 1 + return elem + # also see ElementTree and TreeBuilder class XMLParser: """Element structure builder for XML source data based on the expat parser. - *html* are predefined HTML entities (deprecated and not supported), *target* is an optional target object which defaults to an instance of the standard TreeBuilder class, *encoding* is an optional encoding string which if given, overrides the encoding specified in the XML file: @@ -1445,11 +1513,7 @@ class XMLParser: """ - def __init__(self, html=_sentinel, target=None, encoding=None): - if html is not _sentinel: - warnings.warn( - "The html argument of XMLParser() is deprecated", - DeprecationWarning, stacklevel=2) + def __init__(self, *, target=None, encoding=None): try: from xml.parsers import expat except ImportError: @@ -1473,6 +1537,10 @@ def __init__(self, html=_sentinel, target=None, encoding=None): parser.StartElementHandler = self._start if hasattr(target, 'end'): parser.EndElementHandler = self._end + if hasattr(target, 'start_ns'): + parser.StartNamespaceDeclHandler = self._start_ns + if hasattr(target, 'end_ns'): + parser.EndNamespaceDeclHandler = self._end_ns if hasattr(target, 'data'): parser.CharacterDataHandler = target.data # miscellaneous callbacks @@ -1483,7 +1551,6 @@ def __init__(self, html=_sentinel, target=None, encoding=None): # Configure pyexpat: buffering, new-style attribute handling. 
parser.buffer_text = 1 parser.ordered_attributes = 1 - parser.specified_attributes = 1 self._doctype = None self.entity = {} try: @@ -1503,7 +1570,6 @@ def _setevents(self, events_queue, events_to_report): for event_name in events_to_report: if event_name == "start": parser.ordered_attributes = 1 - parser.specified_attributes = 1 def handler(tag, attrib_in, event=event_name, append=append, start=self._start): append((event, start(tag, attrib_in))) @@ -1514,13 +1580,34 @@ def handler(tag, event=event_name, append=append, append((event, end(tag))) parser.EndElementHandler = handler elif event_name == "start-ns": - def handler(prefix, uri, event=event_name, append=append): - append((event, (prefix or "", uri or ""))) + # TreeBuilder does not implement .start_ns() + if hasattr(self.target, "start_ns"): + def handler(prefix, uri, event=event_name, append=append, + start_ns=self._start_ns): + append((event, start_ns(prefix, uri))) + else: + def handler(prefix, uri, event=event_name, append=append): + append((event, (prefix or '', uri or ''))) parser.StartNamespaceDeclHandler = handler elif event_name == "end-ns": - def handler(prefix, event=event_name, append=append): - append((event, None)) + # TreeBuilder does not implement .end_ns() + if hasattr(self.target, "end_ns"): + def handler(prefix, event=event_name, append=append, + end_ns=self._end_ns): + append((event, end_ns(prefix))) + else: + def handler(prefix, event=event_name, append=append): + append((event, None)) parser.EndNamespaceDeclHandler = handler + elif event_name == 'comment': + def handler(text, event=event_name, append=append, self=self): + append((event, self.target.comment(text))) + parser.CommentHandler = handler + elif event_name == 'pi': + def handler(pi_target, data, event=event_name, append=append, + self=self): + append((event, self.target.pi(pi_target, data))) + parser.ProcessingInstructionHandler = handler else: raise ValueError("unknown event %r" % event_name) @@ -1541,6 +1628,12 @@ def 
_fixname(self, key): self._names[key] = name return name + def _start_ns(self, prefix, uri): + return self.target.start_ns(prefix or '', uri or '') + + def _end_ns(self, prefix): + return self.target.end_ns(prefix or '') + def _start(self, tag, attr_list): # Handler for expat's StartElementHandler. Since ordered_attributes # is set, the attributes are reported as a list of alternating @@ -1602,39 +1695,25 @@ def _default(self, text): return if hasattr(self.target, "doctype"): self.target.doctype(name, pubid, system[1:-1]) - elif self.doctype != self._XMLParser__doctype: - # warn about deprecated call - self._XMLParser__doctype(name, pubid, system[1:-1]) - self.doctype(name, pubid, system[1:-1]) - self._doctype = None - - def doctype(self, name, pubid, system): - """(Deprecated) Handle doctype declaration - - *name* is the Doctype name, *pubid* is the public identifier, - and *system* is the system identifier. - - """ - warnings.warn( - "This method of XMLParser is deprecated. Define doctype() " - "method on the TreeBuilder target.", - DeprecationWarning, - ) + elif hasattr(self, "doctype"): + warnings.warn( + "The doctype() method of XMLParser is ignored. 
" + "Define doctype() method on the TreeBuilder target.", + RuntimeWarning) - # sentinel, if doctype is redefined in a subclass - __doctype = doctype + self._doctype = None def feed(self, data): """Feed encoded data to parser.""" try: - self.parser.Parse(data, 0) + self.parser.Parse(data, False) except self._error as v: self._raiseerror(v) def close(self): """Finish feeding data to parser and return element structure.""" try: - self.parser.Parse("", 1) # end of data + self.parser.Parse(b"", True) # end of data except self._error as v: self._raiseerror(v) try: @@ -1649,6 +1728,341 @@ def close(self): del self.target, self._target +# -------------------------------------------------------------------- +# C14N 2.0 + +def canonicalize(xml_data=None, *, out=None, from_file=None, **options): + """Convert XML to its C14N 2.0 serialised form. + + If *out* is provided, it must be a file or file-like object that receives + the serialised canonical XML output (text, not bytes) through its ``.write()`` + method. To write to a file, open it in text mode with encoding "utf-8". + If *out* is not provided, this function returns the output as text string. + + Either *xml_data* (an XML string) or *from_file* (a file path or + file-like object) must be provided as input. + + The configuration options are the same as for the ``C14NWriterTarget``. + """ + if xml_data is None and from_file is None: + raise ValueError("Either 'xml_data' or 'from_file' must be provided as input") + sio = None + if out is None: + sio = out = io.StringIO() + + parser = XMLParser(target=C14NWriterTarget(out.write, **options)) + + if xml_data is not None: + parser.feed(xml_data) + parser.close() + elif from_file is not None: + parse(from_file, parser=parser) + + return sio.getvalue() if sio is not None else None + + +_looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match + + +class C14NWriterTarget: + """ + Canonicalization writer target for the XMLParser. 
+ + Serialises parse events to XML C14N 2.0. + + The *write* function is used for writing out the resulting data stream + as text (not bytes). To write to a file, open it in text mode with encoding + "utf-8" and pass its ``.write`` method. + + Configuration options: + + - *with_comments*: set to true to include comments + - *strip_text*: set to true to strip whitespace before and after text content + - *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}" + - *qname_aware_tags*: a set of qname aware tag names in which prefixes + should be replaced in text content + - *qname_aware_attrs*: a set of qname aware attribute names in which prefixes + should be replaced in text content + - *exclude_attrs*: a set of attribute names that should not be serialised + - *exclude_tags*: a set of tag names that should not be serialised + """ + def __init__(self, write, *, + with_comments=False, strip_text=False, rewrite_prefixes=False, + qname_aware_tags=None, qname_aware_attrs=None, + exclude_attrs=None, exclude_tags=None): + self._write = write + self._data = [] + self._with_comments = with_comments + self._strip_text = strip_text + self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None + self._exclude_tags = set(exclude_tags) if exclude_tags else None + + self._rewrite_prefixes = rewrite_prefixes + if qname_aware_tags: + self._qname_aware_tags = set(qname_aware_tags) + else: + self._qname_aware_tags = None + if qname_aware_attrs: + self._find_qname_aware_attrs = set(qname_aware_attrs).intersection + else: + self._find_qname_aware_attrs = None + + # Stack with globally and newly declared namespaces as (uri, prefix) pairs. + self._declared_ns_stack = [[ + ("http://www.w3.org/XML/1998/namespace", "xml"), + ]] + # Stack with user declared namespace prefixes as (uri, prefix) pairs. 
+ self._ns_stack = [] + if not rewrite_prefixes: + self._ns_stack.append(list(_namespace_map.items())) + self._ns_stack.append([]) + self._prefix_map = {} + self._preserve_space = [False] + self._pending_start = None + self._root_seen = False + self._root_done = False + self._ignored_depth = 0 + + def _iter_namespaces(self, ns_stack, _reversed=reversed): + for namespaces in _reversed(ns_stack): + if namespaces: # almost no element declares new namespaces + yield from namespaces + + def _resolve_prefix_name(self, prefixed_name): + prefix, name = prefixed_name.split(':', 1) + for uri, p in self._iter_namespaces(self._ns_stack): + if p == prefix: + return f'{{{uri}}}{name}' + raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope') + + def _qname(self, qname, uri=None): + if uri is None: + uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname) + else: + tag = qname + + prefixes_seen = set() + for u, prefix in self._iter_namespaces(self._declared_ns_stack): + if u == uri and prefix not in prefixes_seen: + return f'{prefix}:{tag}' if prefix else tag, tag, uri + prefixes_seen.add(prefix) + + # Not declared yet => add new declaration. + if self._rewrite_prefixes: + if uri in self._prefix_map: + prefix = self._prefix_map[uri] + else: + prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}' + self._declared_ns_stack[-1].append((uri, prefix)) + return f'{prefix}:{tag}', tag, uri + + if not uri and '' not in prefixes_seen: + # No default namespace declared => no prefix needed. + return tag, tag, uri + + for u, prefix in self._iter_namespaces(self._ns_stack): + if u == uri: + self._declared_ns_stack[-1].append((uri, prefix)) + return f'{prefix}:{tag}' if prefix else tag, tag, uri + + if not uri: + # As soon as a default namespace is defined, + # anything that has no namespace (and thus, no prefix) goes there. 
+ return tag, tag, uri + + raise ValueError(f'Namespace "{uri}" is not declared in scope') + + def data(self, data): + if not self._ignored_depth: + self._data.append(data) + + def _flush(self, _join_text=''.join): + data = _join_text(self._data) + del self._data[:] + if self._strip_text and not self._preserve_space[-1]: + data = data.strip() + if self._pending_start is not None: + args, self._pending_start = self._pending_start, None + qname_text = data if data and _looks_like_prefix_name(data) else None + self._start(*args, qname_text) + if qname_text is not None: + return + if data and self._root_seen: + self._write(_escape_cdata_c14n(data)) + + def start_ns(self, prefix, uri): + if self._ignored_depth: + return + # we may have to resolve qnames in text content + if self._data: + self._flush() + self._ns_stack[-1].append((uri, prefix)) + + def start(self, tag, attrs): + if self._exclude_tags is not None and ( + self._ignored_depth or tag in self._exclude_tags): + self._ignored_depth += 1 + return + if self._data: + self._flush() + + new_namespaces = [] + self._declared_ns_stack.append(new_namespaces) + + if self._qname_aware_tags is not None and tag in self._qname_aware_tags: + # Need to parse text first to see if it requires a prefix declaration. + self._pending_start = (tag, attrs, new_namespaces) + return + self._start(tag, attrs, new_namespaces) + + def _start(self, tag, attrs, new_namespaces, qname_text=None): + if self._exclude_attrs is not None and attrs: + attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs} + + qnames = {tag, *attrs} + resolved_names = {} + + # Resolve prefixes in attribute and tag text. 
+ if qname_text is not None: + qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text) + qnames.add(qname) + if self._find_qname_aware_attrs is not None and attrs: + qattrs = self._find_qname_aware_attrs(attrs) + if qattrs: + for attr_name in qattrs: + value = attrs[attr_name] + if _looks_like_prefix_name(value): + qname = resolved_names[value] = self._resolve_prefix_name(value) + qnames.add(qname) + else: + qattrs = None + else: + qattrs = None + + # Assign prefixes in lexicographical order of used URIs. + parse_qname = self._qname + parsed_qnames = {n: parse_qname(n) for n in sorted( + qnames, key=lambda n: n.split('}', 1))} + + # Write namespace declarations in prefix order ... + if new_namespaces: + attr_list = [ + ('xmlns:' + prefix if prefix else 'xmlns', uri) + for uri, prefix in new_namespaces + ] + attr_list.sort() + else: + # almost always empty + attr_list = [] + + # ... followed by attributes in URI+name order + if attrs: + for k, v in sorted(attrs.items()): + if qattrs is not None and k in qattrs and v in resolved_names: + v = parsed_qnames[resolved_names[v]][0] + attr_qname, attr_name, uri = parsed_qnames[k] + # No prefix for attributes in default ('') namespace. + attr_list.append((attr_qname if uri else attr_name, v)) + + # Honour xml:space attributes. + space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space') + self._preserve_space.append( + space_behaviour == 'preserve' if space_behaviour + else self._preserve_space[-1]) + + # Write the tag. + write = self._write + write('<' + parsed_qnames[tag][0]) + if attr_list: + write(''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list])) + write('>') + + # Write the resolved qname text content. 
+ if qname_text is not None: + write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0])) + + self._root_seen = True + self._ns_stack.append([]) + + def end(self, tag): + if self._ignored_depth: + self._ignored_depth -= 1 + return + if self._data: + self._flush() + self._write(f'</{self._qname(tag)[0]}>') + self._preserve_space.pop() + self._root_done = len(self._preserve_space) == 1 + self._declared_ns_stack.pop() + self._ns_stack.pop() + + def comment(self, text): + if not self._with_comments: + return + if self._ignored_depth: + return + if self._root_done: + self._write('\n') + elif self._root_seen and self._data: + self._flush() + self._write(f'<!--{_escape_cdata_c14n(text)}-->') + if not self._root_seen: + self._write('\n') + + def pi(self, target, data): + if self._ignored_depth: + return + if self._root_done: + self._write('\n') + elif self._root_seen and self._data: + self._flush() + self._write( + f'<?{target} {_escape_cdata_c14n(data)}?>' if data else f'<?{target}?>') + if not self._root_seen: + self._write('\n') + + +def _escape_cdata_c14n(text): + # escape character data + try: + # it's worth avoiding do-nothing calls for strings that are + # shorter than 500 characters, or so. assume that's, by far, + # the most common case in most applications.
+ if '&' in text: + text = text.replace('&', '&amp;') + if '<' in text: + text = text.replace('<', '&lt;') + if '>' in text: + text = text.replace('>', '&gt;') + if '\r' in text: + text = text.replace('\r', '&#xD;') + return text + except (TypeError, AttributeError): + _raise_serialization_error(text) + + +def _escape_attrib_c14n(text): + # escape attribute value + try: + if '&' in text: + text = text.replace('&', '&amp;') + if '<' in text: + text = text.replace('<', '&lt;') + if '"' in text: + text = text.replace('"', '&quot;') + if '\t' in text: + text = text.replace('\t', '&#x9;') + if '\n' in text: + text = text.replace('\n', '&#xA;') + if '\r' in text: + text = text.replace('\r', '&#xD;') + return text + except (TypeError, AttributeError): + _raise_serialization_error(text) + + +# -------------------------------------------------------------------- + # Import the C accelerators try: # Element is going to be shadowed by the C implementation. We need to keep @@ -1656,7 +2070,10 @@ def close(self): # (see tests) _Element_Py = Element - # Element, SubElement, ParseError, TreeBuilder, XMLParser + # Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories from _elementtree import * + from _elementtree import _set_factories except ImportError: pass +else: + _set_factories(Comment, ProcessingInstruction) diff --git a/addon/globalPlugins/readFeeds/xml/etree/__init__.py b/addon/globalPlugins/readFeeds/xml/etree/__init__.py index 27fd8f6d..e2ec5342 100644 --- a/addon/globalPlugins/readFeeds/xml/etree/__init__.py +++ b/addon/globalPlugins/readFeeds/xml/etree/__init__.py @@ -30,4 +30,4 @@ # -------------------------------------------------------------------- # Licensed to PSF under a Contributor Agreement. -# See http://www.python.org/psf/license for licensing details. +# See https://www.python.org/psf/license for licensing details.
diff --git a/addon/globalPlugins/readFeeds/xml/sax/__init__.py b/addon/globalPlugins/readFeeds/xml/sax/__init__.py index 13f6cf58..17b75879 100644 --- a/addon/globalPlugins/readFeeds/xml/sax/__init__.py +++ b/addon/globalPlugins/readFeeds/xml/sax/__init__.py @@ -67,18 +67,18 @@ def parseString(string, handler, errorHandler=ErrorHandler()): default_parser_list = sys.registry.getProperty(_key).split(",") -def make_parser(parser_list = []): +def make_parser(parser_list=()): """Creates and returns a SAX parser. Creates the first parser it is able to instantiate of the ones - given in the list created by doing parser_list + - default_parser_list. The lists must contain the names of Python + given in the iterable created by chaining parser_list and + default_parser_list. The iterables must contain the names of Python modules containing both a SAX parser and a create_parser function.""" - for parser_name in parser_list + default_parser_list: + for parser_name in list(parser_list) + default_parser_list: try: return _create_parser(parser_name) - except ImportError as e: + except ImportError: import sys if parser_name in sys.modules: # The parser module was found, but importing it diff --git a/addon/globalPlugins/readFeeds/xml/sax/expatreader.py b/addon/globalPlugins/readFeeds/xml/sax/expatreader.py index 5066ffc2..e334ac9f 100644 --- a/addon/globalPlugins/readFeeds/xml/sax/expatreader.py +++ b/addon/globalPlugins/readFeeds/xml/sax/expatreader.py @@ -93,7 +93,7 @@ def __init__(self, namespaceHandling=0, bufsize=2**16-20): self._parser = None self._namespaces = namespaceHandling self._lex_handler_prop = None - self._parsing = 0 + self._parsing = False self._entity_stack = [] self._external_ges = 0 self._interning = None @@ -203,10 +203,10 @@ def setProperty(self, name, value): # IncrementalParser methods - def feed(self, data, isFinal = 0): + def feed(self, data, isFinal=False): if not self._parsing: self.reset() - self._parsing = 1 + self._parsing = True 
self._cont_handler.startDocument() try: @@ -237,13 +237,13 @@ def close(self): # If we are completing an external entity, do nothing here return try: - self.feed("", isFinal = 1) + self.feed(b"", isFinal=True) self._cont_handler.endDocument() - self._parsing = 0 + self._parsing = False # break cycle created by expat handlers pointing to our methods self._parser = None finally: - self._parsing = 0 + self._parsing = False if self._parser is not None: # Keep ErrorColumnNumber and ErrorLineNumber after closing. parser = _ClosedParser() @@ -307,7 +307,7 @@ def reset(self): self._parser.SetParamEntityParsing( expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE) - self._parsing = 0 + self._parsing = False self._entity_stack = [] # Locator methods diff --git a/addon/globalPlugins/readFeeds/xml/sax/handler.py b/addon/globalPlugins/readFeeds/xml/sax/handler.py index 481733d2..e8d417e5 100644 --- a/addon/globalPlugins/readFeeds/xml/sax/handler.py +++ b/addon/globalPlugins/readFeeds/xml/sax/handler.py @@ -340,3 +340,48 @@ def resolveEntity(self, publicId, systemId): property_xml_string, property_encoding, property_interning_dict] + + +class LexicalHandler: + """Optional SAX2 handler for lexical events. + + This handler is used to obtain lexical information about an XML + document, that is, information about how the document was encoded + (as opposed to what it contains, which is reported to the + ContentHandler), such as comments and CDATA marked section + boundaries. + + To set the LexicalHandler of an XMLReader, use the setProperty + method with the property identifier + 'http://xml.org/sax/properties/lexical-handler'.""" + + def comment(self, content): + """Reports a comment anywhere in the document (including the + DTD and outside the document element). + + content is a string that holds the contents of the comment.""" + + def startDTD(self, name, public_id, system_id): + """Report the start of the DTD declarations, if the document + has an associated DTD. 
+ + A startEntity event will be reported before declaration events + from the external DTD subset are reported, and this can be + used to infer from which subset DTD declarations derive. + + name is the name of the document element type, public_id the + public identifier of the DTD (or None if none were supplied) + and system_id the system identifier of the external subset (or + None if none were supplied).""" + + def endDTD(self): + """Signals the end of DTD declarations.""" + + def startCDATA(self): + """Reports the beginning of a CDATA marked section. + + The contents of the CDATA marked section will be reported + through the characters event.""" + + def endCDATA(self): + """Reports the end of a CDATA marked section.""" diff --git a/addon/globalPlugins/readFeeds/xml/sax/saxutils.py b/addon/globalPlugins/readFeeds/xml/sax/saxutils.py index a69c7f76..c1612ea1 100644 --- a/addon/globalPlugins/readFeeds/xml/sax/saxutils.py +++ b/addon/globalPlugins/readFeeds/xml/sax/saxutils.py @@ -56,8 +56,7 @@ def quoteattr(data, entities={}): the optional entities parameter. The keys and values must all be strings; each key will be replaced with its corresponding value. """ - entities = entities.copy() - entities.update({'\n': '&#10;', '\r': '&#13;', '\t':'&#9;'}) + entities = {**entities, '\n': '&#10;', '\r': '&#13;', '\t':'&#9;'} data = escape(data, entities) if '"' in data: if "'" in data: @@ -340,6 +339,8 @@ def prepare_input_source(source, base=""): """This function takes an InputSource and an optional base URL and returns a fully resolved InputSource object ready for reading.""" + if isinstance(source, os.PathLike): + source = os.fspath(source) if isinstance(source, str): source = xmlreader.InputSource(source) elif hasattr(source, "read"):