From 019143fecbfc26e69800d28d2a9e3392a051780b Mon Sep 17 00:00:00 2001 From: "Jason R. Coombs" Date: Fri, 29 Mar 2024 16:06:09 -0400 Subject: [PATCH] gh-117348: Refactored RawConfigParser._read for similicity and comprehensibility (#117372) * Extract method for _read_inner, reducing complexity and indentation by 1. * Extract method for _raise_all and yield ParseErrors from _read_inner. Reduces complexity by 1 and reduces touch points for handling errors in _read_inner. * Prefer iterators to splat expansion and literal indexing. * Extract method for _strip_comments. Reduces complexity by 7. * Model the file lines in a class to encapsulate the comment status and cleaned value. * Encapsulate the read state as a dataclass * Extract _handle_continuation_line and _handle_rest methods. Reduces complexity by 8. * Reindent * At least for now, collect errors in the ReadState * Check for missing section header separately. * Extract methods for _handle_header and _handle_option. Reduces complexity by 6. * Remove unreachable code. Reduces complexity by 4. * Remove unreachable branch * Handle error condition early. Reduces complexity by 1. * Add blurb * Move _raise_all to ParsingError, as its behavior is most closely related to the exception class and not the reader. * Split _strip* into separate methods. * Refactor _strip_full to compute the strip just once and use 'not any' to determine the factor. * Replace use of 'sys.maxsize' with direct computation of the stripped value. * Extract has_comments as a dynamic property. * Implement clean as a cached property. * Model comment prefixes in the RawConfigParser within a prefixes namespace. * Use a regular expression to search for the first match. Avoids mutating variables and tricky logic and over-computing all of the starts when only the first is relevant. --- Lib/configparser.py | 330 ++++++++++-------- ...-03-29-12-07-26.gh-issue-117348.WjCYvK.rst | 2 + 2 files changed, 185 insertions(+), 147 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-03-29-12-07-26.gh-issue-117348.WjCYvK.rst diff --git a/Lib/configparser.py b/Lib/configparser.py index 3040e1fbe5b9c1..d0326c60e9b907 100644 --- a/Lib/configparser.py +++ b/Lib/configparser.py @@ -145,12 +145,15 @@ from collections.abc import MutableMapping from collections import ChainMap as _ChainMap +import contextlib +from dataclasses import dataclass, field import functools import io import itertools import os import re import sys +from typing import Iterable __all__ = ("NoSectionError", "DuplicateOptionError", "DuplicateSectionError", "NoOptionError", "InterpolationError", "InterpolationDepthError", @@ -302,15 +305,33 @@ def __init__(self, option, section, rawval): class ParsingError(Error): """Raised when a configuration file does not follow legal syntax.""" - def __init__(self, source): + def __init__(self, source, *args): super().__init__(f'Source contains parsing errors: {source!r}') self.source = source self.errors = [] self.args = (source, ) + if args: + self.append(*args) def append(self, lineno, line): self.errors.append((lineno, line)) - self.message += '\n\t[line %2d]: %s' % (lineno, line) + self.message += '\n\t[line %2d]: %s' % (lineno, repr(line)) + + def combine(self, others): + for other in others: + for error in other.errors: + self.append(*error) + return self + + @staticmethod + def _raise_all(exceptions: Iterable['ParsingError']): + """ + Combine any number of ParsingErrors into one and raise it. + """ + exceptions = iter(exceptions) + with contextlib.suppress(StopIteration): + raise next(exceptions).combine(exceptions) + class MissingSectionHeaderError(ParsingError): @@ -517,6 +538,55 @@ def _interpolate_some(self, parser, option, accum, rest, section, map, "found: %r" % (rest,)) +@dataclass +class _ReadState: + elements_added : set[str] = field(default_factory=set) + cursect : dict[str, str] | None = None + sectname : str | None = None + optname : str | None = None + lineno : int = 0 + indent_level : int = 0 + errors : list[ParsingError] = field(default_factory=list) + + +@dataclass +class _Prefixes: + full : Iterable[str] + inline : Iterable[str] + + +class _Line(str): + + def __new__(cls, val, *args, **kwargs): + return super().__new__(cls, val) + + def __init__(self, val, prefixes: _Prefixes): + self.prefixes = prefixes + + @functools.cached_property + def clean(self): + return self._strip_full() and self._strip_inline() + + @property + def has_comments(self): + return self.strip() != self.clean + + def _strip_inline(self): + """ + Search for the earliest prefix at the beginning of the line or following a space. + """ + matcher = re.compile( + '|'.join(fr'(^|\s)({re.escape(prefix)})' for prefix in self.prefixes.inline) + # match nothing if no prefixes + or '(?!)' + ) + match = matcher.search(self) + return self[:match.start() if match else None].strip() + + def _strip_full(self): + return '' if any(map(self.strip().startswith, self.prefixes.full)) else True + + class RawConfigParser(MutableMapping): """ConfigParser that does not do interpolation.""" @@ -583,8 +653,10 @@ def __init__(self, defaults=None, dict_type=_default_dict, else: self._optcre = re.compile(self._OPT_TMPL.format(delim=d), re.VERBOSE) - self._comment_prefixes = tuple(comment_prefixes or ()) - self._inline_comment_prefixes = tuple(inline_comment_prefixes or ()) + self._prefixes = _Prefixes( + full=tuple(comment_prefixes or ()), + inline=tuple(inline_comment_prefixes or ()), + ) self._strict = strict self._allow_no_value = allow_no_value self._empty_lines_in_values = empty_lines_in_values @@ -975,147 +1047,117 @@ def _read(self, fp, fpname): in an otherwise empty line or may be entered in lines holding values or section names. Please note that comments get stripped off when reading configuration files. """ - elements_added = set() - cursect = None # None, or a dictionary - sectname = None - optname = None - lineno = 0 - indent_level = 0 - e = None # None, or an exception try: - for lineno, line in enumerate(fp, start=1): - comment_start = sys.maxsize - # strip inline comments - inline_prefixes = {p: -1 for p in self._inline_comment_prefixes} - while comment_start == sys.maxsize and inline_prefixes: - next_prefixes = {} - for prefix, index in inline_prefixes.items(): - index = line.find(prefix, index+1) - if index == -1: - continue - next_prefixes[prefix] = index - if index == 0 or (index > 0 and line[index-1].isspace()): - comment_start = min(comment_start, index) - inline_prefixes = next_prefixes - # strip full line comments - for prefix in self._comment_prefixes: - if line.strip().startswith(prefix): - comment_start = 0 - break - if comment_start == sys.maxsize: - comment_start = None - value = line[:comment_start].strip() - if not value: - if self._empty_lines_in_values: - # add empty line to the value, but only if there was no - # comment on the line - if (comment_start is None and - cursect is not None and - optname and - cursect[optname] is not None): - cursect[optname].append('') # newlines added at join - else: - # empty line marks end of value - indent_level = sys.maxsize - continue - # continuation line? - first_nonspace = self.NONSPACECRE.search(line) - cur_indent_level = first_nonspace.start() if first_nonspace else 0 - if (cursect is not None and optname and - cur_indent_level > indent_level): - if cursect[optname] is None: - raise MultilineContinuationError(fpname, lineno, line) - cursect[optname].append(value) - # a section header or option header? - else: - if self._allow_unnamed_section and cursect is None: - sectname = UNNAMED_SECTION - cursect = self._dict() - self._sections[sectname] = cursect - self._proxies[sectname] = SectionProxy(self, sectname) - elements_added.add(sectname) - - indent_level = cur_indent_level - # is it a section header? - mo = self.SECTCRE.match(value) - if mo: - sectname = mo.group('header') - if sectname in self._sections: - if self._strict and sectname in elements_added: - raise DuplicateSectionError(sectname, fpname, - lineno) - cursect = self._sections[sectname] - elements_added.add(sectname) - elif sectname == self.default_section: - cursect = self._defaults - else: - cursect = self._dict() - self._sections[sectname] = cursect - self._proxies[sectname] = SectionProxy(self, sectname) - elements_added.add(sectname) - # So sections can't start with a continuation line - optname = None - # no section header? - elif cursect is None: - raise MissingSectionHeaderError(fpname, lineno, line) - # an option line? - else: - indent_level = cur_indent_level - # is it a section header? - mo = self.SECTCRE.match(value) - if mo: - sectname = mo.group('header') - if sectname in self._sections: - if self._strict and sectname in elements_added: - raise DuplicateSectionError(sectname, fpname, - lineno) - cursect = self._sections[sectname] - elements_added.add(sectname) - elif sectname == self.default_section: - cursect = self._defaults - else: - cursect = self._dict() - self._sections[sectname] = cursect - self._proxies[sectname] = SectionProxy(self, sectname) - elements_added.add(sectname) - # So sections can't start with a continuation line - optname = None - # no section header in the file? - elif cursect is None: - raise MissingSectionHeaderError(fpname, lineno, line) - # an option line? - else: - mo = self._optcre.match(value) - if mo: - optname, vi, optval = mo.group('option', 'vi', 'value') - if not optname: - e = self._handle_error(e, fpname, lineno, line) - optname = self.optionxform(optname.rstrip()) - if (self._strict and - (sectname, optname) in elements_added): - raise DuplicateOptionError(sectname, optname, - fpname, lineno) - elements_added.add((sectname, optname)) - # This check is fine because the OPTCRE cannot - # match if it would set optval to None - if optval is not None: - optval = optval.strip() - cursect[optname] = [optval] - else: - # valueless option handling - cursect[optname] = None - else: - # a non-fatal parsing error occurred. set up the - # exception but keep going. the exception will be - # raised at the end of the file and will contain a - # list of all bogus lines - e = self._handle_error(e, fpname, lineno, line) + ParsingError._raise_all(self._read_inner(fp, fpname)) finally: self._join_multiline_values() - # if any parsing errors occurred, raise an exception - if e: - raise e + + def _read_inner(self, fp, fpname): + st = _ReadState() + + Line = functools.partial(_Line, prefixes=self._prefixes) + for st.lineno, line in enumerate(map(Line, fp), start=1): + if not line.clean: + if self._empty_lines_in_values: + # add empty line to the value, but only if there was no + # comment on the line + if (not line.has_comments and + st.cursect is not None and + st.optname and + st.cursect[st.optname] is not None): + st.cursect[st.optname].append('') # newlines added at join + else: + # empty line marks end of value + st.indent_level = sys.maxsize + continue + + first_nonspace = self.NONSPACECRE.search(line) + st.cur_indent_level = first_nonspace.start() if first_nonspace else 0 + + if self._handle_continuation_line(st, line, fpname): + continue + + self._handle_rest(st, line, fpname) + + return st.errors + + def _handle_continuation_line(self, st, line, fpname): + # continuation line? + is_continue = (st.cursect is not None and st.optname and + st.cur_indent_level > st.indent_level) + if is_continue: + if st.cursect[st.optname] is None: + raise MultilineContinuationError(fpname, st.lineno, line) + st.cursect[st.optname].append(line.clean) + return is_continue + + def _handle_rest(self, st, line, fpname): + # a section header or option header? + if self._allow_unnamed_section and st.cursect is None: + st.sectname = UNNAMED_SECTION + st.cursect = self._dict() + self._sections[st.sectname] = st.cursect + self._proxies[st.sectname] = SectionProxy(self, st.sectname) + st.elements_added.add(st.sectname) + + st.indent_level = st.cur_indent_level + # is it a section header? + mo = self.SECTCRE.match(line.clean) + + if not mo and st.cursect is None: + raise MissingSectionHeaderError(fpname, st.lineno, line) + + self._handle_header(st, mo, fpname) if mo else self._handle_option(st, line, fpname) + + def _handle_header(self, st, mo, fpname): + st.sectname = mo.group('header') + if st.sectname in self._sections: + if self._strict and st.sectname in st.elements_added: + raise DuplicateSectionError(st.sectname, fpname, + st.lineno) + st.cursect = self._sections[st.sectname] + st.elements_added.add(st.sectname) + elif st.sectname == self.default_section: + st.cursect = self._defaults + else: + st.cursect = self._dict() + self._sections[st.sectname] = st.cursect + self._proxies[st.sectname] = SectionProxy(self, st.sectname) + st.elements_added.add(st.sectname) + # So sections can't start with a continuation line + st.optname = None + + def _handle_option(self, st, line, fpname): + # an option line? + st.indent_level = st.cur_indent_level + + mo = self._optcre.match(line.clean) + if not mo: + # a non-fatal parsing error occurred. set up the + # exception but keep going. the exception will be + # raised at the end of the file and will contain a + # list of all bogus lines + st.errors.append(ParsingError(fpname, st.lineno, line)) + return + + st.optname, vi, optval = mo.group('option', 'vi', 'value') + if not st.optname: + st.errors.append(ParsingError(fpname, st.lineno, line)) + st.optname = self.optionxform(st.optname.rstrip()) + if (self._strict and + (st.sectname, st.optname) in st.elements_added): + raise DuplicateOptionError(st.sectname, st.optname, + fpname, st.lineno) + st.elements_added.add((st.sectname, st.optname)) + # This check is fine because the OPTCRE cannot + # match if it would set optval to None + if optval is not None: + optval = optval.strip() + st.cursect[st.optname] = [optval] + else: + # valueless option handling + st.cursect[st.optname] = None def _join_multiline_values(self): defaults = self.default_section, self._defaults @@ -1135,12 +1177,6 @@ def _read_defaults(self, defaults): for key, value in defaults.items(): self._defaults[self.optionxform(key)] = value - def _handle_error(self, exc, fpname, lineno, line): - if not exc: - exc = ParsingError(fpname) - exc.append(lineno, repr(line)) - return exc - def _unify_values(self, section, vars): """Create a sequence of lookups with 'vars' taking priority over the 'section' which takes priority over the DEFAULTSECT. diff --git a/Misc/NEWS.d/next/Library/2024-03-29-12-07-26.gh-issue-117348.WjCYvK.rst b/Misc/NEWS.d/next/Library/2024-03-29-12-07-26.gh-issue-117348.WjCYvK.rst new file mode 100644 index 00000000000000..cd3006c3b7b8f0 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-03-29-12-07-26.gh-issue-117348.WjCYvK.rst @@ -0,0 +1,2 @@ +Refactored :meth:`configparser.RawConfigParser._read` to reduce cyclometric +complexity and improve comprehensibility.