diff --git a/CHANGES.rst b/CHANGES.rst index 6cef031159d..e34684c91ac 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -14,6 +14,9 @@ Deprecated Features added -------------- +* #13036: linkcheck: refactor result handling to use structural pattern + matching (Python 3.10+) and ``StrEnum`` status codes (Python 3.11+). + Patch by James Addison. Bugs fixed ---------- diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index 0ee0addd7b8..51f346f4c48 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -7,6 +7,7 @@ import re import socket import time +from enum import StrEnum from html.parser import HTMLParser from os import path from queue import PriorityQueue, Queue @@ -51,6 +52,17 @@ DEFAULT_DELAY = 60.0 +class LinkStatus(StrEnum): + BROKEN = 'broken' + IGNORED = 'ignored' + TIMEOUT = 'timeout' + RATE_LIMITED = 'rate-limited' + REDIRECTED = 'redirected' + UNCHECKED = 'unchecked' + UNKNOWN = 'unknown' + WORKING = 'working' + + class CheckExternalLinksBuilder(DummyBuilder): """ Checks for broken external links. 
@@ -95,103 +107,100 @@ def process_result(self, result: CheckResult) -> None: } self.write_linkstat(linkstat) - if result.status == 'unchecked': - return - if result.status == 'working' and result.message == 'old': + if result.status == LinkStatus.UNCHECKED: return + if result.lineno: logger.info('(%16s: line %4d) ', result.docname, result.lineno, nonl=True) - if result.status == 'ignored': - if result.message: - logger.info(darkgray('-ignored- ') + result.uri + ': ' + result.message) - else: - logger.info(darkgray('-ignored- ') + result.uri) - elif result.status == 'local': - logger.info(darkgray('-local- ') + result.uri) - self.write_entry( - 'local', result.docname, filename, result.lineno, result.uri - ) - elif result.status == 'working': - logger.info(darkgreen('ok ') + result.uri + result.message) - elif result.status == 'timeout': - if self.app.quiet: - logger.warning( - 'timeout ' + result.uri + result.message, - location=(result.docname, result.lineno), - ) - else: - logger.info( - red('timeout ') + result.uri + red(' - ' + result.message) - ) - self.write_entry( - 'timeout', - result.docname, - filename, - result.lineno, - result.uri + ': ' + result.message, - ) - self.timed_out_hyperlinks += 1 - elif result.status == 'broken': - if self.app.quiet: - logger.warning( - __('broken link: %s (%s)'), - result.uri, - result.message, - location=(result.docname, result.lineno), - ) - else: - logger.info( - red('broken ') + result.uri + red(' - ' + result.message) + + match result.status: + case LinkStatus.IGNORED: + if result.message: + msg = darkgray('-ignored- ') + result.uri + ': ' + result.message + else: + msg = darkgray('-ignored- ') + result.uri + logger.info(msg) + case LinkStatus.WORKING: + msg = darkgreen('ok ') + result.uri + result.message + logger.info(msg) + case LinkStatus.TIMEOUT: + if self.app.quiet: + msg = 'timeout ' + result.uri + result.message + logger.warning(msg, location=(result.docname, result.lineno)) + else: + msg = red('timeout ') + 
result.uri + red(' - ' + result.message) + logger.info(msg) + self.write_entry( + result.status, + result.docname, + str(filename), + result.lineno, + result.uri + ': ' + result.message, ) - self.write_entry( - 'broken', - result.docname, - filename, - result.lineno, - result.uri + ': ' + result.message, - ) - self.broken_hyperlinks += 1 - elif result.status == 'redirected': - try: - text, color = { - 301: ('permanently', purple), - 302: ('with Found', purple), - 303: ('with See Other', purple), - 307: ('temporarily', turquoise), - 308: ('permanently', purple), - }[result.code] - except KeyError: - text, color = ('with unknown code', purple) - linkstat['text'] = text - if self.config.linkcheck_allowed_redirects: - logger.warning( - 'redirect ' + result.uri + ' - ' + text + ' to ' + result.message, - location=(result.docname, result.lineno), + self.timed_out_hyperlinks += 1 + case LinkStatus.BROKEN: + if self.app.quiet: + msg = __('broken link: %s (%s)') + location = (result.docname, result.lineno) + logger.warning(msg, result.uri, result.message, location=location) + else: + msg = red('broken ') + result.uri + red(f' - {result.message}') + logger.info(msg) + self.write_entry( + result.status, + result.docname, + str(filename), + result.lineno, + result.uri + ': ' + result.message, ) - else: - logger.info( - color('redirect ') - + result.uri - + color(' - ' + text + ' to ' + result.message) + self.broken_hyperlinks += 1 + case LinkStatus.REDIRECTED: + try: + text, color = { + 301: ('permanently', purple), + 302: ('with Found', purple), + 303: ('with See Other', purple), + 307: ('temporarily', turquoise), + 308: ('permanently', purple), + }[result.code] + except KeyError: + text, color = ('with unknown code', purple) + if self.config.linkcheck_allowed_redirects: + msg = f'redirect {result.uri} - {text} to {result.message}' + location = (result.docname, result.lineno) + logger.warning(msg, location=location) + else: + msg = ( + 
color('redirect ') + + result.uri + + color(f' - {text} to {result.message}') + ) + logger.info(msg) + self.write_entry( + result.status, + result.docname, + str(filename), + result.lineno, + result.uri + ' to ' + result.message, + context=' ' + text, ) - self.write_entry( - 'redirected ' + text, - result.docname, - filename, - result.lineno, - result.uri + ' to ' + result.message, - ) - else: - raise ValueError('Unknown status %s.' % result.status) + case _: + raise ValueError('Unknown status %s.' % result.status) def write_linkstat(self, data: dict[str, str | int]) -> None: self.json_outfile.write(json.dumps(data)) self.json_outfile.write('\n') def write_entry( - self, what: str, docname: str, filename: _StrPath, line: int, uri: str + self, + what: LinkStatus, + docname: str, + filename: str, + line: int, + uri: str, + context: str = '', ) -> None: - self.txt_outfile.write(f'{filename}:{line}: [{what}] {uri}\n') + self.txt_outfile.write(f'{filename}:{line}: [{what}{context}] {uri}\n') class HyperlinkCollector(SphinxPostTransform): @@ -291,7 +300,12 @@ def check(self, hyperlinks: dict[str, Hyperlink]) -> Iterator[CheckResult]: for hyperlink in hyperlinks.values(): if self.is_ignored_uri(hyperlink.uri): yield CheckResult( - hyperlink.uri, hyperlink.docname, hyperlink.lineno, 'ignored', '', 0 + hyperlink.uri, + hyperlink.docname, + hyperlink.lineno, + LinkStatus.IGNORED, + '', + 0, ) else: self.wqueue.put(CheckRequest(CHECK_IMMEDIATELY, hyperlink), False) @@ -330,7 +344,7 @@ class CheckResult(NamedTuple): uri: str docname: str lineno: int - status: str + status: LinkStatus message: str code: int @@ -374,9 +388,9 @@ def __init__( self.rate_limit_timeout = config.linkcheck_rate_limit_timeout self._allow_unauthorized = config.linkcheck_allow_unauthorized if config.linkcheck_report_timeouts_as_broken: - self._timeout_status = 'broken' + self._timeout_status = LinkStatus.BROKEN else: - self._timeout_status = 'timeout' + self._timeout_status = 
LinkStatus.TIMEOUT self.user_agent = config.user_agent self.tls_verify = config.tls_verify @@ -413,7 +427,7 @@ def run(self) -> None: self.wqueue.task_done() continue status, info, code = self._check(docname, uri, hyperlink) - if status == 'rate-limited': + if status == LinkStatus.RATE_LIMITED: logger.info( darkgray('-rate limited- ') + uri + darkgray(' | sleeping...') ) @@ -423,7 +437,7 @@ def run(self) -> None: def _check( self, docname: str, uri: str, hyperlink: Hyperlink - ) -> tuple[str, str, int]: + ) -> tuple[LinkStatus, str, int]: # check for various conditions without bothering the network for doc_matcher in self.documents_exclude: @@ -432,25 +446,25 @@ def _check( f'{docname} matched {doc_matcher.pattern} from ' 'linkcheck_exclude_documents' ) - return 'ignored', info, 0 + return LinkStatus.IGNORED, info, 0 if len(uri) == 0 or uri.startswith(('#', 'mailto:', 'tel:')): - return 'unchecked', '', 0 + return LinkStatus.UNCHECKED, '', 0 if not uri.startswith(('http:', 'https:')): if uri_re.match(uri): # Non-supported URI schemes (ex. 
ftp) - return 'unchecked', '', 0 + return LinkStatus.UNCHECKED, '', 0 src_dir = path.dirname(hyperlink.docpath) if path.exists(path.join(src_dir, uri)): - return 'working', '', 0 - return 'broken', '', 0 + return LinkStatus.WORKING, '', 0 + return LinkStatus.BROKEN, '', 0 # need to actually check the URI - status, info, code = '', '', 0 + status, info, code = LinkStatus.UNKNOWN, '', 0 for _ in range(self.retries): status, info, code = self._check_uri(uri, hyperlink) - if status != 'broken': + if status != LinkStatus.BROKEN: break return status, info, code @@ -464,7 +478,7 @@ def _retrieval_methods( yield self._session.head, {'allow_redirects': True} yield self._session.get, {'stream': True} - def _check_uri(self, uri: str, hyperlink: Hyperlink) -> tuple[str, str, int]: + def _check_uri(self, uri: str, hyperlink: Hyperlink) -> tuple[LinkStatus, str, int]: req_url, delimiter, anchor = uri.partition('#') if delimiter and anchor: for rex in self.anchors_ignore: @@ -519,10 +533,14 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> tuple[str, str, int]: try: found = contains_anchor(response, anchor) except UnicodeDecodeError: - return 'ignored', 'unable to decode response content', 0 + return ( + LinkStatus.IGNORED, + 'unable to decode response content', + 0, + ) if not found: return ( - 'broken', + LinkStatus.BROKEN, __("Anchor '%s' not found") % quote(anchor), 0, ) @@ -531,7 +549,7 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> tuple[str, str, int]: status_code = response.status_code redirect_status_code = ( response.history[-1].status_code if response.history else None - ) # NoQA: E501 + ) retry_after = response.headers.get('Retry-After', '') response_url = f'{response.url}' response.raise_for_status() @@ -543,7 +561,7 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> tuple[str, str, int]: except SSLError as err: # SSL failure; report that the link is broken. 
- return 'broken', str(err), 0 + return LinkStatus.BROKEN, str(err), 0 except (ConnectionError, TooManyRedirects) as err: # Servers drop the connection on HEAD requests, causing @@ -557,18 +575,18 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> tuple[str, str, int]: # Unauthorized: the client did not provide required credentials if status_code == 401: status = 'working' if self._allow_unauthorized else 'broken' - return status, 'unauthorized', 0 + return LinkStatus(status), 'unauthorized', 0 # Rate limiting; back-off if allowed, or report failure otherwise if status_code == 429: if next_check := self.limit_rate(response_url, retry_after): self.wqueue.put(CheckRequest(next_check, hyperlink), False) - return 'rate-limited', '', 0 - return 'broken', error_message, 0 + return LinkStatus.RATE_LIMITED, '', 0 + return LinkStatus.BROKEN, error_message, 0 # Don't claim success/failure during server-side outages if status_code == 503: - return 'ignored', 'service unavailable', 0 + return LinkStatus.IGNORED, 'service unavailable', 0 # For most HTTP failures, continue attempting alternate retrieval methods continue @@ -576,26 +594,25 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> tuple[str, str, int]: except Exception as err: # Unhandled exception (intermittent or permanent); report that # the link is broken. - return 'broken', str(err), 0 + return LinkStatus.BROKEN, str(err), 0 else: # All available retrieval methods have been exhausted; report # that the link is broken. 
- return 'broken', error_message, 0 + return LinkStatus.BROKEN, error_message, 0 # Success; clear rate limits for the origin netloc = urlsplit(req_url).netloc self.rate_limits.pop(netloc, None) - if ( - (response_url.rstrip('/') == req_url.rstrip('/')) - or _allowed_redirect(req_url, response_url, self.allowed_redirects) - ): # fmt: skip - return 'working', '', 0 + if (response_url.rstrip('/') == req_url.rstrip('/')) or _allowed_redirect( + req_url, response_url, self.allowed_redirects + ): + return LinkStatus.WORKING, '', 0 elif redirect_status_code is not None: - return 'redirected', response_url, redirect_status_code + return LinkStatus.REDIRECTED, response_url, redirect_status_code else: - return 'redirected', response_url, 0 + return LinkStatus.REDIRECTED, response_url, 0 def limit_rate(self, response_url: str, retry_after: str | None) -> float | None: delay = DEFAULT_DELAY