From 88f560c76100699e1310f6942f3b17164df811f8 Mon Sep 17 00:00:00 2001 From: James Addison <55152140+jayaddison@users.noreply.github.com> Date: Sun, 20 Oct 2024 17:56:22 +0000 Subject: [PATCH] Use a ``StrEnum`` to record linkcheck status codes (#13043) Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com> --- sphinx/builders/linkcheck.py | 123 +++++++++++++++++++---------------- 1 file changed, 68 insertions(+), 55 deletions(-) diff --git a/sphinx/builders/linkcheck.py b/sphinx/builders/linkcheck.py index fdc3915cce4..c74fa12a98a 100644 --- a/sphinx/builders/linkcheck.py +++ b/sphinx/builders/linkcheck.py @@ -7,6 +7,7 @@ import re import socket import time +from enum import StrEnum from html.parser import HTMLParser from os import path from queue import PriorityQueue, Queue @@ -38,19 +39,20 @@ from sphinx.util._pathlib import _StrPath from sphinx.util.typing import ExtensionMetadata - _Status: TypeAlias = Literal[ - 'broken', - 'ignored', - 'local', - 'rate-limited', - 'redirected', - 'timeout', - 'unchecked', - 'working', - ] - _StatusUnknown: TypeAlias = _Status | Literal[''] - _URIProperties: TypeAlias = tuple[_Status, str, int] - _URIPropertiesUnknown: TypeAlias = tuple[_StatusUnknown, str, int] + _URIProperties: TypeAlias = tuple['_Status', str, int] + + +class _Status(StrEnum): + BROKEN = 'broken' + IGNORED = 'ignored' + LOCAL = 'local' + RATE_LIMITED = 'rate-limited' + REDIRECTED = 'redirected' + TIMEOUT = 'timeout' + UNCHECKED = 'unchecked' + UNKNOWN = 'unknown' + WORKING = 'working' + logger = logging.getLogger(__name__) @@ -109,25 +111,29 @@ def process_result(self, result: CheckResult) -> None: } self.write_linkstat(linkstat) - if result.status == 'unchecked': + if result.status == _Status.UNCHECKED: return - if result.status == 'working' and result.message == 'old': + if result.status == _Status.WORKING and result.message == 'old': return if result.lineno: logger.info('(%16s: line %4d) ', result.docname, result.lineno, nonl=True) - if result.status == 'ignored': + if result.status == _Status.IGNORED: if result.message: logger.info(darkgray('-ignored- ') + result.uri + ': ' + result.message) else: logger.info(darkgray('-ignored- ') + result.uri) - elif result.status == 'local': + elif result.status == _Status.LOCAL: logger.info(darkgray('-local- ') + result.uri) self.write_entry( - 'local', result.docname, filename, result.lineno, result.uri + _Status.LOCAL, + result.docname, + filename, + result.lineno, + result.uri, ) - elif result.status == 'working': + elif result.status == _Status.WORKING: logger.info(darkgreen('ok ') + result.uri + result.message) - elif result.status == 'timeout': + elif result.status == _Status.TIMEOUT: if self.app.quiet: logger.warning( 'timeout ' + result.uri + result.message, @@ -138,14 +144,14 @@ def process_result(self, result: CheckResult) -> None: red('timeout ') + result.uri + red(' - ' + result.message) ) self.write_entry( - 'timeout', + _Status.TIMEOUT, result.docname, filename, result.lineno, result.uri + ': ' + result.message, ) self.timed_out_hyperlinks += 1 - elif result.status == 'broken': + elif result.status == _Status.BROKEN: if self.app.quiet: logger.warning( __('broken link: %s (%s)'), @@ -158,14 +164,14 @@ def process_result(self, result: CheckResult) -> None: red('broken ') + result.uri + red(' - ' + result.message) ) self.write_entry( - 'broken', + _Status.BROKEN, result.docname, filename, result.lineno, result.uri + ': ' + result.message, ) self.broken_hyperlinks += 1 - elif result.status == 'redirected': + elif result.status == _Status.REDIRECTED: try: text, color = { 301: ('permanently', purple), @@ -199,7 +205,7 @@ def process_result(self, result: CheckResult) -> None: msg = f'Unknown status {result.status!r}.' raise ValueError(msg) - def write_linkstat(self, data: dict[str, str | int]) -> None: + def write_linkstat(self, data: dict[str, str | int | _Status]) -> None: self.json_outfile.write(json.dumps(data)) self.json_outfile.write('\n') @@ -306,7 +312,12 @@ def check(self, hyperlinks: dict[str, Hyperlink]) -> Iterator[CheckResult]: for hyperlink in hyperlinks.values(): if self.is_ignored_uri(hyperlink.uri): yield CheckResult( - hyperlink.uri, hyperlink.docname, hyperlink.lineno, 'ignored', '', 0 + uri=hyperlink.uri, + docname=hyperlink.docname, + lineno=hyperlink.lineno, + status=_Status.IGNORED, + message='', + code=0, ) else: self.wqueue.put(CheckRequest(CHECK_IMMEDIATELY, hyperlink), False) @@ -345,7 +356,7 @@ class CheckResult(NamedTuple): uri: str docname: str lineno: int - status: _StatusUnknown + status: _Status message: str code: int @@ -388,11 +399,11 @@ def __init__( self.retries: int = config.linkcheck_retries self.rate_limit_timeout = config.linkcheck_rate_limit_timeout self._allow_unauthorized = config.linkcheck_allow_unauthorized - self._timeout_status: Literal['broken', 'timeout'] + self._timeout_status: Literal[_Status.BROKEN, _Status.TIMEOUT] if config.linkcheck_report_timeouts_as_broken: - self._timeout_status = 'broken' + self._timeout_status = _Status.BROKEN else: - self._timeout_status = 'timeout' + self._timeout_status = _Status.TIMEOUT self.user_agent = config.user_agent self.tls_verify = config.tls_verify @@ -429,7 +440,7 @@ def run(self) -> None: self.wqueue.task_done() continue status, info, code = self._check(docname, uri, hyperlink) - if status == 'rate-limited': + if status == _Status.RATE_LIMITED: logger.info( darkgray('-rate limited- ') + uri + darkgray(' | sleeping...') ) @@ -437,9 +448,7 @@ def run(self) -> None: self.rqueue.put(CheckResult(uri, docname, lineno, status, info, code)) self.wqueue.task_done() - def _check( - self, docname: str, uri: str, hyperlink: Hyperlink - ) -> _URIPropertiesUnknown: + def _check(self, docname: str, uri: str, hyperlink: Hyperlink) -> _URIProperties: # check for various conditions without bothering the network for doc_matcher in self.documents_exclude: @@ -448,26 +457,26 @@ def _check( f'{docname} matched {doc_matcher.pattern} from ' 'linkcheck_exclude_documents' ) - return 'ignored', info, 0 + return _Status.IGNORED, info, 0 if len(uri) == 0 or uri.startswith(('#', 'mailto:', 'tel:')): - return 'unchecked', '', 0 + return _Status.UNCHECKED, '', 0 if not uri.startswith(('http:', 'https:')): if uri_re.match(uri): # Non-supported URI schemes (ex. ftp) - return 'unchecked', '', 0 + return _Status.UNCHECKED, '', 0 src_dir = path.dirname(hyperlink.docpath) if path.exists(path.join(src_dir, uri)): - return 'working', '', 0 - return 'broken', '', 0 + return _Status.WORKING, '', 0 + return _Status.BROKEN, '', 0 # need to actually check the URI - status: _StatusUnknown - status, info, code = '', '', 0 + status: _Status + status, info, code = _Status.UNKNOWN, '', 0 for _ in range(self.retries): status, info, code = self._check_uri(uri, hyperlink) - if status != 'broken': + if status != _Status.BROKEN: break return status, info, code @@ -536,10 +545,14 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties: try: found = contains_anchor(response, anchor) except UnicodeDecodeError: - return 'ignored', 'unable to decode response content', 0 + return ( + _Status.IGNORED, + 'unable to decode response content', + 0, + ) if not found: return ( - 'broken', + _Status.BROKEN, __("Anchor '%s' not found") % quote(anchor), 0, ) @@ -560,7 +573,7 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties: except SSLError as err: # SSL failure; report that the link is broken. - return 'broken', str(err), 0 + return _Status.BROKEN, str(err), 0 except (ConnectionError, TooManyRedirects) as err: # Servers drop the connection on HEAD requests, causing @@ -574,20 +587,20 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties: # Unauthorized: the client did not provide required credentials if status_code == 401: if self._allow_unauthorized: - return 'working', 'unauthorized', 0 + return _Status.WORKING, 'unauthorized', 0 else: - return 'broken', 'unauthorized', 0 + return _Status.BROKEN, 'unauthorized', 0 # Rate limiting; back-off if allowed, or report failure otherwise if status_code == 429: if next_check := self.limit_rate(response_url, retry_after): self.wqueue.put(CheckRequest(next_check, hyperlink), False) - return 'rate-limited', '', 0 - return 'broken', error_message, 0 + return _Status.RATE_LIMITED, '', 0 + return _Status.BROKEN, error_message, 0 # Don't claim success/failure during server-side outages if status_code == 503: - return 'ignored', 'service unavailable', 0 + return _Status.IGNORED, 'service unavailable', 0 # For most HTTP failures, continue attempting alternate retrieval methods continue @@ -595,12 +608,12 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties: except Exception as err: # Unhandled exception (intermittent or permanent); report that # the link is broken. - return 'broken', str(err), 0 + return _Status.BROKEN, str(err), 0 else: # All available retrieval methods have been exhausted; report # that the link is broken. - return 'broken', error_message, 0 + return _Status.BROKEN, error_message, 0 # Success; clear rate limits for the origin netloc = urlsplit(req_url).netloc @@ -610,11 +623,11 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties: (response_url.rstrip('/') == req_url.rstrip('/')) or _allowed_redirect(req_url, response_url, self.allowed_redirects) ): # fmt: skip - return 'working', '', 0 + return _Status.WORKING, '', 0 elif redirect_status_code is not None: - return 'redirected', response_url, redirect_status_code + return _Status.REDIRECTED, response_url, redirect_status_code else: - return 'redirected', response_url, 0 + return _Status.REDIRECTED, response_url, 0 def limit_rate(self, response_url: str, retry_after: str | None) -> float | None: delay = DEFAULT_DELAY