Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use a StrEnum to record linkcheck status codes #13043

Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 68 additions & 55 deletions sphinx/builders/linkcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import re
import socket
import time
from enum import StrEnum
from html.parser import HTMLParser
from os import path
from queue import PriorityQueue, Queue
Expand Down Expand Up @@ -38,19 +39,20 @@
from sphinx.util._pathlib import _StrPath
from sphinx.util.typing import ExtensionMetadata

_Status: TypeAlias = Literal[
'broken',
'ignored',
'local',
jayaddison marked this conversation as resolved.
Show resolved Hide resolved
'rate-limited',
'redirected',
'timeout',
'unchecked',
'working',
]
_StatusUnknown: TypeAlias = _Status | Literal['']
_URIProperties: TypeAlias = tuple[_Status, str, int]
_URIPropertiesUnknown: TypeAlias = tuple[_StatusUnknown, str, int]
_URIProperties: TypeAlias = tuple['_Status', str, int]


class _Status(StrEnum):
BROKEN = 'broken'
IGNORED = 'ignored'
LOCAL = 'local'
RATE_LIMITED = 'rate-limited'
REDIRECTED = 'redirected'
TIMEOUT = 'timeout'
UNCHECKED = 'unchecked'
UNKNOWN = 'unknown'
WORKING = 'working'


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -109,25 +111,29 @@ def process_result(self, result: CheckResult) -> None:
}
self.write_linkstat(linkstat)

if result.status == 'unchecked':
if result.status == _Status.UNCHECKED:
return
if result.status == 'working' and result.message == 'old':
if result.status == _Status.WORKING and result.message == 'old':
return
if result.lineno:
logger.info('(%16s: line %4d) ', result.docname, result.lineno, nonl=True)
if result.status == 'ignored':
if result.status == _Status.IGNORED:
if result.message:
logger.info(darkgray('-ignored- ') + result.uri + ': ' + result.message)
else:
logger.info(darkgray('-ignored- ') + result.uri)
elif result.status == 'local':
elif result.status == _Status.LOCAL:
logger.info(darkgray('-local- ') + result.uri)
self.write_entry(
'local', result.docname, filename, result.lineno, result.uri
_Status.LOCAL,
result.docname,
filename,
result.lineno,
result.uri,
)
elif result.status == 'working':
elif result.status == _Status.WORKING:
logger.info(darkgreen('ok ') + result.uri + result.message)
elif result.status == 'timeout':
elif result.status == _Status.TIMEOUT:
if self.app.quiet:
logger.warning(
'timeout ' + result.uri + result.message,
Expand All @@ -138,14 +144,14 @@ def process_result(self, result: CheckResult) -> None:
red('timeout ') + result.uri + red(' - ' + result.message)
)
self.write_entry(
'timeout',
_Status.TIMEOUT,
result.docname,
filename,
result.lineno,
result.uri + ': ' + result.message,
)
self.timed_out_hyperlinks += 1
elif result.status == 'broken':
elif result.status == _Status.BROKEN:
if self.app.quiet:
logger.warning(
__('broken link: %s (%s)'),
Expand All @@ -158,14 +164,14 @@ def process_result(self, result: CheckResult) -> None:
red('broken ') + result.uri + red(' - ' + result.message)
)
self.write_entry(
'broken',
_Status.BROKEN,
result.docname,
filename,
result.lineno,
result.uri + ': ' + result.message,
)
self.broken_hyperlinks += 1
elif result.status == 'redirected':
elif result.status == _Status.REDIRECTED:
try:
text, color = {
301: ('permanently', purple),
Expand Down Expand Up @@ -199,7 +205,7 @@ def process_result(self, result: CheckResult) -> None:
msg = f'Unknown status {result.status!r}.'
raise ValueError(msg)

def write_linkstat(self, data: dict[str, str | int]) -> None:
def write_linkstat(self, data: dict[str, str | int | _Status]) -> None:
self.json_outfile.write(json.dumps(data))
self.json_outfile.write('\n')

Expand Down Expand Up @@ -306,7 +312,12 @@ def check(self, hyperlinks: dict[str, Hyperlink]) -> Iterator[CheckResult]:
for hyperlink in hyperlinks.values():
if self.is_ignored_uri(hyperlink.uri):
yield CheckResult(
hyperlink.uri, hyperlink.docname, hyperlink.lineno, 'ignored', '', 0
uri=hyperlink.uri,
docname=hyperlink.docname,
lineno=hyperlink.lineno,
status=_Status.IGNORED,
message='',
code=0,
)
else:
self.wqueue.put(CheckRequest(CHECK_IMMEDIATELY, hyperlink), False)
Expand Down Expand Up @@ -345,7 +356,7 @@ class CheckResult(NamedTuple):
uri: str
docname: str
lineno: int
status: _StatusUnknown
status: _Status
message: str
code: int

Expand Down Expand Up @@ -388,11 +399,11 @@ def __init__(
self.retries: int = config.linkcheck_retries
self.rate_limit_timeout = config.linkcheck_rate_limit_timeout
self._allow_unauthorized = config.linkcheck_allow_unauthorized
self._timeout_status: Literal['broken', 'timeout']
self._timeout_status: Literal[_Status.BROKEN, _Status.TIMEOUT]
if config.linkcheck_report_timeouts_as_broken:
self._timeout_status = 'broken'
self._timeout_status = _Status.BROKEN
else:
self._timeout_status = 'timeout'
self._timeout_status = _Status.TIMEOUT

self.user_agent = config.user_agent
self.tls_verify = config.tls_verify
Expand Down Expand Up @@ -429,17 +440,15 @@ def run(self) -> None:
self.wqueue.task_done()
continue
status, info, code = self._check(docname, uri, hyperlink)
if status == 'rate-limited':
if status == _Status.RATE_LIMITED:
logger.info(
darkgray('-rate limited- ') + uri + darkgray(' | sleeping...')
)
else:
self.rqueue.put(CheckResult(uri, docname, lineno, status, info, code))
self.wqueue.task_done()

def _check(
self, docname: str, uri: str, hyperlink: Hyperlink
) -> _URIPropertiesUnknown:
def _check(self, docname: str, uri: str, hyperlink: Hyperlink) -> _URIProperties:
# check for various conditions without bothering the network

for doc_matcher in self.documents_exclude:
Expand All @@ -448,26 +457,26 @@ def _check(
f'{docname} matched {doc_matcher.pattern} from '
'linkcheck_exclude_documents'
)
return 'ignored', info, 0
return _Status.IGNORED, info, 0

if len(uri) == 0 or uri.startswith(('#', 'mailto:', 'tel:')):
return 'unchecked', '', 0
return _Status.UNCHECKED, '', 0
if not uri.startswith(('http:', 'https:')):
if uri_re.match(uri):
# Non-supported URI schemes (ex. ftp)
return 'unchecked', '', 0
return _Status.UNCHECKED, '', 0

src_dir = path.dirname(hyperlink.docpath)
if path.exists(path.join(src_dir, uri)):
return 'working', '', 0
return 'broken', '', 0
return _Status.WORKING, '', 0
return _Status.BROKEN, '', 0

# need to actually check the URI
status: _StatusUnknown
status, info, code = '', '', 0
status: _Status
status, info, code = _Status.UNKNOWN, '', 0
for _ in range(self.retries):
status, info, code = self._check_uri(uri, hyperlink)
if status != 'broken':
if status != _Status.BROKEN:
break

return status, info, code
Expand Down Expand Up @@ -536,10 +545,14 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:
try:
found = contains_anchor(response, anchor)
except UnicodeDecodeError:
return 'ignored', 'unable to decode response content', 0
return (
_Status.IGNORED,
'unable to decode response content',
0,
)
if not found:
return (
'broken',
_Status.BROKEN,
__("Anchor '%s' not found") % quote(anchor),
0,
)
Expand All @@ -560,7 +573,7 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:

except SSLError as err:
# SSL failure; report that the link is broken.
return 'broken', str(err), 0
return _Status.BROKEN, str(err), 0

except (ConnectionError, TooManyRedirects) as err:
# Servers drop the connection on HEAD requests, causing
Expand All @@ -574,33 +587,33 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:
# Unauthorized: the client did not provide required credentials
if status_code == 401:
if self._allow_unauthorized:
return 'working', 'unauthorized', 0
return _Status.WORKING, 'unauthorized', 0
else:
return 'broken', 'unauthorized', 0
return _Status.BROKEN, 'unauthorized', 0

# Rate limiting; back-off if allowed, or report failure otherwise
if status_code == 429:
if next_check := self.limit_rate(response_url, retry_after):
self.wqueue.put(CheckRequest(next_check, hyperlink), False)
return 'rate-limited', '', 0
return 'broken', error_message, 0
return _Status.RATE_LIMITED, '', 0
return _Status.BROKEN, error_message, 0

# Don't claim success/failure during server-side outages
if status_code == 503:
return 'ignored', 'service unavailable', 0
return _Status.IGNORED, 'service unavailable', 0

# For most HTTP failures, continue attempting alternate retrieval methods
continue

except Exception as err:
# Unhandled exception (intermittent or permanent); report that
# the link is broken.
return 'broken', str(err), 0
return _Status.BROKEN, str(err), 0

else:
# All available retrieval methods have been exhausted; report
# that the link is broken.
return 'broken', error_message, 0
return _Status.BROKEN, error_message, 0

# Success; clear rate limits for the origin
netloc = urlsplit(req_url).netloc
Expand All @@ -610,11 +623,11 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:
(response_url.rstrip('/') == req_url.rstrip('/'))
or _allowed_redirect(req_url, response_url, self.allowed_redirects)
): # fmt: skip
return 'working', '', 0
return _Status.WORKING, '', 0
elif redirect_status_code is not None:
return 'redirected', response_url, redirect_status_code
return _Status.REDIRECTED, response_url, redirect_status_code
else:
return 'redirected', response_url, 0
return _Status.REDIRECTED, response_url, 0

def limit_rate(self, response_url: str, retry_after: str | None) -> float | None:
delay = DEFAULT_DELAY
Expand Down
Loading