Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use a StrEnum to record linkcheck status codes #13043

Merged
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
104 changes: 63 additions & 41 deletions sphinx/builders/linkcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import re
import socket
import time
from enum import StrEnum
from html.parser import HTMLParser
from os import path
from queue import PriorityQueue, Queue
Expand All @@ -27,6 +28,18 @@
from sphinx.util.http_date import rfc1123_to_epoch
from sphinx.util.nodes import get_node_line


class LinkStatus(StrEnum):
jayaddison marked this conversation as resolved.
Show resolved Hide resolved
BROKEN = 'broken'
IGNORED = 'ignored'
TIMEOUT = 'timeout'
RATE_LIMITED = 'rate-limited'
REDIRECTED = 'redirected'
UNCHECKED = 'unchecked'
UNKNOWN = 'unknown'
WORKING = 'working'


if TYPE_CHECKING:
from collections.abc import Callable, Iterator
from typing import Any, Literal, TypeAlias
Expand All @@ -39,14 +52,14 @@
from sphinx.util.typing import ExtensionMetadata

_Status: TypeAlias = Literal[
'broken',
'ignored',
'local',
jayaddison marked this conversation as resolved.
Show resolved Hide resolved
'rate-limited',
'redirected',
'timeout',
'unchecked',
'working',
LinkStatus.BROKEN,
LinkStatus.IGNORED,
LinkStatus.TIMEOUT,
LinkStatus.RATE_LIMITED,
LinkStatus.REDIRECTED,
LinkStatus.UNCHECKED,
LinkStatus.UNKNOWN,
LinkStatus.WORKING,
]
_StatusUnknown: TypeAlias = _Status | Literal['']
_URIProperties: TypeAlias = tuple[_Status, str, int]
Expand Down Expand Up @@ -109,13 +122,13 @@ def process_result(self, result: CheckResult) -> None:
}
self.write_linkstat(linkstat)

if result.status == 'unchecked':
if result.status == LinkStatus.UNCHECKED:
return
if result.status == 'working' and result.message == 'old':
if result.status == LinkStatus.WORKING and result.message == 'old':
return
if result.lineno:
logger.info('(%16s: line %4d) ', result.docname, result.lineno, nonl=True)
if result.status == 'ignored':
if result.status == LinkStatus.IGNORED:
if result.message:
logger.info(darkgray('-ignored- ') + result.uri + ': ' + result.message)
else:
Expand All @@ -125,9 +138,9 @@ def process_result(self, result: CheckResult) -> None:
self.write_entry(
'local', result.docname, filename, result.lineno, result.uri
)
elif result.status == 'working':
elif result.status == LinkStatus.WORKING:
logger.info(darkgreen('ok ') + result.uri + result.message)
elif result.status == 'timeout':
elif result.status == LinkStatus.TIMEOUT:
if self.app.quiet:
logger.warning(
'timeout ' + result.uri + result.message,
Expand All @@ -138,14 +151,14 @@ def process_result(self, result: CheckResult) -> None:
red('timeout ') + result.uri + red(' - ' + result.message)
)
self.write_entry(
'timeout',
LinkStatus.TIMEOUT,
result.docname,
filename,
result.lineno,
result.uri + ': ' + result.message,
)
self.timed_out_hyperlinks += 1
elif result.status == 'broken':
elif result.status == LinkStatus.BROKEN:
if self.app.quiet:
logger.warning(
__('broken link: %s (%s)'),
Expand All @@ -158,14 +171,14 @@ def process_result(self, result: CheckResult) -> None:
red('broken ') + result.uri + red(' - ' + result.message)
)
self.write_entry(
'broken',
result.status,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
result.status,
LinkStatus.BROKEN.value,

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using a specific symbol/status-code per write_entry call makes sense to me; I'm less sure about the .value access though. What's the thinking there?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

'broken' is a literal, the closest semantic replacement is the LinkStatus.BROKEN.value constant, rather than the result.status variable.

result.docname,
filename,
result.lineno,
result.uri + ': ' + result.message,
)
self.broken_hyperlinks += 1
elif result.status == 'redirected':
elif result.status == LinkStatus.REDIRECTED:
try:
text, color = {
301: ('permanently', purple),
Expand Down Expand Up @@ -306,7 +319,12 @@ def check(self, hyperlinks: dict[str, Hyperlink]) -> Iterator[CheckResult]:
for hyperlink in hyperlinks.values():
if self.is_ignored_uri(hyperlink.uri):
yield CheckResult(
hyperlink.uri, hyperlink.docname, hyperlink.lineno, 'ignored', '', 0
uri=hyperlink.uri,
docname=hyperlink.docname,
lineno=hyperlink.lineno,
status=LinkStatus.IGNORED,
message='',
code=0,
)
else:
self.wqueue.put(CheckRequest(CHECK_IMMEDIATELY, hyperlink), False)
Expand Down Expand Up @@ -388,11 +406,11 @@ def __init__(
self.retries: int = config.linkcheck_retries
self.rate_limit_timeout = config.linkcheck_rate_limit_timeout
self._allow_unauthorized = config.linkcheck_allow_unauthorized
self._timeout_status: Literal['broken', 'timeout']
self._timeout_status: Literal[LinkStatus.BROKEN, LinkStatus.TIMEOUT]
if config.linkcheck_report_timeouts_as_broken:
self._timeout_status = 'broken'
self._timeout_status = LinkStatus.BROKEN
else:
self._timeout_status = 'timeout'
self._timeout_status = LinkStatus.TIMEOUT

self.user_agent = config.user_agent
self.tls_verify = config.tls_verify
Expand Down Expand Up @@ -429,7 +447,7 @@ def run(self) -> None:
self.wqueue.task_done()
continue
status, info, code = self._check(docname, uri, hyperlink)
if status == 'rate-limited':
if status == LinkStatus.RATE_LIMITED:
logger.info(
darkgray('-rate limited- ') + uri + darkgray(' | sleeping...')
)
Expand All @@ -448,26 +466,26 @@ def _check(
f'{docname} matched {doc_matcher.pattern} from '
'linkcheck_exclude_documents'
)
return 'ignored', info, 0
return LinkStatus.IGNORED, info, 0

if len(uri) == 0 or uri.startswith(('#', 'mailto:', 'tel:')):
return 'unchecked', '', 0
return LinkStatus.UNCHECKED, '', 0
if not uri.startswith(('http:', 'https:')):
if uri_re.match(uri):
# Non-supported URI schemes (ex. ftp)
return 'unchecked', '', 0
return LinkStatus.UNCHECKED, '', 0

src_dir = path.dirname(hyperlink.docpath)
if path.exists(path.join(src_dir, uri)):
return 'working', '', 0
return 'broken', '', 0
return LinkStatus.WORKING, '', 0
return LinkStatus.BROKEN, '', 0

# need to actually check the URI
status: _StatusUnknown
status, info, code = '', '', 0
for _ in range(self.retries):
status, info, code = self._check_uri(uri, hyperlink)
if status != 'broken':
if status != LinkStatus.BROKEN:
break

return status, info, code
Expand Down Expand Up @@ -536,10 +554,14 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:
try:
found = contains_anchor(response, anchor)
except UnicodeDecodeError:
return 'ignored', 'unable to decode response content', 0
return (
LinkStatus.IGNORED,
'unable to decode response content',
0,
)
if not found:
return (
'broken',
LinkStatus.BROKEN,
__("Anchor '%s' not found") % quote(anchor),
0,
)
Expand All @@ -560,7 +582,7 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:

except SSLError as err:
# SSL failure; report that the link is broken.
return 'broken', str(err), 0
return LinkStatus.BROKEN, str(err), 0

except (ConnectionError, TooManyRedirects) as err:
# Servers drop the connection on HEAD requests, causing
Expand All @@ -574,33 +596,33 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:
# Unauthorized: the client did not provide required credentials
if status_code == 401:
if self._allow_unauthorized:
return 'working', 'unauthorized', 0
return LinkStatus.WORKING, 'unauthorized', 0
else:
return 'broken', 'unauthorized', 0
return LinkStatus.BROKEN, 'unauthorized', 0

# Rate limiting; back-off if allowed, or report failure otherwise
if status_code == 429:
if next_check := self.limit_rate(response_url, retry_after):
self.wqueue.put(CheckRequest(next_check, hyperlink), False)
return 'rate-limited', '', 0
return 'broken', error_message, 0
return LinkStatus.RATE_LIMITED, '', 0
return LinkStatus.BROKEN, error_message, 0

# Don't claim success/failure during server-side outages
if status_code == 503:
return 'ignored', 'service unavailable', 0
return LinkStatus.IGNORED, 'service unavailable', 0

# For most HTTP failures, continue attempting alternate retrieval methods
continue

except Exception as err:
# Unhandled exception (intermittent or permanent); report that
# the link is broken.
return 'broken', str(err), 0
return LinkStatus.BROKEN, str(err), 0

else:
# All available retrieval methods have been exhausted; report
# that the link is broken.
return 'broken', error_message, 0
return LinkStatus.BROKEN, error_message, 0

# Success; clear rate limits for the origin
netloc = urlsplit(req_url).netloc
Expand All @@ -610,11 +632,11 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:
(response_url.rstrip('/') == req_url.rstrip('/'))
or _allowed_redirect(req_url, response_url, self.allowed_redirects)
): # fmt: skip
return 'working', '', 0
return LinkStatus.WORKING, '', 0
elif redirect_status_code is not None:
return 'redirected', response_url, redirect_status_code
return LinkStatus.REDIRECTED, response_url, redirect_status_code
else:
return 'redirected', response_url, 0
return LinkStatus.REDIRECTED, response_url, 0

def limit_rate(self, response_url: str, retry_after: str | None) -> float | None:
delay = DEFAULT_DELAY
Expand Down
Loading