Skip to content

Commit

Permalink
Use a StrEnum to record linkcheck status codes (#13043)
Browse files Browse the repository at this point in the history
Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com>
  • Loading branch information
jayaddison and AA-Turner authored Oct 20, 2024
1 parent 285908a commit 88f560c
Showing 1 changed file with 68 additions and 55 deletions.
123 changes: 68 additions & 55 deletions sphinx/builders/linkcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import re
import socket
import time
from enum import StrEnum
from html.parser import HTMLParser
from os import path
from queue import PriorityQueue, Queue
Expand Down Expand Up @@ -38,19 +39,20 @@
from sphinx.util._pathlib import _StrPath
from sphinx.util.typing import ExtensionMetadata

_Status: TypeAlias = Literal[
'broken',
'ignored',
'local',
'rate-limited',
'redirected',
'timeout',
'unchecked',
'working',
]
_StatusUnknown: TypeAlias = _Status | Literal['']
_URIProperties: TypeAlias = tuple[_Status, str, int]
_URIPropertiesUnknown: TypeAlias = tuple[_StatusUnknown, str, int]
_URIProperties: TypeAlias = tuple['_Status', str, int]


class _Status(StrEnum):
    """Status recorded for a hyperlink by the linkcheck builder.

    Each member is also a ``str`` equal to its value (``StrEnum``), so
    members compare equal to the plain status words.
    """

    # Reported for failed checks, e.g. an SSL error, a missing anchor, or
    # all retrieval methods being exhausted.
    BROKEN = 'broken'
    # The link was deliberately not judged, e.g. an ignored URI, an excluded
    # document, or a 503 response.
    IGNORED = 'ignored'
    # Logged as '-local-' and written to the output by ``process_result``.
    LOCAL = 'local'
    # A 429 response whose check was re-queued; the worker logs it and does
    # not emit a result.
    RATE_LIMITED = 'rate-limited'
    # The response URL differs from the requested URL and is not an allowed
    # redirect.
    REDIRECTED = 'redirected'
    # Selected as ``_timeout_status`` unless
    # ``linkcheck_report_timeouts_as_broken`` is set.
    TIMEOUT = 'timeout'
    # No check was attempted: empty URIs, '#', 'mailto:', 'tel:' and
    # unsupported URI schemes.
    UNCHECKED = 'unchecked'
    # Initial placeholder in ``_check`` before any request has been made.
    UNKNOWN = 'unknown'
    # The link is considered reachable, e.g. an existing local path or a
    # successful request.
    WORKING = 'working'


logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -109,25 +111,29 @@ def process_result(self, result: CheckResult) -> None:
}
self.write_linkstat(linkstat)

if result.status == 'unchecked':
if result.status == _Status.UNCHECKED:
return
if result.status == 'working' and result.message == 'old':
if result.status == _Status.WORKING and result.message == 'old':
return
if result.lineno:
logger.info('(%16s: line %4d) ', result.docname, result.lineno, nonl=True)
if result.status == 'ignored':
if result.status == _Status.IGNORED:
if result.message:
logger.info(darkgray('-ignored- ') + result.uri + ': ' + result.message)
else:
logger.info(darkgray('-ignored- ') + result.uri)
elif result.status == 'local':
elif result.status == _Status.LOCAL:
logger.info(darkgray('-local- ') + result.uri)
self.write_entry(
'local', result.docname, filename, result.lineno, result.uri
_Status.LOCAL,
result.docname,
filename,
result.lineno,
result.uri,
)
elif result.status == 'working':
elif result.status == _Status.WORKING:
logger.info(darkgreen('ok ') + result.uri + result.message)
elif result.status == 'timeout':
elif result.status == _Status.TIMEOUT:
if self.app.quiet:
logger.warning(
'timeout ' + result.uri + result.message,
Expand All @@ -138,14 +144,14 @@ def process_result(self, result: CheckResult) -> None:
red('timeout ') + result.uri + red(' - ' + result.message)
)
self.write_entry(
'timeout',
_Status.TIMEOUT,
result.docname,
filename,
result.lineno,
result.uri + ': ' + result.message,
)
self.timed_out_hyperlinks += 1
elif result.status == 'broken':
elif result.status == _Status.BROKEN:
if self.app.quiet:
logger.warning(
__('broken link: %s (%s)'),
Expand All @@ -158,14 +164,14 @@ def process_result(self, result: CheckResult) -> None:
red('broken ') + result.uri + red(' - ' + result.message)
)
self.write_entry(
'broken',
_Status.BROKEN,
result.docname,
filename,
result.lineno,
result.uri + ': ' + result.message,
)
self.broken_hyperlinks += 1
elif result.status == 'redirected':
elif result.status == _Status.REDIRECTED:
try:
text, color = {
301: ('permanently', purple),
Expand Down Expand Up @@ -199,7 +205,7 @@ def process_result(self, result: CheckResult) -> None:
msg = f'Unknown status {result.status!r}.'
raise ValueError(msg)

def write_linkstat(self, data: dict[str, str | int]) -> None:
def write_linkstat(self, data: dict[str, str | int | _Status]) -> None:
    """Write *data* to the JSON output file as a single line."""
    # One JSON document followed by a newline per call.
    self.json_outfile.write(json.dumps(data))
    self.json_outfile.write('\n')

Expand Down Expand Up @@ -306,7 +312,12 @@ def check(self, hyperlinks: dict[str, Hyperlink]) -> Iterator[CheckResult]:
for hyperlink in hyperlinks.values():
if self.is_ignored_uri(hyperlink.uri):
yield CheckResult(
hyperlink.uri, hyperlink.docname, hyperlink.lineno, 'ignored', '', 0
uri=hyperlink.uri,
docname=hyperlink.docname,
lineno=hyperlink.lineno,
status=_Status.IGNORED,
message='',
code=0,
)
else:
self.wqueue.put(CheckRequest(CHECK_IMMEDIATELY, hyperlink), False)
Expand Down Expand Up @@ -345,7 +356,7 @@ class CheckResult(NamedTuple):
uri: str
docname: str
lineno: int
status: _StatusUnknown
status: _Status
message: str
code: int

Expand Down Expand Up @@ -388,11 +399,11 @@ def __init__(
self.retries: int = config.linkcheck_retries
self.rate_limit_timeout = config.linkcheck_rate_limit_timeout
self._allow_unauthorized = config.linkcheck_allow_unauthorized
self._timeout_status: Literal['broken', 'timeout']
self._timeout_status: Literal[_Status.BROKEN, _Status.TIMEOUT]
if config.linkcheck_report_timeouts_as_broken:
self._timeout_status = 'broken'
self._timeout_status = _Status.BROKEN
else:
self._timeout_status = 'timeout'
self._timeout_status = _Status.TIMEOUT

self.user_agent = config.user_agent
self.tls_verify = config.tls_verify
Expand Down Expand Up @@ -429,17 +440,15 @@ def run(self) -> None:
self.wqueue.task_done()
continue
status, info, code = self._check(docname, uri, hyperlink)
if status == 'rate-limited':
if status == _Status.RATE_LIMITED:
logger.info(
darkgray('-rate limited- ') + uri + darkgray(' | sleeping...')
)
else:
self.rqueue.put(CheckResult(uri, docname, lineno, status, info, code))
self.wqueue.task_done()

def _check(
self, docname: str, uri: str, hyperlink: Hyperlink
) -> _URIPropertiesUnknown:
def _check(self, docname: str, uri: str, hyperlink: Hyperlink) -> _URIProperties:
# check for various conditions without bothering the network

for doc_matcher in self.documents_exclude:
Expand All @@ -448,26 +457,26 @@ def _check(
f'{docname} matched {doc_matcher.pattern} from '
'linkcheck_exclude_documents'
)
return 'ignored', info, 0
return _Status.IGNORED, info, 0

if len(uri) == 0 or uri.startswith(('#', 'mailto:', 'tel:')):
return 'unchecked', '', 0
return _Status.UNCHECKED, '', 0
if not uri.startswith(('http:', 'https:')):
if uri_re.match(uri):
# Non-supported URI schemes (ex. ftp)
return 'unchecked', '', 0
return _Status.UNCHECKED, '', 0

src_dir = path.dirname(hyperlink.docpath)
if path.exists(path.join(src_dir, uri)):
return 'working', '', 0
return 'broken', '', 0
return _Status.WORKING, '', 0
return _Status.BROKEN, '', 0

# need to actually check the URI
status: _StatusUnknown
status, info, code = '', '', 0
status: _Status
status, info, code = _Status.UNKNOWN, '', 0
for _ in range(self.retries):
status, info, code = self._check_uri(uri, hyperlink)
if status != 'broken':
if status != _Status.BROKEN:
break

return status, info, code
Expand Down Expand Up @@ -536,10 +545,14 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:
try:
found = contains_anchor(response, anchor)
except UnicodeDecodeError:
return 'ignored', 'unable to decode response content', 0
return (
_Status.IGNORED,
'unable to decode response content',
0,
)
if not found:
return (
'broken',
_Status.BROKEN,
__("Anchor '%s' not found") % quote(anchor),
0,
)
Expand All @@ -560,7 +573,7 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:

except SSLError as err:
# SSL failure; report that the link is broken.
return 'broken', str(err), 0
return _Status.BROKEN, str(err), 0

except (ConnectionError, TooManyRedirects) as err:
# Servers drop the connection on HEAD requests, causing
Expand All @@ -574,33 +587,33 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:
# Unauthorized: the client did not provide required credentials
if status_code == 401:
if self._allow_unauthorized:
return 'working', 'unauthorized', 0
return _Status.WORKING, 'unauthorized', 0
else:
return 'broken', 'unauthorized', 0
return _Status.BROKEN, 'unauthorized', 0

# Rate limiting; back-off if allowed, or report failure otherwise
if status_code == 429:
if next_check := self.limit_rate(response_url, retry_after):
self.wqueue.put(CheckRequest(next_check, hyperlink), False)
return 'rate-limited', '', 0
return 'broken', error_message, 0
return _Status.RATE_LIMITED, '', 0
return _Status.BROKEN, error_message, 0

# Don't claim success/failure during server-side outages
if status_code == 503:
return 'ignored', 'service unavailable', 0
return _Status.IGNORED, 'service unavailable', 0

# For most HTTP failures, continue attempting alternate retrieval methods
continue

except Exception as err:
# Unhandled exception (intermittent or permanent); report that
# the link is broken.
return 'broken', str(err), 0
return _Status.BROKEN, str(err), 0

else:
# All available retrieval methods have been exhausted; report
# that the link is broken.
return 'broken', error_message, 0
return _Status.BROKEN, error_message, 0

# Success; clear rate limits for the origin
netloc = urlsplit(req_url).netloc
Expand All @@ -610,11 +623,11 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:
(response_url.rstrip('/') == req_url.rstrip('/'))
or _allowed_redirect(req_url, response_url, self.allowed_redirects)
): # fmt: skip
return 'working', '', 0
return _Status.WORKING, '', 0
elif redirect_status_code is not None:
return 'redirected', response_url, redirect_status_code
return _Status.REDIRECTED, response_url, redirect_status_code
else:
return 'redirected', response_url, 0
return _Status.REDIRECTED, response_url, 0

def limit_rate(self, response_url: str, retry_after: str | None) -> float | None:
delay = DEFAULT_DELAY
Expand Down

0 comments on commit 88f560c

Please sign in to comment.