diff --git a/nomenklatura/enrich/common.py b/nomenklatura/enrich/common.py index 0458de8..bb511e1 100644 --- a/nomenklatura/enrich/common.py +++ b/nomenklatura/enrich/common.py @@ -7,7 +7,7 @@ from typing import Union, Any, Dict, Optional, Generator, Generic from abc import ABC, abstractmethod from requests import Session -from requests.exceptions import RequestException +from requests.exceptions import RequestException, ChunkedEncodingError from followthemoney.types import registry from followthemoney.types.topic import TopicType from rigour.urls import build_url, ParamsType @@ -82,9 +82,11 @@ def session(self) -> Session: if self._session is None: self._session = Session() self._session.headers["User-Agent"] = f"nomenklatura/{__version__}" + # 4 * 2 ** 4 = 64s + # openfigi or something needed 60 seconds to recover retries = Retry( - total=4, - backoff_factor=1, + total=5, + backoff_factor=4, allowed_methods=["GET", "POST"], ) self._session.mount("https://", HTTPAdapter(max_retries=retries)) @@ -139,7 +141,7 @@ def http_post_json_cached( data: Any = None, headers: HeadersType = None, cache_days: Optional[int] = None, - retry: int = 3, + retry_chunked: int = 1, ) -> Any: cache_days_ = self.cache_days if cache_days is None else cache_days resp_data = self.cache.get_json(cache_key, max_age=cache_days_) @@ -147,24 +149,30 @@ def http_post_json_cached( try: resp = self.session.post(url, json=json, data=data, headers=headers) resp.raise_for_status() + except ChunkedEncodingError as rex: + # With urllib3 Retry enabled we should only see this exception + # directly (not inside MaxRetryError) due to + # https://github.com/urllib3/urllib3/issues/2751#issuecomment-2567630065 + # Since urllib won't retry it, retry it here. + # urllib does close the connection. 
+ if "Response ended prematurely" in str(rex) and retry_chunked > 0: + log.info("Retrying due to chunked encoding error: %s", rex) + return self.http_post_json_cached( + url, + cache_key, + json=json, + data=data, + headers=headers, + cache_days=cache_days, + retry_chunked=retry_chunked - 1, + ) + + msg = "HTTP POST failed [%s]: %s" % (url, rex) + raise EnrichmentException(msg) from rex except RequestException as rex: if rex.response is not None and rex.response.status_code in (401, 403): raise EnrichmentAbort("Authorization failure: %s" % url) from rex - if rex.response is not None and rex.response.status_code == 429: - if retry > 0: - log.info("Rate limit exceeded. Sleeping for 60s.") - time.sleep(61) - return self.http_post_json_cached( - url, - cache_key, - json=json, - cache_days=cache_days, - retry=retry - 1, - ) - else: - raise EnrichmentAbort( - "Rate limit exceeded and out of retries: %s" % url - ) from rex + msg = "HTTP POST failed [%s]: %s" % (url, rex) log.info(f"{msg}\n{traceback.format_exc()}") raise EnrichmentException(msg) from rex diff --git a/nomenklatura/enrich/permid.py b/nomenklatura/enrich/permid.py index 327a22d..c4d9753 100644 --- a/nomenklatura/enrich/permid.py +++ b/nomenklatura/enrich/permid.py @@ -10,6 +10,7 @@ from typing import cast, Set, Generator, Optional, Dict, Any from urllib.parse import urljoin from followthemoney.types import registry +from requests.exceptions import ChunkedEncodingError from nomenklatura.entity import CE from nomenklatura.dataset import DS @@ -179,7 +180,6 @@ def match(self, entity: CE) -> Generator[CE, None, None]: cache_key, data=query, headers=headers, - retry=0, cache_days=self.cache_days, ) seen_matches: Set[str] = set() diff --git a/nomenklatura/enrich/yente.py b/nomenklatura/enrich/yente.py index 7c28aaf..a207fa2 100644 --- a/nomenklatura/enrich/yente.py +++ b/nomenklatura/enrich/yente.py @@ -67,7 +67,7 @@ def match(self, entity: CE) -> Generator[CE, None, None]: } for retry in range(4): try: - response 
= self.http_post_json_cached(url, cache_key, query, retry=0) + response = self.http_post_json_cached(url, cache_key, query) inner_resp = response.get("responses", {}).get("entity", {}) for result in inner_resp.get("results", []): proxy = self.load_entity(entity, result)