Skip to content

Commit

Permalink
Retry unretried errors from permid
Browse files · Browse the repository at this point in the history
Removing superfluous retry code - normally retries should be
handled by urllib3 Retry.
Branch information:
jbothma committed Jan 8, 2025
1 parent 1ed01bc commit 8be039f
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 21 deletions.
46 changes: 27 additions & 19 deletions nomenklatura/enrich/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from typing import Union, Any, Dict, Optional, Generator, Generic
from abc import ABC, abstractmethod
from requests import Session
from requests.exceptions import RequestException
from requests.exceptions import RequestException, ChunkedEncodingError
from followthemoney.types import registry
from followthemoney.types.topic import TopicType
from rigour.urls import build_url, ParamsType
Expand Down Expand Up @@ -82,9 +82,11 @@ def session(self) -> Session:
if self._session is None:
self._session = Session()
self._session.headers["User-Agent"] = f"nomenklatura/{__version__}"
# 4 * 2 ** 4 = 64s
# openfigi or something needed 60 seconds to recover
retries = Retry(
total=4,
backoff_factor=1,
total=5,
backoff_factor=4,
allowed_methods=["GET", "POST"],
)
self._session.mount("https://", HTTPAdapter(max_retries=retries))
Expand Down Expand Up @@ -139,32 +141,38 @@ def http_post_json_cached(
data: Any = None,
headers: HeadersType = None,
cache_days: Optional[int] = None,
retry: int = 3,
retry_chunked: int = 1,
) -> Any:
cache_days_ = self.cache_days if cache_days is None else cache_days
resp_data = self.cache.get_json(cache_key, max_age=cache_days_)
if resp_data is None:
try:
resp = self.session.post(url, json=json, data=data, headers=headers)
resp.raise_for_status()
except ChunkedEncodingError as rex:
# With urllib3 Retry enabled we should only see this exception
# directly (not inside MaxRetryError) due to
# https://github.com/urllib3/urllib3/issues/2751#issuecomment-2567630065
# Since urllib won't retry it, retry it here.
# urllib does close the connection.
if "Response ended prematurely" in str(rex) and retry_chunked > 0:
log.info("Retrying due to chunked encoding error: %s", rex)
return self.http_post_json_cached(
url,
cache_key,
json=json,
data=data,
headers=headers,
cache_days=cache_days,
retry_chunked=retry_chunked - 1,
)

msg = "HTTP POST failed [%s]: %s" % (url, rex)
raise EnrichmentException(rex) from rex
except RequestException as rex:
if rex.response is not None and rex.response.status_code in (401, 403):
raise EnrichmentAbort("Authorization failure: %s" % url) from rex
if rex.response is not None and rex.response.status_code == 429:
if retry > 0:
log.info("Rate limit exceeded. Sleeping for 60s.")
time.sleep(61)
return self.http_post_json_cached(
url,
cache_key,
json=json,
cache_days=cache_days,
retry=retry - 1,
)
else:
raise EnrichmentAbort(
"Rate limit exceeded and out of retries: %s" % url
) from rex

msg = "HTTP POST failed [%s]: %s" % (url, rex)
log.info(f"{msg}\n{traceback.format_exc()}")
raise EnrichmentException(msg) from rex
Expand Down
2 changes: 1 addition & 1 deletion nomenklatura/enrich/permid.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from typing import cast, Set, Generator, Optional, Dict, Any
from urllib.parse import urljoin
from followthemoney.types import registry
from requests.exceptions import ChunkedEncodingError

from nomenklatura.entity import CE
from nomenklatura.dataset import DS
Expand Down Expand Up @@ -179,7 +180,6 @@ def match(self, entity: CE) -> Generator[CE, None, None]:
cache_key,
data=query,
headers=headers,
retry=0,
cache_days=self.cache_days,
)
seen_matches: Set[str] = set()
Expand Down
2 changes: 1 addition & 1 deletion nomenklatura/enrich/yente.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def match(self, entity: CE) -> Generator[CE, None, None]:
}
for retry in range(4):
try:
response = self.http_post_json_cached(url, cache_key, query, retry=0)
response = self.http_post_json_cached(url, cache_key, query)
inner_resp = response.get("responses", {}).get("entity", {})
for result in inner_resp.get("results", []):
proxy = self.load_entity(entity, result)
Expand Down

0 comments on commit 8be039f

Please sign in to comment.