diff --git a/nomenklatura/enrich/common.py b/nomenklatura/enrich/common.py
index f0a58c98..399ce699 100644
--- a/nomenklatura/enrich/common.py
+++ b/nomenklatura/enrich/common.py
@@ -1,6 +1,7 @@
 import os
 import json
 import logging
+import time
 from banal import as_bool
 from typing import Union, Any, Dict, Optional, Generator
 from abc import ABC, abstractmethod
@@ -101,6 +102,7 @@ def http_post_json_cached(
         cache_key: str,
         json: Any,
         cache_days: Optional[int] = None,
+        retry: int = 3,
     ) -> Any:
         cache_days_ = self.cache_days if cache_days is None else cache_days
         resp_data = self.cache.get_json(cache_key, max_age=cache_days_)
@@ -111,6 +113,17 @@ def http_post_json_cached(
             except RequestException as rex:
                 if rex.response is not None and rex.response.status_code in (401, 403):
                     raise EnrichmentAbort("Authorization failure: %s" % url) from rex
+                if rex.response is not None and rex.response.status_code == 429:
+                    if retry > 0:
+                        log.info("Rate limit exceeded. Sleeping for 60s.")
+                        time.sleep(61)
+                        return self.http_post_json_cached(
+                            url, cache_key, json, cache_days, retry - 1
+                        )
+                    else:
+                        raise EnrichmentAbort(
+                            "Rate limit exceeded and out of retries: %s" % url
+                        ) from rex
                 msg = "HTTP POST failed [%s]: %s" % (url, rex)
                 raise EnrichmentException(msg) from rex
             resp_data = resp.json()
diff --git a/nomenklatura/enrich/openfigi.py b/nomenklatura/enrich/openfigi.py
new file mode 100644
index 00000000..df792a1d
--- /dev/null
+++ b/nomenklatura/enrich/openfigi.py
@@ -0,0 +1,96 @@
+import os
+import logging
+from typing import Any, Generator, Dict, List
+from urllib.parse import urljoin
+from followthemoney.util import make_entity_id
+from normality import slugify
+
+from nomenklatura.entity import CE
+from nomenklatura.dataset import DS
+from nomenklatura.cache import Cache
+from nomenklatura.enrich.common import Enricher, EnricherConfig
+
+log = logging.getLogger(__name__)
+
+URL = "https://api.openfigi.com/v3/search"
+
+
+class OpenFIGIEnricher(Enricher):
+    """Uses the `OpenFIGI` search API to look up FIGIs by company name."""
+
+    def __init__(self, dataset: DS, cache: Cache, config: EnricherConfig):
+        super().__init__(dataset, cache, config)
+
+        # The API key is optional; the header is only set when it is present.
+        api_key = os.environ.get("OPENFIGI_API_KEY")
+        if api_key is not None:
+            self.session.headers["X-OPENFIGI-APIKEY"] = api_key
+
+    def make_company_id(self, name: str) -> str:
+        """Derive a company entity ID from the name reported by the API."""
+        return f"figi-co-{make_entity_id(name)}"
+
+    def make_security_id(self, figi: str) -> str:
+        """Derive a security entity ID from its FIGI."""
+        return f"figi-id-{slugify(figi, sep='-')}"
+
+    def search(self, query: str) -> Generator[Dict[str, str], None, None]:
+        """Yield every search result for `query`, following `next` pages."""
+        body = {"query": query}
+        offset = None
+
+        while True:
+            if offset is not None:
+                body["start"] = offset
+
+            log.info(f"Searching {query}. Offset={offset}")
+            cache_key = f"{URL}:{query}:{offset}"
+            resp = self.http_post_json_cached(URL, cache_key, body)
+            if "data" in resp:
+                yield from resp["data"]
+
+            offset = resp.get("next", None)
+            if offset is None:
+                break
+
+    def match(self, entity: CE) -> Generator[CE, None, None]:
+        """Yield a Company candidate for each named result of a name search."""
+        for name in entity.get("name"):
+            for match in self.search(name):
+                match_name = match.get("name", None)
+                if match_name is None:
+                    continue
+                other = self.make_entity(entity, "Company")
+                other.id = self.make_company_id(match_name)
+                other.add("name", match_name)
+                yield other
+
+    def expand(self, entity: CE, match: CE) -> Generator[CE, None, None]:
+        """Yield a Security for each search result named exactly like `match`."""
+        names = match.get("name")
+        if not names:
+            return
+        name = names[0]
+        for item in self.search(name):
+            # Only emit the securities which match the name of the positive match
+            # to the company exactly. Skip everything else.
+            if item.get("name") != name:
+                continue
+
+            # A result without a FIGI cannot be given an ID; skip it.
+            figi = item.get("figi")
+            if figi is None:
+                continue
+
+            security = self.make_entity(match, "Security")
+            security.id = self.make_security_id(figi)
+            security.add("name", figi)
+            security.add("issuer", match)
+            security.add("ticker", item.get("ticker"))
+            security.add("type", item.get("securityType"))
+            exch_code = item.get("exchCode")
+            if exch_code is not None:
+                security.add("notes", f"exchange {exch_code}")
+            security.add("description", item.get("securityDescription"))
+
+            yield security
diff --git a/tests/enrich/test_openfigi.py b/tests/enrich/test_openfigi.py
new file mode 100644
index 00000000..7a1e72ad
--- /dev/null
+++ b/tests/enrich/test_openfigi.py
@@ -0,0 +1,73 @@
+import requests_mock
+from nomenklatura.cache import Cache
+from nomenklatura.dataset import Dataset
+from nomenklatura.enrich import get_enricher
+from nomenklatura.enrich.common import Enricher
+from nomenklatura.entity import CompositeEntity
+
+
+PATH = "nomenklatura.enrich.openfigi:OpenFIGIEnricher"
+RESPONSE = {
+    "data": [
+        {
+            "figi": "BBG0005S7P81",
+            "securityType": "EURO-DOLLAR",
+            "marketSector": "Govt",
+            "ticker": "BKRUSS F 12/31/01",
+            "name": "CENTRAL BANK OF RUSSIA",
+            "exchCode": "NOT LISTED",
+            "shareClassFIGI": None,
+            "compositeFIGI": None,
+            "securityType2": None,
+            "securityDescription": "BKRUSS Float 12/31/01",
+        },
+        {
+            "figi": "BBG002T3FYF0",
+            "securityType": "Index",
+            "marketSector": "Index",
+            "ticker": "RCRAMAR",
+            "name": "Bank of Russia Russia Central",
+            "exchCode": None,
+            "shareClassFIGI": None,
+            "compositeFIGI": None,
+            "securityType2": None,
+            "securityDescription": "Bank of Russia Russia Central",
+        },
+    ]
+}
+
+
+dataset = Dataset.make({"name": "ext_open_figi", "title": "OpenFIGI"})
+
+
+def load_enricher():
+    """Instantiate the OpenFIGI enricher with a default cache and empty config."""
+    enricher_cls = get_enricher(PATH)
+    assert issubclass(enricher_cls, Enricher)
+    cache = Cache.make_default(dataset)
+    return enricher_cls(dataset, cache, {})
+
+
+def test_openfigi_match():
+    """Match and expand a company against a mocked OpenFIGI search response."""
+    enricher = load_enricher()
+    with requests_mock.Mocker() as m:
+        m.post("/v3/search", json=RESPONSE)
+
+        data = {
+            "schema": "Company",
+            "id": "xxx",
+            "properties": {"name": ["Bank of Russia"]},
+        }
+        ent = CompositeEntity.from_data(dataset, data)
+        m_results = list(enricher.match(ent))
+        assert len(m_results) == 2, m_results
+        m1 = m_results[0]
+        m2 = m_results[1]
+        assert m1.get("name") == ["CENTRAL BANK OF RUSSIA"], m1
+        assert m2.get("name") == ["Bank of Russia Russia Central"], m2
+
+        e_results = list(enricher.expand(ent, m_results[0]))
+        assert len(e_results) == 1, e_results
+        assert e_results[0].get("ticker") == ["BKRUSS F 12/31/01"], e_results
+        assert e_results[0].get("issuer") == [m_results[0].id], e_results