From 104067ec854459799d3120c86f39e92a0e500b23 Mon Sep 17 00:00:00 2001 From: JD Bothma Date: Tue, 14 Nov 2023 14:03:32 +0200 Subject: [PATCH 1/5] Basic matching for openfigi --- nomenklatura/enrich/common.py | 5 +++ nomenklatura/enrich/openfigi.py | 54 +++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 nomenklatura/enrich/openfigi.py diff --git a/nomenklatura/enrich/common.py b/nomenklatura/enrich/common.py index f0a58c98..4dc7a896 100644 --- a/nomenklatura/enrich/common.py +++ b/nomenklatura/enrich/common.py @@ -1,6 +1,7 @@ import os import json import logging +import time from banal import as_bool from typing import Union, Any, Dict, Optional, Generator from abc import ABC, abstractmethod @@ -111,6 +112,10 @@ def http_post_json_cached( except RequestException as rex: if rex.response is not None and rex.response.status_code in (401, 403): raise EnrichmentAbort("Authorization failure: %s" % url) from rex + if rex.response is not None and rex.response.status_code == 429: + log.info("Rate limit exceeded. Sleeping for 60s.") + time.sleep(61) + return self.http_post_json_cached(url, cache_key, json, cache_days) msg = "HTTP POST failed [%s]: %s" % (url, rex) raise EnrichmentException(msg) from rex resp_data = resp.json() diff --git a/nomenklatura/enrich/openfigi.py b/nomenklatura/enrich/openfigi.py new file mode 100644 index 00000000..c37d49b3 --- /dev/null +++ b/nomenklatura/enrich/openfigi.py @@ -0,0 +1,54 @@ +import os +import logging +from typing import Any, Generator, Dict, List +from urllib.parse import urljoin + +from nomenklatura.entity import CE +from nomenklatura.dataset import DS +from nomenklatura.cache import Cache +from nomenklatura.enrich.common import Enricher, EnricherConfig + +log = logging.getLogger(__name__) +URL = "https://api.openfigi.com/v3/search" + + +class OpenFIGIEnricher(Enricher): + """Uses the `OpenFIGI` search API to look up FIGIs by company name.""" + + def __init__(self, dataset: DS, cache: Cache, config: EnricherConfig): + super().__init__(dataset, cache, config) + + api_key = os.environ.get("OPENFIGI_API_KEY") + self.session.headers["X-OPENFIGI-APIKEY"] = api_key + + def make_entity_id(self, name): + return make_entity_id("name", name, ) + + + def match(self, entity: CE) -> Generator[CE, None, None]: + for name in entity.get("name"): + body = {"query": name} + next = None + + while True: + if next is not None: + body["start"] = next + + log.info(f"Searching {name}. Offset={next}") + cache_key = f"{URL}:{name}:{next}" + resp = self.http_post_json_cached(URL, cache_key, body) + + for match in resp.get("data"): + print(name, "->", match["name"]) + other = self.make_entity(entity, "Company") + other.id = self.make_entity_id(match["name"]) + other.add("name", match["name"]) + yield other + + next = resp.get("next", None) + if next is None: + break + + + def expand(self, entity: CE, match: CE) -> Generator[CE, None, None]: + pass From 80858829294dd2d8b11e5fc1e2babd39fcdb6269 Mon Sep 17 00:00:00 2001 From: JD Bothma Date: Tue, 14 Nov 2023 17:27:43 +0200 Subject: [PATCH 2/5] Emit matching securities --- nomenklatura/enrich/openfigi.py | 71 ++++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 24 deletions(-) diff --git a/nomenklatura/enrich/openfigi.py b/nomenklatura/enrich/openfigi.py index c37d49b3..5426a0c6 100644 --- a/nomenklatura/enrich/openfigi.py +++ b/nomenklatura/enrich/openfigi.py @@ -2,6 +2,8 @@ import logging from typing import Any, Generator, Dict, List from urllib.parse import urljoin +from followthemoney.util import make_entity_id +from normality import slugify from nomenklatura.entity import CE from nomenklatura.dataset import DS @@ -9,6 +11,7 @@ from nomenklatura.enrich.common import Enricher, EnricherConfig log = logging.getLogger(__name__) + URL = "https://api.openfigi.com/v3/search" @@ -17,38 +20,58 @@ class OpenFIGIEnricher(Enricher): def __init__(self, dataset: DS, cache: Cache, config: EnricherConfig): super().__init__(dataset, cache, config) - + api_key = os.environ.get("OPENFIGI_API_KEY") self.session.headers["X-OPENFIGI-APIKEY"] = api_key - def make_entity_id(self, name): - return make_entity_id("name", name, ) + def make_company_id(self, name): + return f"figi-co-{make_entity_id(name)}" + def make_security_id(self, figi): + return f"figi-id-{slugify(figi, sep='-')}" - def match(self, entity: CE) -> Generator[CE, None, None]: - for name in entity.get("name"): - body = {"query": name} - next = None - - while True: - if next is not None: - body["start"] = next + def search(self, query): + body = {"query": query} + next = None - log.info(f"Searching {name}. Offset={next}") - cache_key = f"{URL}:{name}:{next}" - resp = self.http_post_json_cached(URL, cache_key, body) + while True: + if next is not None: + body["start"] = next - for match in resp.get("data"): - print(name, "->", match["name"]) - other = self.make_entity(entity, "Company") - other.id = self.make_entity_id(match["name"]) - other.add("name", match["name"]) - yield other + log.info(f"Searching {query}. Offset={next}") + cache_key = f"{URL}:{query}:{next}" + resp = self.http_post_json_cached(URL, cache_key, body) + yield from resp["data"] - next = resp.get("next", None) - if next is None: - break + next = resp.get("next", None) + if next is None: + break + def match(self, entity: CE) -> Generator[CE, None, None]: + for name in entity.get("name"): + for match in self.search(name): + other = self.make_entity(entity, "Company") + name = match.get("name", None) + if name is None: + continue + other.id = self.make_company_id(name) + other.add("name", name) + yield other def expand(self, entity: CE, match: CE) -> Generator[CE, None, None]: - pass + yield match + + name = match.get("name")[0] + for item in self.search(name): + if item["name"] != name: + continue + + security = self.make_entity(match, "Security") + security.id = self.make_security_id(item["figi"]) + security.add("name", item["figi"]) + security.add("issuer", match) + security.add("ticker", item["ticker"]) + security.add("type", item["securityType"]) + security.add("notes", f'exchange {item["exchCode"]}') + + yield security From f9fcfee3c6a7acd807343970a2b6aec80f967074 Mon Sep 17 00:00:00 2001 From: JD Bothma Date: Wed, 15 Nov 2023 12:04:01 +0200 Subject: [PATCH 3/5] Types and tests --- nomenklatura/enrich/openfigi.py | 31 ++++++++------ tests/enrich/test_openfigi.py | 71 +++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 13 deletions(-) create mode 100644 tests/enrich/test_openfigi.py diff --git a/nomenklatura/enrich/openfigi.py b/nomenklatura/enrich/openfigi.py index 5426a0c6..df792a1d 100644 --- a/nomenklatura/enrich/openfigi.py +++ b/nomenklatura/enrich/openfigi.py @@ -22,15 +22,16 @@ def __init__(self, dataset: DS, cache: Cache, config: EnricherConfig): super().__init__(dataset, cache, config) api_key = os.environ.get("OPENFIGI_API_KEY") - self.session.headers["X-OPENFIGI-APIKEY"] = api_key + if api_key is not None: + self.session.headers["X-OPENFIGI-APIKEY"] = api_key - def make_company_id(self, name): + def make_company_id(self, name: str) -> str: return f"figi-co-{make_entity_id(name)}" - def make_security_id(self, figi): + def make_security_id(self, figi: str) -> str: return f"figi-id-{slugify(figi, sep='-')}" - def search(self, query): + def search(self, query: str) -> Generator[Dict[str, str], None, None]: body = {"query": query} next = None @@ -41,7 +42,8 @@ def search(self, query): log.info(f"Searching {query}. Offset={next}") cache_key = f"{URL}:{query}:{next}" resp = self.http_post_json_cached(URL, cache_key, body) - yield from resp["data"] + if "data" in resp: + yield from resp["data"] next = resp.get("next", None) if next is None: @@ -50,19 +52,20 @@ def search(self, query): def match(self, entity: CE) -> Generator[CE, None, None]: for name in entity.get("name"): for match in self.search(name): - other = self.make_entity(entity, "Company") - name = match.get("name", None) - if name is None: + match_name = match.get("name", None) + if match_name is None: continue - other.id = self.make_company_id(name) - other.add("name", name) + other = self.make_entity(entity, "Company") + other.id = self.make_company_id(match_name) + other.add("name", match_name) yield other def expand(self, entity: CE, match: CE) -> Generator[CE, None, None]: - yield match - name = match.get("name")[0] for item in self.search(name): + + # Only emit the securities which match the name of the positive match + # to the company exactly. Skip everything else. if item["name"] != name: continue @@ -72,6 +75,8 @@ def expand(self, entity: CE, match: CE) -> Generator[CE, None, None]: security.add("issuer", match) security.add("ticker", item["ticker"]) security.add("type", item["securityType"]) - security.add("notes", f'exchange {item["exchCode"]}') + if item["exchCode"] is not None: + security.add("notes", f'exchange {item["exchCode"]}') + security.add("description", item["securityDescription"]) yield security diff --git a/tests/enrich/test_openfigi.py b/tests/enrich/test_openfigi.py new file mode 100644 index 00000000..7a1e72ad --- /dev/null +++ b/tests/enrich/test_openfigi.py @@ -0,0 +1,71 @@ +import requests_mock +from nomenklatura.cache import Cache +from nomenklatura.dataset import Dataset +from nomenklatura.enrich import get_enricher +from nomenklatura.enrich.common import Enricher +from nomenklatura.entity import CompositeEntity + + +PATH = "nomenklatura.enrich.openfigi:OpenFIGIEnricher" +RESPONSE = { + "data": [ + { + "figi": "BBG0005S7P81", + "securityType": "EURO-DOLLAR", + "marketSector": "Govt", + "ticker": "BKRUSS F 12/31/01", + "name": "CENTRAL BANK OF RUSSIA", + "exchCode": "NOT LISTED", + "shareClassFIGI": None, + "compositeFIGI": None, + "securityType2": None, + "securityDescription": "BKRUSS Float 12/31/01", + }, + { + "figi": "BBG002T3FYF0", + "securityType": "Index", + "marketSector": "Index", + "ticker": "RCRAMAR", + "name": "Bank of Russia Russia Central", + "exchCode": None, + "shareClassFIGI": None, + "compositeFIGI": None, + "securityType2": None, + "securityDescription": "Bank of Russia Russia Central", + }, + ] +} + + +dataset = Dataset.make({"name": "ext_open_figi", "title": "OpenFIGI"}) + + +def load_enricher(): + enricher_cls = get_enricher(PATH) + assert issubclass(enricher_cls, Enricher) + cache = Cache.make_default(dataset) + return enricher_cls(dataset, cache, {}) + + +def test_nominatim_match(): + enricher = load_enricher() + with requests_mock.Mocker() as m: + m.post("/v3/search", json=RESPONSE) + + data = { + "schema": "Company", + "id": "xxx", + "properties": {"name": ["Bank of Russia"]}, + } + ent = CompositeEntity.from_data(dataset, data) + m_results = list(enricher.match(ent)) + assert len(m_results) == 2, m_results + m1 = m_results[0] + m2 = m_results[1] + assert m1.get("name") == ["CENTRAL BANK OF RUSSIA"], m1 + assert m2.get("name") == ["Bank of Russia Russia Central"], m2 + + e_results = list(enricher.expand(ent, m_results[0])) + assert len(e_results) == 1, e_results + assert e_results[0].get("ticker") == ["BKRUSS F 12/31/01"], e_results + assert e_results[0].get("issuer") == [m_results[0].id], e_results From 84d29e74ca32bce2e4664e571c15990bad7d7f0e Mon Sep 17 00:00:00 2001 From: JD Bothma Date: Wed, 15 Nov 2023 12:13:49 +0200 Subject: [PATCH 4/5] Avoid endless retry --- nomenklatura/enrich/common.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/nomenklatura/enrich/common.py b/nomenklatura/enrich/common.py index 4dc7a896..c8c170cf 100644 --- a/nomenklatura/enrich/common.py +++ b/nomenklatura/enrich/common.py @@ -102,6 +102,7 @@ def http_post_json_cached( cache_key: str, json: Any, cache_days: Optional[int] = None, + retry: int = 3, ) -> Any: cache_days_ = self.cache_days if cache_days is None else cache_days resp_data = self.cache.get_json(cache_key, max_age=cache_days_) @@ -113,9 +114,16 @@ def http_post_json_cached( if rex.response is not None and rex.response.status_code in (401, 403): raise EnrichmentAbort("Authorization failure: %s" % url) from rex if rex.response is not None and rex.response.status_code == 429: - log.info("Rate limit exceeded. Sleeping for 60s.") - time.sleep(61) - return self.http_post_json_cached(url, cache_key, json, cache_days) + if retry > 0: + log.info("Rate limit exceeded. Sleeping for 60s.") + time.sleep(61) + return self.http_post_json_cached( + url, cache_key, json, cache_days, retry - 1 + ) + else: + raise EnrichmentException( + "Rate limit exceeded and out of retries: %s" % url + ) from rex msg = "HTTP POST failed [%s]: %s" % (url, rex) raise EnrichmentException(msg) from rex resp_data = resp.json() From c7dec0afb87e63af80e4740ed73d98eeb8e0faad Mon Sep 17 00:00:00 2001 From: JD Bothma Date: Wed, 15 Nov 2023 13:11:47 +0200 Subject: [PATCH 5/5] Abort when backoff doesn't work --- nomenklatura/enrich/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nomenklatura/enrich/common.py b/nomenklatura/enrich/common.py index c8c170cf..399ce699 100644 --- a/nomenklatura/enrich/common.py +++ b/nomenklatura/enrich/common.py @@ -121,7 +121,7 @@ def http_post_json_cached( url, cache_key, json, cache_days, retry - 1 ) else: - raise EnrichmentException( + raise EnrichmentAbort( "Rate limit exceeded and out of retries: %s" % url ) from rex msg = "HTTP POST failed [%s]: %s" % (url, rex)