From 16504a7bfc038b8cb47c1203c8d00ce73758e4f2 Mon Sep 17 00:00:00 2001 From: Kalyan Dutia Date: Mon, 17 May 2021 14:01:13 +0100 Subject: [PATCH 1/7] pre-commit autoupdate and change black to python 3 --- .pre-commit-config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index cf1a1db..1a2e628 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,15 +1,15 @@ repos: - repo: https://github.com/ambv/black - rev: stable + rev: 21.5b1 hooks: - id: black - language_version: python3.7 + language_version: python3 - repo: https://gitlab.com/pycqa/flake8 - rev: 3.7.9 + rev: 3.9.2 hooks: - id: flake8 - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.1.0 + rev: v4.0.1 hooks: - id: check-json - id: check-merge-conflict \ No newline at end of file From d3018a9e2fa57e8c47e560d5d63004a58caeaad0 Mon Sep 17 00:00:00 2001 From: Kalyan Dutia Date: Mon, 17 May 2021 14:01:42 +0100 Subject: [PATCH 2/7] add pip requirements files --- requirements.txt | 5 +++++ requirements_dev.txt | 6 ++++++ 2 files changed, 11 insertions(+) create mode 100644 requirements.txt create mode 100644 requirements_dev.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..a70e2b9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +click==7.1.2 +elasticsearch==7.8.1 +SPARQLWrapper==1.8.5 +tqdm>=4.48.2 +requests==2.24.0 \ No newline at end of file diff --git a/requirements_dev.txt b/requirements_dev.txt new file mode 100644 index 0000000..08cf1da --- /dev/null +++ b/requirements_dev.txt @@ -0,0 +1,6 @@ +pre-commit +black +pytest +pylint +flake8 +jupyterlab From b2a87309b072f14f85aab34e991b16b760288cb3 Mon Sep 17 00:00:00 2001 From: Kalyan Dutia Date: Mon, 17 May 2021 14:01:57 +0100 Subject: [PATCH 3/7] add pyenv python-version to gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 5d704c4..38ea9cb 100755 --- a/.gitignore +++ b/.gitignore @@ -96,7 +96,7 @@ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: -# .python-version +.python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. From 1ee3889f1e7516f230532ccb5b0656a696fa6c94 Mon Sep 17 00:00:00 2001 From: Kalyan Dutia Date: Mon, 17 May 2021 14:09:33 +0100 Subject: [PATCH 4/7] update cli help text Better help text for `--disable_refresh/--no_disable_refresh` option --- cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cli.py b/cli.py index 8c706cd..5e4e2c9 100755 --- a/cli.py +++ b/cli.py @@ -56,7 +56,7 @@ @click.option( "--disable_refresh/--no_disable_refresh", "-dr/-ndr", - help="Whether to disable CPU-intensive refresh on load. Defaults to True. Recommended to leave this on for low-resource machines or large datasets.", + help="Whether to disable Elasticsearch's (CPU-intensive) refresh during data load. Defaults to True. 
Recommended to leave this on for low-resource machines or large datasets.", default=True, ) def main( @@ -152,7 +152,7 @@ def load_from_sparql(path, es_credentials, index, limit, page_size=100, **kwargs query = f.read() # limit is used when getting list of entities - print(f"Getting entities from SPARQL query") + print("Getting entities from SPARQL query") entity_list = sparql_to_es.get_entities_from_query( query, page_size=100, limit=limit ) From e60bcc60d0c848cbd6bed1621cc26d301da5fba7 Mon Sep 17 00:00:00 2001 From: Kalyan Dutia Date: Mon, 17 May 2021 15:26:19 +0100 Subject: [PATCH 5/7] change to the way PIDs are specified Also v1.0.0-beta in changelog --- CHANGELOG.md | 3 +++ README.md | 4 ++-- cli.py | 10 ++++++++-- pids.sample.cfg | 3 +++ 4 files changed, 16 insertions(+), 4 deletions(-) create mode 100644 pids.sample.cfg diff --git a/CHANGELOG.md b/CHANGELOG.md index 8399bcc..f5e2c07 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ All notable changes documented below. +## 1.0.0-beta +- **enhancement (breaking change):** properties now passed as whitespace-separated list rather than comma-separated. They can also be passed through a config file by giving the `--properties` option a filename to a file that exists. + ## 0.3.7 - **fix:** reading from JSON dump forces utf-8 ## 0.3.6 diff --git a/README.md b/README.md index d01a096..5309379 100755 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Simple CLI tools to load a subset of Wikidata into Elasticsearch. Part of the [H - [Loading from Wikidata dump (.ndjson)](#loading-from-wikidata-dump-ndjson) - [Loading from SPARQL query](#loading-from-sparql-query) - [Temporary side effects](#temporary-side-effects) - +
![PyPI - Downloads](https://img.shields.io/pypi/dm/elastic-wikidata) @@ -62,7 +62,7 @@ A full list of options can be found with `ew --help`, but the following are like - `--index/-i`: the index name to push to. If not specified at runtime, elastic-wikidata will prompt for it - `--limit/-l`: limit the number of records pushed into ES. You might want to use this for a small trial run before importing the whole thing. -- `--properties/-prop`: pass a comma-separated list of properties to include in the ES index. E.g. *p31,p21*. +- `--properties/-prop`: a whitespace-separated list of properties to include in the ES index e.g. *'p31 p21'*, or the path to a text file containing newline-separated properties e.g. [this one](./pids.sample.cfg). - `--language/-lang`: [Wikimedia language code](https://www.wikidata.org/wiki/Help:Wikimedia_language_codes/lists/all). Only one supported at this time. ### Loading from Wikidata dump (.ndjson) diff --git a/cli.py b/cli.py index 5e4e2c9..32226c6 100755 --- a/cli.py +++ b/cli.py @@ -1,5 +1,6 @@ from elastic_wikidata import dump_to_es, sparql_to_es from elastic_wikidata.config import runtime_config +import os import click from configparser import ConfigParser @@ -44,7 +45,7 @@ "--properties", "-prop", type=str, - help="One or more Wikidata property e.g. p31 or p31,p21. Not case-sensitive", + help="One or more Wikidata property e.g. 'p31' or 'p31 p21'. A path to a file containing newline-separated properties can also be passed. Not case-sensitive", ) @click.option( "--timeout", @@ -117,7 +118,12 @@ def main( if language: kwargs["lang"] = language if properties: - kwargs["properties"] = properties.split(",") + if os.path.exists(properties): + with open(properties, "r") as f: + kwargs["properties"] = f.read().splitlines() + else: + kwargs["properties"] = properties.split() + if disable_refresh: kwargs["disable_refresh_on_index"] = disable_refresh diff --git a/pids.sample.cfg b/pids.sample.cfg new file mode 100644 index 0000000..0c9e8e9 --- /dev/null +++ b/pids.sample.cfg @@ -0,0 +1,3 @@ +P31 +P279 +P18 \ No newline at end of file From 0f771ecb312faee4a3a41c869b65a0a71d9b9150 Mon Sep 17 00:00:00 2001 From: Kalyan Dutia Date: Thu, 20 May 2021 10:01:29 +0100 Subject: [PATCH 6/7] es dump stability improvements - parallel_bulk -> streaming_bulk, as parallel_bulk has problems with memory consumption - add retry_on_timeout=True and max_retries=100 to Elasticsearch connector - refresh interval for es indexing resets to 1s even if the bulk load exists early --- CHANGELOG.md | 1 + elastic_wikidata/dump_to_es.py | 54 +++++++++++++++++++--------------- 2 files changed, 32 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f5e2c07..411901e 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ All notable changes documented below. ## 1.0.0-beta - **enhancement (breaking change):** properties now passed as whitespace-separated list rather than comma-separated. They can also be passed through a config file by giving the `--properties` option a filename to a file that exists. +- **stability improvements:** `elasticsearch.helpers.streaming_bulk` now used instead of `elasticsearch.helpers.parallel_bulk` due to issues with memory usage of the latter. Bulk load now retries on timeout. 
## 0.3.7 - **fix:** reading from JSON dump forces utf-8 diff --git a/elastic_wikidata/dump_to_es.py b/elastic_wikidata/dump_to_es.py index 39579a3..8433af0 100755 --- a/elastic_wikidata/dump_to_es.py +++ b/elastic_wikidata/dump_to_es.py @@ -2,7 +2,7 @@ from itertools import islice from tqdm.auto import tqdm from elasticsearch import Elasticsearch -from elasticsearch.helpers import parallel_bulk +from elasticsearch.helpers import streaming_bulk from typing import Union from elastic_wikidata.wd_entities import ( get_entities, @@ -87,11 +87,16 @@ def start_elasticsearch(self): self.es_credentials["ELASTICSEARCH_USER"], self.es_credentials["ELASTICSEARCH_PASSWORD"], ), + max_retries=100, + retry_on_timeout=True, ) else: # run on localhost print("Connecting to Elasticsearch on localhost") - self.es = Elasticsearch() + self.es = Elasticsearch( + max_retries=100, + retry_on_timeout=True, + ) mappings = { "mappings": { @@ -122,28 +127,31 @@ def dump_to_es(self): elif self.entities: action_generator = self.generate_actions_from_entities() - for ok, action in tqdm( - parallel_bulk( - client=self.es, - index=self.index_name, - actions=action_generator, - chunk_size=self.config["chunk_size"], - queue_size=self.config["queue_size"], - ), - ): - if not ok: - print(action) - errors.append(action) - successes += ok + try: + for ok, action in tqdm( + streaming_bulk( + client=self.es, + index=self.index_name, + actions=action_generator, + chunk_size=self.config["chunk_size"], + # queue_size=self.config["queue_size"], + max_retries=3, + ), + ): + if not ok: + print(action) + errors.append(action) + successes += ok - if self.disable_refresh_on_index: - # reset back to default - print("Refresh interval set back to default of 1s.") - self.es.indices.put_settings({"index": {"refresh_interval": "1s"}}) + finally: + if self.disable_refresh_on_index: + # reset back to default + print("Refresh interval set back to default of 1s.") + self.es.indices.put_settings({"index": {"refresh_interval": "1s"}}) def process_doc(self, doc: dict) -> dict: """ - Processes a single document from the JSON dump, returning a filtered version of that document. + Processes a single document from the JSON dump, returning a filtered version of that document. """ lang = self.wiki_options["lang"] @@ -153,8 +161,8 @@ def process_doc(self, doc: dict) -> dict: def generate_actions_from_dump(self): """ - Generator to yield a processed document from the Wikidata JSON dump. - Each line of the Wikidata JSON dump is a separate document. + Generator to yield a processed document from the Wikidata JSON dump. + Each line of the Wikidata JSON dump is a separate document. """ with open(self.dump_path, "r", encoding="utf-8") as f: objects = (json.loads(line) for line in f) @@ -170,7 +178,7 @@ def generate_actions_from_dump(self): def generate_actions_from_entities(self): """ - Generator to yield processed document from list of entities. Calls are made to + Generator to yield processed document from list of entities. Calls are made to wbgetentities API with page size of 50 to retrieve documents. """ From 65f2824284fe505d5feb290994e73601136067c9 Mon Sep 17 00:00:00 2001 From: Kalyan Dutia Date: Thu, 20 May 2021 10:03:02 +0100 Subject: [PATCH 7/7] bump to v1.0.0 --- CHANGELOG.md | 2 +- setup.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 411901e..b30b42b 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ All notable changes documented below. 
-## 1.0.0-beta +## 1.0.0 - **enhancement (breaking change):** properties now passed as whitespace-separated list rather than comma-separated. They can also be passed through a config file by giving the `--properties` option a filename to a file that exists. - **stability improvements:** `elasticsearch.helpers.streaming_bulk` now used instead of `elasticsearch.helpers.parallel_bulk` due to issues with memory usage of the latter. Bulk load now retries on timeout. diff --git a/setup.py b/setup.py index a79ab4c..0336dc7 100755 --- a/setup.py +++ b/setup.py @@ -5,13 +5,13 @@ setuptools.setup( name="elastic-wikidata", - version="0.3.7", + version="1.0.0", author="Science Museum Group", description="elastic-wikidata", long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/TheScienceMuseum/elastic-wikidata", - download_url="https://github.com/TheScienceMuseum/elastic-wikidata/archive/v0.3.2.tar.gz", + download_url="https://github.com/TheScienceMuseum/elastic-wikidata/archive/v1.0.0.tar.gz", classifiers=[ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License",
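
A minimal standalone sketch of the --properties handling introduced in PATCH 5 (the function name parse_properties is illustrative only; in the patch this logic lives inline in cli.py's main()):

    import os

    def parse_properties(properties: str) -> list:
        # Path to an existing file: read newline-separated PIDs (e.g. pids.sample.cfg).
        if os.path.exists(properties):
            with open(properties, "r") as f:
                return f.read().splitlines()
        # Otherwise: treat the value as a whitespace-separated list of PIDs.
        return properties.split()

    # parse_properties("p31 p21")          -> ["p31", "p21"]
    # parse_properties("pids.sample.cfg")  -> ["P31", "P279", "P18"]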