From 16504a7bfc038b8cb47c1203c8d00ce73758e4f2 Mon Sep 17 00:00:00 2001 From: Kalyan Dutia Date: Mon, 17 May 2021 14:01:13 +0100 Subject: [PATCH 1/7] pre-commit autoupdate and change black to python 3 --- .pre-commit-config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index cf1a1db..1a2e628 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,15 +1,15 @@ repos: - repo: https://github.com/ambv/black - rev: stable + rev: 21.5b1 hooks: - id: black - language_version: python3.7 + language_version: python3 - repo: https://gitlab.com/pycqa/flake8 - rev: 3.7.9 + rev: 3.9.2 hooks: - id: flake8 - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.1.0 + rev: v4.0.1 hooks: - id: check-json - id: check-merge-conflict \ No newline at end of file From d3018a9e2fa57e8c47e560d5d63004a58caeaad0 Mon Sep 17 00:00:00 2001 From: Kalyan Dutia Date: Mon, 17 May 2021 14:01:42 +0100 Subject: [PATCH 2/7] add pip requirements files --- requirements.txt | 5 +++++ requirements_dev.txt | 6 ++++++ 2 files changed, 11 insertions(+) create mode 100644 requirements.txt create mode 100644 requirements_dev.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..a70e2b9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +click==7.1.2 +elasticsearch==7.8.1 +SPARQLWrapper==1.8.5 +tqdm>=4.48.2 +requests==2.24.0 \ No newline at end of file diff --git a/requirements_dev.txt b/requirements_dev.txt new file mode 100644 index 0000000..08cf1da --- /dev/null +++ b/requirements_dev.txt @@ -0,0 +1,6 @@ +pre-commit +black +pytest +pylint +flake8 +jupyterlab From b2a87309b072f14f85aab34e991b16b760288cb3 Mon Sep 17 00:00:00 2001 From: Kalyan Dutia Date: Mon, 17 May 2021 14:01:57 +0100 Subject: [PATCH 3/7] add pyenv python-version to gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 5d704c4..38ea9cb 100755 --- a/.gitignore +++ b/.gitignore @@ -96,7 +96,7 @@ ipython_config.py # pyenv # For a library or package, you might want to ignore these files since the code is # intended to run in multiple environments; otherwise, check them in: -# .python-version +.python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. From 1ee3889f1e7516f230532ccb5b0656a696fa6c94 Mon Sep 17 00:00:00 2001 From: Kalyan Dutia Date: Mon, 17 May 2021 14:09:33 +0100 Subject: [PATCH 4/7] update cli help text Better help text for `--disable_refresh/--no_disable_refresh` option --- cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cli.py b/cli.py index 8c706cd..5e4e2c9 100755 --- a/cli.py +++ b/cli.py @@ -56,7 +56,7 @@ @click.option( "--disable_refresh/--no_disable_refresh", "-dr/-ndr", - help="Whether to disable CPU-intensive refresh on load. Defaults to True. Recommended to leave this on for low-resource machines or large datasets.", + help="Whether to disable Elasticsearch's (CPU-intensive) refresh during data load. Defaults to True. 
Recommended to leave this on for low-resource machines or large datasets.", default=True, ) def main( @@ -152,7 +152,7 @@ def load_from_sparql(path, es_credentials, index, limit, page_size=100, **kwargs query = f.read() # limit is used when getting list of entities - print(f"Getting entities from SPARQL query") + print("Getting entities from SPARQL query") entity_list = sparql_to_es.get_entities_from_query( query, page_size=100, limit=limit ) From e60bcc60d0c848cbd6bed1621cc26d301da5fba7 Mon Sep 17 00:00:00 2001 From: Kalyan Dutia Date: Mon, 17 May 2021 15:26:19 +0100 Subject: [PATCH 5/7] change to the way PIDs are specified Also v1.0.0-beta in changelog --- CHANGELOG.md | 3 +++ README.md | 4 ++-- cli.py | 10 ++++++++-- pids.sample.cfg | 3 +++ 4 files changed, 16 insertions(+), 4 deletions(-) create mode 100644 pids.sample.cfg diff --git a/CHANGELOG.md b/CHANGELOG.md index 8399bcc..f5e2c07 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ All notable changes documented below. +## 1.0.0-beta +- **enhancement (breaking change):** properties now passed as whitespace-separated list rather than comma-separated. They can also be passed through a config file by giving the `--properties` option a filename to a file that exists. + ## 0.3.7 - **fix:** reading from JSON dump forces utf-8 ## 0.3.6 diff --git a/README.md b/README.md index d01a096..5309379 100755 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Simple CLI tools to load a subset of Wikidata into Elasticsearch. Part of the [H - [Loading from Wikidata dump (.ndjson)](#loading-from-wikidata-dump-ndjson) - [Loading from SPARQL query](#loading-from-sparql-query) - [Temporary side effects](#temporary-side-effects) - +
![PyPI - Downloads](https://img.shields.io/pypi/dm/elastic-wikidata) @@ -62,7 +62,7 @@ A full list of options can be found with `ew --help`, but the following are like - `--index/-i`: the index name to push to. If not specified at runtime, elastic-wikidata will prompt for it - `--limit/-l`: limit the number of records pushed into ES. You might want to use this for a small trial run before importing the whole thing. -- `--properties/-prop`: pass a comma-separated list of properties to include in the ES index. E.g. *p31,p21*. +- `--properties/-prop`: a whitespace-separated list of properties to include in the ES index e.g. *'p31 p21'*, or the path to a text file containing newline-separated properties e.g. [this one](./pids.sample.cfg). - `--language/-lang`: [Wikimedia language code](https://www.wikidata.org/wiki/Help:Wikimedia_language_codes/lists/all). Only one supported at this time. ### Loading from Wikidata dump (.ndjson) diff --git a/cli.py b/cli.py index 5e4e2c9..32226c6 100755 --- a/cli.py +++ b/cli.py @@ -1,5 +1,6 @@ from elastic_wikidata import dump_to_es, sparql_to_es from elastic_wikidata.config import runtime_config +import os import click from configparser import ConfigParser @@ -44,7 +45,7 @@ "--properties", "-prop", type=str, - help="One or more Wikidata property e.g. p31 or p31,p21. Not case-sensitive", + help="One or more Wikidata property e.g. 'p31' or 'p31 p21'. A path to a file containing newline-separated properties can also be passed. Not case-sensitive", ) @click.option( "--timeout", @@ -117,7 +118,12 @@ def main( if language: kwargs["lang"] = language if properties: - kwargs["properties"] = properties.split(",") + if os.path.exists(properties): + with open(properties, "r") as f: + kwargs["properties"] = f.read().splitlines() + else: + kwargs["properties"] = properties.split() + if disable_refresh: kwargs["disable_refresh_on_index"] = disable_refresh diff --git a/pids.sample.cfg b/pids.sample.cfg new file mode 100644 index 0000000..0c9e8e9 --- /dev/null +++ b/pids.sample.cfg @@ -0,0 +1,3 @@ +P31 +P279 +P18 \ No newline at end of file From 0f771ecb312faee4a3a41c869b65a0a71d9b9150 Mon Sep 17 00:00:00 2001 From: Kalyan Dutia Date: Thu, 20 May 2021 10:01:29 +0100 Subject: [PATCH 6/7] es dump stability improvements - parallel_bulk -> streaming_bulk, as parallel_bulk has problems with memory consumption - add retry_on_timeout=True and max_retries=100 to Elasticsearch connector - refresh interval for es indexing resets to 1s even if the bulk load exists early --- CHANGELOG.md | 1 + elastic_wikidata/dump_to_es.py | 54 +++++++++++++++++++--------------- 2 files changed, 32 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f5e2c07..411901e 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ All notable changes documented below. ## 1.0.0-beta - **enhancement (breaking change):** properties now passed as whitespace-separated list rather than comma-separated. They can also be passed through a config file by giving the `--properties` option a filename to a file that exists. +- **stability improvements:** `elasticsearch.helpers.streaming_bulk` now used instead of `elasticsearch.helpers.parallel_bulk` due to issues with memory usage of the latter. Bulk load now retries on timeout. 
## 0.3.7 - **fix:** reading from JSON dump forces utf-8 diff --git a/elastic_wikidata/dump_to_es.py b/elastic_wikidata/dump_to_es.py index 39579a3..8433af0 100755 --- a/elastic_wikidata/dump_to_es.py +++ b/elastic_wikidata/dump_to_es.py @@ -2,7 +2,7 @@ from itertools import islice from tqdm.auto import tqdm from elasticsearch import Elasticsearch -from elasticsearch.helpers import parallel_bulk +from elasticsearch.helpers import streaming_bulk from typing import Union from elastic_wikidata.wd_entities import ( get_entities, @@ -87,11 +87,16 @@ def start_elasticsearch(self): self.es_credentials["ELASTICSEARCH_USER"], self.es_credentials["ELASTICSEARCH_PASSWORD"], ), + max_retries=100, + retry_on_timeout=True, ) else: # run on localhost print("Connecting to Elasticsearch on localhost") - self.es = Elasticsearch() + self.es = Elasticsearch( + max_retries=100, + retry_on_timeout=True, + ) mappings = { "mappings": { @@ -122,28 +127,31 @@ def dump_to_es(self): elif self.entities: action_generator = self.generate_actions_from_entities() - for ok, action in tqdm( - parallel_bulk( - client=self.es, - index=self.index_name, - actions=action_generator, - chunk_size=self.config["chunk_size"], - queue_size=self.config["queue_size"], - ), - ): - if not ok: - print(action) - errors.append(action) - successes += ok + try: + for ok, action in tqdm( + streaming_bulk( + client=self.es, + index=self.index_name, + actions=action_generator, + chunk_size=self.config["chunk_size"], + # queue_size=self.config["queue_size"], + max_retries=3, + ), + ): + if not ok: + print(action) + errors.append(action) + successes += ok - if self.disable_refresh_on_index: - # reset back to default - print("Refresh interval set back to default of 1s.") - self.es.indices.put_settings({"index": {"refresh_interval": "1s"}}) + finally: + if self.disable_refresh_on_index: + # reset back to default + print("Refresh interval set back to default of 1s.") + self.es.indices.put_settings({"index": {"refresh_interval": "1s"}}) def process_doc(self, doc: dict) -> dict: """ - Processes a single document from the JSON dump, returning a filtered version of that document. + Processes a single document from the JSON dump, returning a filtered version of that document. """ lang = self.wiki_options["lang"] @@ -153,8 +161,8 @@ def process_doc(self, doc: dict) -> dict: def generate_actions_from_dump(self): """ - Generator to yield a processed document from the Wikidata JSON dump. - Each line of the Wikidata JSON dump is a separate document. + Generator to yield a processed document from the Wikidata JSON dump. + Each line of the Wikidata JSON dump is a separate document. """ with open(self.dump_path, "r", encoding="utf-8") as f: objects = (json.loads(line) for line in f) @@ -170,7 +178,7 @@ def generate_actions_from_dump(self): def generate_actions_from_entities(self): """ - Generator to yield processed document from list of entities. Calls are made to + Generator to yield processed document from list of entities. Calls are made to wbgetentities API with page size of 50 to retrieve documents. """ From 65f2824284fe505d5feb290994e73601136067c9 Mon Sep 17 00:00:00 2001 From: Kalyan Dutia Date: Thu, 20 May 2021 10:03:02 +0100 Subject: [PATCH 7/7] bump to v1.0.0 --- CHANGELOG.md | 2 +- setup.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 411901e..b30b42b 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ All notable changes documented below. 
-## 1.0.0-beta +## 1.0.0 - **enhancement (breaking change):** properties now passed as whitespace-separated list rather than comma-separated. They can also be passed through a config file by giving the `--properties` option a filename to a file that exists. - **stability improvements:** `elasticsearch.helpers.streaming_bulk` now used instead of `elasticsearch.helpers.parallel_bulk` due to issues with memory usage of the latter. Bulk load now retries on timeout. diff --git a/setup.py b/setup.py index a79ab4c..0336dc7 100755 --- a/setup.py +++ b/setup.py @@ -5,13 +5,13 @@ setuptools.setup( name="elastic-wikidata", - version="0.3.7", + version="1.0.0", author="Science Museum Group", description="elastic-wikidata", long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/TheScienceMuseum/elastic-wikidata", - download_url="https://github.com/TheScienceMuseum/elastic-wikidata/archive/v0.3.2.tar.gz", + download_url="https://github.com/TheScienceMuseum/elastic-wikidata/archive/v1.0.0.tar.gz", classifiers=[ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License",
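
A minimal standalone sketch of the --properties handling introduced in PATCH 5 (the function name parse_properties is illustrative only; in the patch this logic lives inline in cli.py's main()):

    import os

    def parse_properties(properties: str) -> list:
        # Path to an existing file: read newline-separated PIDs (e.g. pids.sample.cfg).
        if os.path.exists(properties):
            with open(properties, "r") as f:
                return f.read().splitlines()
        # Otherwise: treat the value as a whitespace-separated list of PIDs.
        return properties.split()

    # parse_properties("p31 p21")          -> ["p31", "p21"]
    # parse_properties("pids.sample.cfg")  -> ["P31", "P279", "P18"]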