Merge pull request #7 from TheScienceMuseum/develop
Develop
kdutia authored Sep 11, 2020
2 parents 14754a4 + 19707c8 commit d3abae2
Showing 6 changed files with 57 additions and 7 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,11 @@

All notable changes documented below.

## 0.3.1

- **fix:** property values without types are ignored
- **enhancement:** refresh is disabled for the duration of the data load by default, controlled by the `--disable_refresh` flag. This is beneficial for large datasets or low-resource machines, as refreshing the search index is CPU-intensive and can cause the data load to freeze.

## 0.3.0

- add changeable timeout for `wbgetentities` GET request
5 changes: 5 additions & 0 deletions README.md
@@ -8,6 +8,7 @@ Simple CLI tools to load a subset of Wikidata into Elasticsearch. Part of the [H
- [Usage](#usage)
- [Loading from Wikidata dump (.ndjson)](#loading-from-wikidata-dump-ndjson)
- [Loading from SPARQL query](#loading-from-sparql-query)
- [Temporary side effects](#temporary-side-effects)

</br>

@@ -90,3 +91,7 @@ For smaller collections of Wikidata entities it might be easier to populate an E

1. Write a SPARQL query and save it to a text/.rq file. See [example](queries/humans.rq).
2. Run `ew query` with the `-p` option pointing to the file containing the SPARQL query. Optionally add `--page_size` to set the page size for the SPARQL query (see the example below).
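
A minimal invocation might look like the following. This is illustrative only: it uses the example query file `queries/humans.rq` shipped with the repo and an arbitrary page size; depending on your setup you may also need the Elasticsearch connection options described earlier in this README.

```bash
# load the results of the example SPARQL query, fetching 100 results per page
ew query -p queries/humans.rq --page_size 100
```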

### Temporary side effects

As of version *0.3.1*, refreshing the search index is disabled for the duration of the load by default, as [recommended by Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/tune-for-indexing-speed.html#_unset_or_increase_the_refresh_interval). Refresh is re-enabled at the default interval of `1s` once the load is complete. To disable this behaviour, use the flag `--no_disable_refresh/-ndr`.
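
Under the hood this amounts to updating the index's `refresh_interval` setting around the load. The equivalent Elasticsearch `_settings` calls look roughly like this (the host `localhost:9200` and index name `my-index` are placeholders):

```bash
# before the load: turn off periodic refresh for the target index
curl -X PUT "localhost:9200/my-index/_settings" \
  -H 'Content-Type: application/json' \
  -d '{"index": {"refresh_interval": "-1"}}'

# after the load: restore the default 1s refresh interval
curl -X PUT "localhost:9200/my-index/_settings" \
  -H 'Content-Type: application/json' \
  -d '{"index": {"refresh_interval": "1s"}}'
```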
24 changes: 24 additions & 0 deletions cli.py
@@ -53,6 +53,12 @@
help="Timeout for Wikidata requests (seconds)",
default=6,
)
@click.option(
"--disable_refresh/--no_disable_refresh",
"-dr/-ndr",
help="Whether to disable CPU-intensive refresh on load. Defaults to True. Recommended to leave this on for low-resource machines or large datasets.",
default=True,
)
def main(
    source,
    path,
@@ -67,6 +73,7 @@ def main(
    language,
    properties,
    timeout,
    disable_refresh,
):

    # get elasticsearch credentials
@@ -111,6 +118,8 @@
kwargs["lang"] = language
if properties:
kwargs["properties"] = properties.split(",")
if disable_refresh:
kwargs["disable_refresh_on_index"] = disable_refresh

    # run job
    if source == "dump":
@@ -172,4 +181,19 @@ def check_es_credentials(credentials: dict):


if __name__ == "__main__":
    # main(
    # source='dump',
    # path="../wikidata/all_no_articles.ndjson",
    # properties="p31,p279",
    # config="./config.ini",
    # index='wikidump',
    # cluster=None,
    # user=None,
    # password=None,
    # agent_contact=False,
    # limit=None,
    # page_size=100,
    # language='en',
    # timeout=6,
    # )
    main()
13 changes: 13 additions & 0 deletions elastic_wikidata/dump_to_es.py
@@ -36,6 +36,8 @@ def __init__(
        self.index_name = index_name

        # process kwargs/set defaults
        self.disable_refresh_on_index = kwargs.get("disable_refresh_on_index", False)

        if "doc_limit" in kwargs:
            self.doc_limit = kwargs["doc_limit"]
        else:
@@ -93,6 +95,12 @@ def start_elasticsearch(self):

self.es.indices.create(index=self.index_name, ignore=400)

if self.disable_refresh_on_index:
print(
"Temporary disabling refresh for the index. Will reset refresh interval for the default (1s) after load is complete."
)
self.es.indices.put_settings({"index": {"refresh_interval": -1}})

    def dump_to_es(self):
        print("Indexing documents...")
        successes = 0
@@ -118,6 +126,11 @@ def dump_to_es(self):
                errors.append(action)
            successes += ok

        if self.disable_refresh_on_index:
            # reset back to default
            print("Refresh interval set back to the default of 1s.")
            self.es.indices.put_settings(
                {"index": {"refresh_interval": "1s"}}, index=self.index_name
            )

def process_doc(self, doc: dict) -> dict:
"""
Processes a single document from the JSON dump, returning a filtered version of that document.
15 changes: 9 additions & 6 deletions elastic_wikidata/wd_entities.py
@@ -138,12 +138,15 @@ def simplify_wbgetentities_result(
        if p in doc["claims"]:
            claims = []
            for i in doc["claims"][p]:
                value_type = i["mainsnak"]["datavalue"]["type"]
                if value_type == "string":
                    claims.append(i["mainsnak"]["datavalue"]["value"])
                else:
                    value_name = wd_type_mapping[value_type]
                    claims.append(i["mainsnak"]["datavalue"]["value"][value_name])
                try:
                    value_type = i["mainsnak"]["datavalue"]["type"]
                    if value_type == "string":
                        claims.append(i["mainsnak"]["datavalue"]["value"])
                    else:
                        value_name = wd_type_mapping[value_type]
                        claims.append(i["mainsnak"]["datavalue"]["value"][value_name])
                except KeyError:
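                    # skip claims with no datavalue (e.g. "somevalue"/"novalue" snaks) or an unmapped value type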
                    pass

newdoc["claims"][p] = claims

2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@

setuptools.setup(
name="elastic-wikidata",
version="0.3.0",
version="0.3.1",
author="Science Museum Group",
description="elastic-wikidata",
long_description=long_description,
