Merge pull request #7 from TheScienceMuseum/develop
Develop
kdutia authored Sep 11, 2020
2 parents 14754a4 + 19707c8 commit d3abae2
Showing 6 changed files with 57 additions and 7 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,11 @@

All notable changes documented below.

## 0.3.1

- **fix:** property values without types are ignored
- **enhancement:** refresh is disabled for the duration of the data load by default, controlled by the `--disable_refresh` flag. This is beneficial for large datasets or low-resource machines, as refreshing the search index is CPU-intensive and can cause the data load to freeze.

## 0.3.0

- add changeable timeout for `wbgetentities` GET request
5 changes: 5 additions & 0 deletions README.md
@@ -8,6 +8,7 @@ Simple CLI tools to load a subset of Wikidata into Elasticsearch. Part of the [H
- [Usage](#usage)
- [Loading from Wikidata dump (.ndjson)](#loading-from-wikidata-dump-ndjson)
- [Loading from SPARQL query](#loading-from-sparql-query)
- [Temporary side effects](#temporary-side-effects)

</br>

@@ -90,3 +91,7 @@ For smaller collections of Wikidata entities it might be easier to populate an E

1. Write a SPARQL query and save it to a text/.rq file. See [example](queries/humans.rq).
2. Run `ew query` with the `-p` option pointing to the file containing the SPARQL query. Optionally add `--page_size` to set the page size for the SPARQL query (see the example below).
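
A minimal invocation might look like the following. This is illustrative only: it uses the example query file `queries/humans.rq` shipped with the repo and an arbitrary page size; depending on your setup you may also need the Elasticsearch connection options described earlier in this README.

```bash
# load the results of the example SPARQL query, fetching 100 results per page
ew query -p queries/humans.rq --page_size 100
```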

### Temporary side effects

As of version *0.3.1*, refreshing the search index is disabled for the duration of the load by default, as [recommended by Elasticsearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/tune-for-indexing-speed.html#_unset_or_increase_the_refresh_interval). Refresh is re-enabled at the default interval of `1s` once the load is complete. To disable this behaviour, use the flag `--no_disable_refresh/-ndr`.
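
Under the hood this amounts to updating the index's `refresh_interval` setting around the load. The equivalent Elasticsearch `_settings` calls look roughly like this (the host `localhost:9200` and index name `my-index` are placeholders):

```bash
# before the load: turn off periodic refresh for the target index
curl -X PUT "localhost:9200/my-index/_settings" \
  -H 'Content-Type: application/json' \
  -d '{"index": {"refresh_interval": "-1"}}'

# after the load: restore the default 1s refresh interval
curl -X PUT "localhost:9200/my-index/_settings" \
  -H 'Content-Type: application/json' \
  -d '{"index": {"refresh_interval": "1s"}}'
```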
24 changes: 24 additions & 0 deletions cli.py
@@ -53,6 +53,12 @@
help="Timeout for Wikidata requests (seconds)",
default=6,
)
@click.option(
"--disable_refresh/--no_disable_refresh",
"-dr/-ndr",
help="Whether to disable CPU-intensive refresh on load. Defaults to True. Recommended to leave this on for low-resource machines or large datasets.",
default=True,
)
def main(
    source,
    path,
@@ -67,6 +73,7 @@ def main(
    language,
    properties,
    timeout,
    disable_refresh,
):

    # get elasticsearch credentials
@@ -111,6 +118,8 @@
kwargs["lang"] = language
if properties:
kwargs["properties"] = properties.split(",")
if disable_refresh:
kwargs["disable_refresh_on_index"] = disable_refresh

    # run job
    if source == "dump":
@@ -172,4 +181,19 @@ def check_es_credentials(credentials: dict):


if __name__ == "__main__":
    # main(
    # source='dump',
    # path="../wikidata/all_no_articles.ndjson",
    # properties="p31,p279",
    # config="./config.ini",
    # index='wikidump',
    # cluster=None,
    # user=None,
    # password=None,
    # agent_contact=False,
    # limit=None,
    # page_size=100,
    # language='en',
    # timeout=6,
    # )
    main()
13 changes: 13 additions & 0 deletions elastic_wikidata/dump_to_es.py
@@ -36,6 +36,8 @@ def __init__(
        self.index_name = index_name

        # process kwargs/set defaults
        self.disable_refresh_on_index = kwargs.get("disable_refresh_on_index", False)

        if "doc_limit" in kwargs:
            self.doc_limit = kwargs["doc_limit"]
        else:
@@ -93,6 +95,12 @@ def start_elasticsearch(self):

self.es.indices.create(index=self.index_name, ignore=400)

if self.disable_refresh_on_index:
print(
"Temporary disabling refresh for the index. Will reset refresh interval for the default (1s) after load is complete."
)
self.es.indices.put_settings({"index": {"refresh_interval": -1}})

    def dump_to_es(self):
        print("Indexing documents...")
        successes = 0
@@ -118,6 +126,11 @@ def dump_to_es(self):
                errors.append(action)
            successes += ok

        if self.disable_refresh_on_index:
            # reset back to default
            print("Refresh interval set back to the default of 1s.")
            self.es.indices.put_settings(
                {"index": {"refresh_interval": "1s"}}, index=self.index_name
            )

def process_doc(self, doc: dict) -> dict:
"""
Processes a single document from the JSON dump, returning a filtered version of that document.
15 changes: 9 additions & 6 deletions elastic_wikidata/wd_entities.py
@@ -138,12 +138,15 @@ def simplify_wbgetentities_result(
        if p in doc["claims"]:
            claims = []
            for i in doc["claims"][p]:
                value_type = i["mainsnak"]["datavalue"]["type"]
                if value_type == "string":
                    claims.append(i["mainsnak"]["datavalue"]["value"])
                else:
                    value_name = wd_type_mapping[value_type]
                    claims.append(i["mainsnak"]["datavalue"]["value"][value_name])
                try:
                    value_type = i["mainsnak"]["datavalue"]["type"]
                    if value_type == "string":
                        claims.append(i["mainsnak"]["datavalue"]["value"])
                    else:
                        value_name = wd_type_mapping[value_type]
                        claims.append(i["mainsnak"]["datavalue"]["value"][value_name])
                except KeyError:
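                    # skip claims with no datavalue (e.g. "somevalue"/"novalue" snaks) or an unmapped value type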
                    pass

newdoc["claims"][p] = claims

2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@

setuptools.setup(
name="elastic-wikidata",
version="0.3.0",
version="0.3.1",
author="Science Museum Group",
description="elastic-wikidata",
long_description=long_description,
