Skip to content

Commit

Permalink
Merge pull request #6 from TheScienceMuseum/develop
Browse files Browse the repository at this point in the history
0.3.0
  • Loading branch information
kdutia authored Aug 26, 2020
2 parents 3b106b5 + c3f7a2b commit 14754a4
Show file tree
Hide file tree
Showing 13 changed files with 248 additions and 52 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Changelog

All notable changes are documented below.

## 0.3.0

- add configurable timeout for `wbgetentities` GET request
- handle more Wikidata claims than just QIDs
- generate User Agent from request in line with Wikidata guidelines
- make Wikidata-related methods importable (rather than just runnable from CLI)
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ This is useful if you want to create one or more large subsets of Wikidata in di
### Loading from SPARQL query

``` bash
ew query <path_to_sparql_query> <other_options>
ew query -p <path_to_sparql_query> <other_options>
```

For smaller collections of Wikidata entities it might be easier to populate an Elasticsearch index directly from a SPARQL query rather than downloading the whole Wikidata dump to take a subset. `ew query` [automatically paginates SPARQL queries](examples/paginate%20query.ipynb) so that a heavy query like *'return all the humans'* doesn't result in a timeout error.
Expand Down
33 changes: 33 additions & 0 deletions cli.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from elastic_wikidata import dump_to_es, sparql_to_es
from elastic_wikidata.config import runtime_config
import click
from configparser import ConfigParser

Expand All @@ -13,6 +14,13 @@
@click.option(
"--password", envvar="ELASTICSEARCH_PASSWORD", help="Elasticsearch password"
)
@click.option(
"--agent_contact",
"-contact",
envvar="WIKIMEDIA_AGENT_CONTACT",
help="(optional) Contact details to add to the User Agent header for Wikidata requests",
default=None,
)
@click.option(
"--config",
"-c",
Expand All @@ -38,19 +46,29 @@
type=str,
help="One or more Wikidata property e.g. p31 or p31,p21. Not case-sensitive",
)
@click.option(
"--timeout",
"-t",
type=int,
help="Timeout for Wikidata requests (seconds)",
default=6,
)
def main(
source,
path,
cluster,
user,
password,
agent_contact,
config,
index,
limit,
page_size,
language,
properties,
timeout,
):

# get elasticsearch credentials
if config:
# read .ini file
Expand All @@ -59,6 +77,14 @@ def main(
parser.read(config)
es_credentials = parser._sections["ELASTIC"]
check_es_credentials(es_credentials)

runtime_config.add_item(
{
"user_agent_contact": parser._sections["HTTP"].get(
"CONTACT_DETAILS", None
)
}
)
else:
# check environment variables/flags
es_credentials = {}
Expand All @@ -72,6 +98,13 @@ def main(

check_es_credentials(es_credentials)

runtime_config.add_item({"user_agent_contact": agent_contact})

runtime_config.add_item({"http_timeout": timeout})

# global flag for all functions that the module is being run through the CLI
runtime_config.add_item({"cli": True})

# set kwargs
kwargs = {}
if language:
Expand Down
5 changes: 4 additions & 1 deletion config.sample.ini
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
[ELASTIC]
ELASTIC_SEARCH_CLUSTER =
ELASTIC_SEARCH_USER =
ELASTIC_SEARCH_PASSWORD =
ELASTIC_SEARCH_PASSWORD =

[HTTP]
CONTACT_DETAILS =
1 change: 1 addition & 0 deletions elastic_wikidata/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from elastic_wikidata.__metadata__ import __version__
1 change: 1 addition & 0 deletions elastic_wikidata/__metadata__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
__version__ = "0.3.0"
27 changes: 27 additions & 0 deletions elastic_wikidata/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
class RuntimeConfig:
    """
    Simple in-memory key/value store for settings shared at runtime.
    """

    def __init__(self):
        # all settings live in one plain dict
        self.items = dict()

    def add_item(self, item: dict):
        """
        Merge the key/value pairs of `item` into the config, overwriting
        any keys that are already present.
        """

        self.items.update(item)

    def get(self, key: str):
        """
        Look up a single config value. Returns None when `key` is not set.
        """

        return self.items[key] if key in self.items else None

    def get_all(self) -> dict:
        """
        Return the dict holding every config item (the stored dict itself, not a copy).
        """

        return self.items


# module-level instance shared by everything that imports this module
runtime_config = RuntimeConfig()
48 changes: 11 additions & 37 deletions elastic_wikidata/dump_to_es.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from typing import Union
import re
from elastic_wikidata.wd_entities import get_entities
from elastic_wikidata.wd_entities import (
get_entities,
wiki_property_check,
simplify_wbgetentities_result,
)


class processDump:
Expand All @@ -15,8 +18,6 @@ def __init__(
self.config = {
"chunk_size": 1000,
"queue_size": 8,
"lang": "en",
"properties": ["P31"],
}

self.es_credentials = es_credentials
Expand All @@ -29,7 +30,7 @@ def __init__(
self.dump_path = None
else:
raise ValueError(
"dump must either be path to JSON dump or Python list of entitiess"
"dump must either be path to JSON dump or Python list of entities"
)

self.index_name = index_name
Expand All @@ -47,12 +48,10 @@ def __init__(
else:
self.wiki_options["lang"] = "en"

def wiki_property_check(p):
if len(re.findall(r"(p\d+)", p.lower())) == 1:
return True
else:
print(f"WARNING: property {p} is not a valid Wikidata property")
return False
if "user_agent_contact" in kwargs:
self.user_agent_contact = kwargs["user_agent_contact"]
else:
self.user_agent_contact = None

if "properties" in kwargs:
if isinstance(kwargs["properties"], str) and wiki_property_check(
Expand Down Expand Up @@ -127,32 +126,7 @@ def process_doc(self, doc: dict) -> dict:
lang = self.wiki_options["lang"]
properties = self.wiki_options["properties"]

newdoc = {"id": doc["id"]}

# add label(s)
if lang in doc["labels"]:
newdoc["labels"] = doc["labels"][lang]["value"]

# add descriptions(s)
if lang in doc["descriptions"]:
newdoc["descriptions"] = doc["descriptions"][lang]["value"]

# add aliases
if (len(doc["aliases"]) > 0) and (lang in doc["aliases"]):
newdoc["aliases"] = [i["value"] for i in doc["aliases"][lang]]
else:
newdoc["aliases"] = []

# add claims (property values)
newdoc["claims"] = {}

for p in properties:
if p in doc["claims"]:
newdoc["claims"][p] = [
i["mainsnak"]["datavalue"]["value"]["id"] for i in doc["claims"][p]
]

return newdoc
return simplify_wbgetentities_result(doc, lang, properties)

def generate_actions_from_dump(self):
"""
Expand Down
63 changes: 63 additions & 0 deletions elastic_wikidata/http.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import requests
import sys
from urllib.parse import quote
from elastic_wikidata import __version__ as ew_version
from elastic_wikidata.config import runtime_config


def generate_user_agent():
    """
    Generates user agent string according to Wikidata User Agent Guidelines (https://meta.wikimedia.org/wiki/User-Agent_policy).
    Uses contact information from `runtime_config.get('user_agent_contact')`.

    A contact value that is None, empty or made up only of whitespace is treated
    as "no contact information": the parenthesised contact section is left out
    and, when running through the CLI, a warning is printed.

    Returns:
        str: user agent string
    """
    v_params = {
        "python": "Python/" + ".".join(str(i) for i in sys.version_info),
        "http_backend": "requests/" + requests.__version__,
        "ew": "Elastic Wikidata bot/" + ew_version,
    }

    contact_information = runtime_config.get("user_agent_contact")

    # split() without an argument drops empty tokens, so a blank value or runs
    # of whitespace can never yield an empty "()" section in the header.
    raw_parts = contact_information.split() if contact_information else []
    contact_parts = [process_user_agent_username(part) for part in raw_parts]

    if contact_parts:
        contact = " ".join(contact_parts)
        return f"{v_params['ew']} ({contact}) {v_params['http_backend']} {v_params['python']}"

    # only nag interactive users; library callers get the header silently
    if runtime_config.get("cli"):
        print(
            "WARNING: please consider adding contact information through config.ini or the -contact flag to improve the User Agent header for Wikidata requests."
        )
    return f"{v_params['ew']} {v_params['http_backend']} {v_params['python']}"


def process_user_agent_username(username=None):
    """
    **Credit to [pywikibot](https://www.mediawiki.org/wiki/Manual:Pywikibot)**
    Reduce username to a representation permitted in HTTP headers.

    Steps applied:
    1) a missing or empty username becomes the empty string
    2) spaces (' ') are replaced with '_'
    3) if the result contains non-ASCII characters, it is encoded as UTF-8 and URL-encoded
    4) if the result is ASCII but contains '%', it is URL-encoded, so a
       hand-written percent-encoded value cannot pass through unchanged
    """
    if not username:
        return ""

    # underscores instead of spaces (and instead of %20)
    sanitized = username.replace(" ", "_")

    contains_non_ascii = any(ord(char) > 127 for char in sanitized)
    if contains_non_ascii:
        return quote(sanitized.encode("utf-8"))

    # plain ASCII: only a literal '%' needs escaping
    if "%" in sanitized:
        return quote(sanitized)

    return sanitized
10 changes: 6 additions & 4 deletions elastic_wikidata/sparql_helpers.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from SPARQLWrapper import SPARQLWrapper, JSON
import urllib
import time
from elastic_wikidata.http import generate_user_agent


def run_query(query: str, endpoint_url="https://query.wikidata.org/sparql") -> dict:
Expand All @@ -14,13 +15,14 @@ def run_query(query: str, endpoint_url="https://query.wikidata.org/sparql") -> d
Returns:
query_result (dict): the JSON result of the query as a dict
"""
sparql = SPARQLWrapper(endpoint_url)

user_agent = generate_user_agent()

sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
sparql.setQuery(query)
sparql.setMethod("POST")
sparql.setReturnFormat(JSON)
sparql.addCustomHttpHeader(
"User-Agent", "Elastic Wikidata/0.1 (Science Museum Group)",
)

try:
return sparql.query().convert()
except urllib.error.HTTPError as e:
Expand Down
3 changes: 2 additions & 1 deletion elastic_wikidata/sparql_to_es.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from elastic_wikidata.sparql_helpers import run_query, paginate_sparql_query
import re
from math import ceil
from itertools import islice
from tqdm.auto import tqdm
from elastic_wikidata.sparql_helpers import run_query, paginate_sparql_query
from elastic_wikidata.http import generate_user_agent


def url_to_qid(url: str) -> str:
Expand Down
Loading

0 comments on commit 14754a4

Please sign in to comment.