-
Notifications
You must be signed in to change notification settings - Fork 0
/
ok_add_ids.py
81 lines (65 loc) · 2.02 KB
/
ok_add_ids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# coding=utf-8
import pywikibot
import re
import urllib.request
from pywikibot import Claim, pagegenerators as pg
from time import sleep
from urllib.error import HTTPError
from utils.properties import PID_OK_ACCOUNT, PID_OK_PROFILE_ID
# Shared pywikibot site handle for Wikidata; it is passed to Claim() when
# building qualifiers and to the SPARQL page generator in iterate_items().
repo = pywikibot.Site('wikidata', 'wikidata')
def parse_ok_ru(account):
    """Download an ok.ru page and extract the numeric profile ID from its HTML.

    The page at ``https://ok.ru/<account>`` is fetched and searched for the
    login-popup link whose ``href`` has the form ``/profile/<digits>``.

    Args:
        account: String appended verbatim to ``https://ok.ru/``.

    Returns:
        The digits of the profile ID as a string, or ``None`` when the page
        still answers with an HTTP error after the retries are exhausted or
        when the expected link is not present in the page.
    """
    html = None
    tries = 0
    while html is None:
        try:
            # The context manager closes the HTTP response (and its socket)
            # as soon as the body has been read, instead of leaking it.
            with urllib.request.urlopen('https://ok.ru/' + account) as response:
                html = response.read().decode()
        except HTTPError:
            # Give up once four retries have already been made.
            if tries > 3:
                return None
            tries += 1
            # Exponential backoff: 2, 4, 8, then 16 seconds.
            sleep(2 ** tries)

    match = re.search(
        r'<a data-module="AuthLoginPopup" href="/profile/(\d+)"', html)
    if match is None:
        return None
    return match.group(1)
def process_claim(claim):
    """Attach a numeric profile-ID qualifier to one account claim.

    Claims that already carry a ``PID_OK_PROFILE_ID`` qualifier are left
    alone.  When the claim value has the form ``profile/<digits>`` the digits
    are used directly; otherwise the ID is looked up through
    ``parse_ok_ru``.  If no ID can be determined, a ``"<account> -> group"``
    line is printed and nothing is written (presumably such accounts are
    group pages -- the code itself does not verify that).

    Args:
        claim: Claim object exposing ``qualifiers``, ``getTarget()`` and
            ``addQualifier()``.
    """
    # Nothing to do when the qualifier is already there.
    if PID_OK_PROFILE_ID in claim.qualifiers:
        return

    account = claim.getTarget()
    direct = re.fullmatch('^profile/(\\d+)$', account)
    # Only hit the website when the value does not already contain the ID.
    account_id = direct.group(1) if direct else parse_ok_ru(account)
    if account_id is None:
        print("%s -> group" % account)
        return

    print("%s -> %s" % (account, account_id))
    id_qualifier = Claim(repo, PID_OK_PROFILE_ID)
    id_qualifier.setTarget(account_id)
    claim.addQualifier(id_qualifier)
def add_ok_numeric_id(item):
    """Run ``process_claim`` on every ``PID_OK_ACCOUNT`` claim of an item.

    Args:
        item: Object whose ``get()`` returns a mapping; the claims are read
            from its ``'claims'`` entry.  Items without that entry, or
            without any ``PID_OK_ACCOUNT`` claim, are skipped silently.
    """
    data = item.get()
    if 'claims' not in data:
        return

    claims = data['claims']
    if PID_OK_ACCOUNT not in claims:
        return

    for account_claim in claims[PID_OK_ACCOUNT]:
        process_claim(account_claim)
def iterate_items():
    """Process every item returned by the SPARQL query, one at a time.

    The query selects items that
      * have a P5163 statement whose value does not start with ``group/``
        (case-insensitive match),
      * are the subject of a ru.wikipedia.org article, and
      * whose P5163 statement has no P9269 qualifier yet.

    Each item is handed to ``add_ok_numeric_id``, followed by a five-second
    pause before the next one.
    """
    sparql = '''
SELECT ?item
{
?item p:P5163 ?statement .
?statement ps:P5163 ?value .
#?item wdt:P31 wd:Q5 .
?article schema:about ?item .
?article schema:isPartOf <https://ru.wikipedia.org/>.
FILTER REGEX(?value, "^(?!group/)", "i")
FILTER NOT EXISTS{ ?statement pq:P9269 [] }
}
'''
    for page in pg.WikidataSPARQLPageGenerator(sparql, site=repo):
        add_ok_numeric_id(page)
        # Throttle: wait between items.
        sleep(5)
# Script entry point: the whole run starts as soon as this module is
# executed (there is no __main__ guard, so importing it also starts the run).
iterate_items()