-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_dump.py
52 lines (41 loc) · 1.31 KB
/
get_dump.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import json
import os
from argparse import ArgumentParser
from urllib.parse import quote

import requests
import yaml
# Download a HAL dump: read the query settings from a YAML configuration file,
# page through the search endpoint with a cursor, and write every collected
# document to a single JSON file.

# --- Command line ------------------------------------------------------------
ap = ArgumentParser()
# The configuration path is mandatory; without it open() below would be
# called with None and fail with an unhelpful TypeError.
ap.add_argument('--config', type=str, required=True,
                help="path to the YAML configuration file")
args = ap.parse_args()
config = args.config

# --- Configuration -----------------------------------------------------------
with open(config, "r", encoding="utf-8") as fh:
    params = yaml.load(fh, Loader=yaml.SafeLoader)
# Only the 'corpus' section of the configuration is used by this script.
params = params['corpus']

dump_file = os.path.abspath(params['dump_file'])
BASE_URL = params['baseUrl']
PAGINATION_COUNT = params['pagination_count']
QUERY = params['query']
# FL_PARAM is a complete query-string fragment: it already carries '&fl='.
FL_PARAM = '&fl=' + ','.join(params['fields'])
PORTAIL = params['portail']

# --- Request URL (everything except the cursor) --------------------------------
# QUERY is inserted verbatim, so it has to be URL-safe in the configuration.
base_url = f"{BASE_URL}/search/{PORTAIL}/?q={QUERY}"
# FL_PARAM is appended as-is; prefixing it with another '&fl=' would send an
# empty 'fl' parameter in addition to the real one.
base_url += f"&wt=json{FL_PARAM}"
base_url += f"&rows={PAGINATION_COUNT}&sort=docid+asc"

# --- Paginated download ------------------------------------------------------
print("Downloading HAL dump...")
cursorMark = "*"
prevCursorMark = ""
docs = []
# The loop ends when the server hands back the same cursor it was given,
# i.e. there is no further page to fetch.
while cursorMark != prevCursorMark:
    # The cursor is an opaque token supplied by the server. Percent-encode it
    # so that characters such as '+', '/' or '=' survive in the query string
    # (a raw '+' would be decoded as a space on the server side).
    url = base_url + f"&cursorMark={quote(cursorMark, safe='')}"
    print(f"{len(docs)} documents collected yet... downloading next page at {url}")
    prevCursorMark = cursorMark

    # NOTE(review): no timeout is passed, so a stalled connection blocks forever.
    x = requests.get(url)
    if not x.ok:
        raise ValueError(f"Failed query: {x}")
    res = json.loads(x.text)
    # A successful HTTP status can still carry an error payload.
    if 'error' in res:
        raise ValueError(res['error'])

    docs.extend(res['response']['docs'])
    cursorMark = res['nextCursorMark']

# --- Output ------------------------------------------------------------------
with open(dump_file, 'w', encoding="utf-8") as fp:
    json.dump(docs, fp)
print(f"got {len(docs)} entries, saved at {dump_file}.")