-
Notifications
You must be signed in to change notification settings - Fork 3
/
yle-fi-to-selko
executable file
·87 lines (67 loc) · 2.72 KB
/
yle-fi-to-selko
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#! /usr/bin/env python3
# -*- mode: Python; -*-
'''Extract Yle Selkouutiset documents from a tree of Yle API JSON files.'''
from argparse import ArgumentParser
import glob, json, os, sys
def parseargs():
    '''Parse the command line and return the resulting namespace.

    The namespace carries:
      out   -- output directory name (option --out/-o, default "selko")
      indir -- input directory name (optional positional, default "fi")
      prog  -- the program name as computed by the parser, copied onto
               the namespace so callers can prefix their own messages
    '''
    summary = (
        '\n'
        'From a directory (FI) of Finnish JSON documents extracted from Yle\n'
        'API, extract Selkouutiset (by publisher="Yle Selkouutiset") to a\n'
        'new output directory. To each input file FI/yyyy/mm/DDDD.json,\n'
        'where DDDD are a four-digit counter, corresponds an output file\n'
        'DIR/yyyy/mm/DDDD.json, where DIR is the new output directory. The\n'
        'output file is produced even when it happens to contain no selko\n'
        'documents.\n'
    )
    cli = ArgumentParser(description=summary)
    cli.add_argument(
        '--out', '-o',
        metavar='dir',
        default='selko',
        help='\noutput directory ("selko") must not yet exist\n',
    )
    cli.add_argument(
        'indir',
        nargs='?',
        default='fi',
        help='input directory ("fi") must exist',
    )

    parsed = cli.parse_args()
    # Remember the program name alongside the parsed values.
    parsed.prog = cli.prog
    return parsed
def _is_selko(datum):
    '''Return True when datum has the shape
    { publisher: { name: 'Yle Selkouutiset' ... } ... }.

    A document with no usable publisher record cannot be a selko
    document, so it is reported as False instead of raising.
    '''
    if not isinstance(datum, dict):
        return False
    publisher = datum.get('publisher')
    if not isinstance(publisher, dict):
        return False
    return publisher.get('name') == 'Yle Selkouutiset'


def main(args):
    '''Filter every INDIR/????/??/????.json into OUT/????/??/????.json.

    args must provide three attributes:
      indir -- existing input directory
      out   -- output directory; it is created here and must not exist
      prog  -- program name used as a prefix in error messages

    Each output file is the input document with data['data'] reduced
    to the selko documents and data['meta']['selko'] set to their
    count. An output file is written even when no document remains.

    Raises SystemExit(1), after a message on stderr, when the input
    directory is missing or the output directory cannot be created.
    '''
    if not os.path.isdir(args.indir):
        print('{}: error: input directory does not exist: {}'
              .format(args.prog, args.indir), file=sys.stderr)
        sys.exit(1)

    try:
        # no exist_ok: refusing to reuse an existing directory is intended
        os.makedirs(args.out)
    except OSError as exn:
        print('{}: error: could not make output directory: {}'
              .format(args.prog, args.out), file=sys.stderr)
        print(exn, file=sys.stderr)
        sys.exit(1)

    # Escape the directory name so that glob metacharacters in it
    # ([, ], *, ?) are matched literally; only the yyyy/mm/DDDD.json
    # tail is meant to be a pattern.
    pattern = os.path.join(glob.escape(args.indir), '????/??/????.json')

    for inf in glob.iglob(pattern):
        with open(inf, encoding='UTF-8') as ins:
            data = json.load(ins)

        # delete non-responsive documents, keeping those where
        # publisher name is 'Yle Selkouutiset'
        data['data'] = [datum for datum in data['data'] if _is_selko(datum)]

        # augment metadata with the number of remaining documents,
        # leaving original harvesting metadata alone
        data['meta']['selko'] = len(data['data'])

        # use the ????/??/????.json part of inf in ouf; relpath does
        # this without assuming which path separator glob returned
        ouf = os.path.join(args.out, os.path.relpath(inf, args.indir))
        os.makedirs(os.path.dirname(ouf), exist_ok=True)
        with open(ouf, 'w', encoding='UTF-8') as ous:
            json.dump(data, ous,
                      check_circular=False,
                      ensure_ascii=False,
                      sort_keys=True,
                      indent=4)
# Run the extraction only when executed as a script, not when imported.
if __name__ == '__main__':
    cli_args = parseargs()
    main(cli_args)