-
Notifications
You must be signed in to change notification settings - Fork 1
/
process_raw_reports.py
93 lines (68 loc) · 2.8 KB
/
process_raw_reports.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
"""Methods for processing raw database reports"""
import logging
from argparse import ArgumentParser
from pathlib import Path
from typing import Iterable
# Directory holding both the raw input files and the processed report files
# (see `_r_path`, which builds every path relative to it).
REPORT_DIR = Path.home() / "public_html/r"
# Module-level logger; its handler and level are configured in `_main`.
log = logging.getLogger(__name__)
def _r_path(id: int, base: str = "raw") -> Path:
    """Build the filesystem location of a report file.

    The file name is the prefix immediately followed by the id and a
    ``.txt`` suffix, located directly inside `REPORT_DIR`.

    Args:
        id (int): The id number of the report.
        base (str, optional): The file name prefix. Defaults to "raw".

    Returns:
        Path: `REPORT_DIR` joined with ``"<base><id>.txt"``.
    """
    filename = f"{base}{id}.txt"
    return REPORT_DIR.joinpath(filename)
def _dump(id: int, out: Iterable[str]) -> None:
    """Write `out` as a newline-delimited file for the report numbered `id`.

    The entries are joined with single newlines (no trailing newline is
    added) and written to the "report"-prefixed file returned by `_r_path`.

    Args:
        id (int): The report id number that determines the output file.
        out (Iterable[str]): The entries to save, one per line.
    """
    target = _r_path(id, base="report")
    log.info("Dumping a result to '%s'", target)
    body = "\n".join(out)
    target.write_text(body)
def _title_sha1_rows(raw_id: int) -> Iterable[tuple]:
    """Yield `(title, sha1)` pairs from the tab-separated raw file `raw_id`.

    Args:
        raw_id (int): The id number of the raw file to read.

    Yields:
        tuple: The two tab-separated fields of each non-blank line.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields.
    """
    with _r_path(raw_id).open() as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (e.g. a stray trailing newline) carries no
                # data; skip it instead of failing on the unpacking below.
                continue
            title, sha1 = line.split("\t")
            yield title, sha1


def _report_13() -> list:
    """Collect titles present in raw5 and raw2 whose sha1 values differ.

    Source comment: files on enwp and commons that share a name but are not
    the same file.

    Returns:
        list: The matching titles, in the order they appear in raw2.
    """
    log.info("Processing report 13...")
    # raw5 is held in memory as a lookup table; raw2 is only streamed.
    sha1_by_title = dict(_title_sha1_rows(5))

    log.info("now processing the big file (raw2) for report13")
    return [
        title
        for title, sha1 in _title_sha1_rows(2)
        if title in sha1_by_title and sha1_by_title[title] != sha1
    ]


def _report_16() -> set:
    """Collect the lines of raw4 that do not occur in raw1.

    Source comment: orphaned file talk pages on enwp which don't belong to a
    commons page.

    Returns:
        set: The whitespace-stripped lines of raw4 that are absent from raw1.
    """
    log.info("Processing report 16...")
    with _r_path(4).open() as f:
        candidates = {line.strip() for line in f}

    log.info("now processing the big file (raw1) for report16")
    with _r_path(1).open() as f:
        # Stream the big file; only the (smaller) candidate set is kept.
        candidates.difference_update(line.strip() for line in f)
    return candidates


def _main() -> None:
    """Main driver, to be run when this module is invoked directly.

    Parses the report ids from the command line, configures logging, and for
    each supported id builds the report and saves it with `_dump`. With no
    ids the help text is printed and nothing else happens. Ids without a
    known processor are reported with a warning and skipped.
    """
    cli_parser = ArgumentParser(description="CLI for processing raw reports")
    cli_parser.add_argument('report_ids', type=int, nargs='*', help='the report ids to process down from raw data')
    args = cli_parser.parse_args()

    if not args.report_ids:
        cli_parser.print_help()
        return

    # configure logging
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter("{asctime}: {levelname}: {message}", "%Y-%m-%d %H:%M:%S", "{"))
    log.addHandler(handler)
    log.setLevel("DEBUG")

    # Maps each supported report id to the function that computes its lines.
    processors = {13: _report_13, 16: _report_16}

    for report_id in args.report_ids:
        processor = processors.get(report_id)
        if processor is None:
            log.warning("No processing is defined for report %d, skipping", report_id)
            continue
        _dump(report_id, processor())
# Run the CLI driver only when executed as a script, not when imported.
if __name__ == "__main__":
    _main()