This repository has been archived by the owner on Aug 9, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_labels.py
executable file
·69 lines (57 loc) · 2.22 KB
/
get_labels.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env python3
import re
import time
import subprocess
import os
import shutil
import json
import itertools
import pandas as pd
def load_wikis():
    """Return the distinct ``wiki_db`` values listed in the cutoffs CSV.

    Reads ``ores_rcfilters_cutoffs.csv`` from the ``data`` directory, resolved
    against the current working directory.  The values pass through a ``set``,
    so duplicates are dropped and the order of the returned list is undefined.
    """
    cutoffs_path = os.path.join('data', "ores_rcfilters_cutoffs.csv")
    cutoffs_table = pd.read_csv(cutoffs_path)
    distinct_wikis = set(cutoffs_table.wiki_db)
    return list(distinct_wikis)
def load_makefile():
    """Return the text of both editquality Makefiles as a single string.

    ``editquality/Makefile`` comes first, followed by a newline and then
    ``editquality/Makefile.manual``.  Both paths are resolved against the
    current working directory.
    """
    with open("editquality/Makefile",'r') as primary, \
            open("editquality/Makefile.manual",'r') as manual:
        sections = (primary.read(), manual.read())
    return '\n'.join(sections)
def grep_labelfile(wiki, makefile):
    """Pick the Makefile target that holds a wiki's human-labeled revisions.

    Searches ``makefile`` for rule lines of the form
    ``datasets/<wiki>.human_labeled_revisions.<N>k_<date>.json:`` and selects
    the target with the largest ``N``; ties on ``N`` are broken by the largest
    ``date``.  The outcome is also reported on stdout.

    Args:
        wiki: Name of the wiki; it is matched literally.
        makefile: Makefile text to search.

    Returns:
        The selected target path without the trailing colon, or ``None`` when
        no target matches or when several targets tie on both ``N`` and
        ``date``.
    """
    # The first group captures the target itself, so the result no longer
    # depends on what follows the colon (prerequisites, trailing whitespace,
    # a carriage return).  re.escape keeps metacharacters in the wiki name
    # literal, and \d+ guarantees the int() conversions below cannot fail.
    humanlabel_re_format = (
        r"(datasets/{0}\.human_labeled_revisions\.(\d+)k_(\d+)\.json):.*"
    )
    humanlabel_re = re.compile(humanlabel_re_format.format(re.escape(wiki)))

    # One (N, date, match) triple per candidate rule line.
    candidates = [
        (int(match.group(2)), int(match.group(3)), match)
        for match in humanlabel_re.finditer(makefile)
    ]
    if not candidates:
        print("found no matches for {0}".format(wiki))
        return None

    # Tuples compare lexicographically: largest N first, then latest date.
    best_key = max((n, date) for n, date, _ in candidates)
    best_matches = [
        match for n, date, match in candidates if (n, date) == best_key
    ]

    if len(best_matches) > 1:
        print("too many matches {1} for {0}".format(wiki, best_matches))
        return None

    chosen = best_matches[0]
    print("found match {0} for {1}".format(chosen, wiki))
    return chosen.group(1)
def _download_labels(label_file):
os.chdir("editquality")
try:
subprocess.call(["make",label_file])
except Exception as e:
print(e)
os.chdir("..")
def load_labels(label_file):
    """Open ``label_file`` under ``editquality/`` and return the file object.

    The file is opened with the default mode and is not closed here; closing
    it is left to the caller.
    """
    labels_path = "editquality/{0}".format(label_file)
    handle = open(labels_path)
    return handle
def download_labels(label_files):
    """Call ``_download_labels`` once per entry of ``label_files``, in order.

    ``label_files`` may be any iterable; it is consumed exactly once.
    """
    for target in label_files:
        _download_labels(target)
if __name__ == "__main__":
    # Script entry point: resolve one label-file target per wiki, then build
    # each resolved target.
    wikis = load_wikis()
    makefile = load_makefile()
    # Generator expression: each target is resolved lazily, at the moment
    # download_labels asks for it.
    label_files = (grep_labelfile(wiki, makefile) for wiki in wikis)
    download_labels(label_files)