forked from ivantor0/corpora-manipulation
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse.py
72 lines (67 loc) · 3.11 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import argparse
from parallel_error_corpora import *
from realec_brat_to_patch_list import *
from realec_patch_sentencize import *
def main(args):
nlp = spacy.load('en_core_web_sm', disable=['ner', 'attribute_ruler', 'lemmatizer'])
nlp.add_pipe('sentencizer_boundaries', before="parser")
all_ann_files = glob.glob(f'{args.input_dir}/**/*.ann', recursive=True)
error = args.error_type
realec_entries = []
textpatch_format = False
if error: textpatch_format = True;
for path in tqdm(all_ann_files):
fid = path[:-3] + "txt"
essay_id = path[:-3].split("\\")[-1][:-1]
try:
patch = ann_to_patchlist(path, textpatch_format=textpatch_format)
if patch:
with open(fid, "r", encoding="utf-8") as intxt:
text = intxt.read()
if error:
text, patch = filter_patch(text, patch, error)
if not patch: continue
if args.sentencize:
upd = sentencize_patch(nlp, text, patch)
for key, value in upd.items():
realec_entries.append({"id": essay_id,
"text": key,
"patch": value})
else:
realec_entries.append({"id": essay_id,
"text": text,
"patch": patch})
except:
print(f"File does not exist: {fid}")
if realec_entries:
realec_df = pd.DataFrame(realec_entries)
filename = "realec_df"
if error:
filename += f"_{error}"
if args.parallel:
filename += "_parallel"
realec_df = list_to_corpus_df_realec(realec_entries)
if error:
realec_df = realec_df.drop(["corrections_num"], axis=1)
realec_df = realec_df.sort_values(by=['id']).reset_index(drop=True)
if args.sentencize:
filename += "_sentencized"
os.makedirs(f"{args.output_dir}", exist_ok=True)
realec_df.to_pickle(f"{args.output_dir}/{filename}.pickle")
else:
print("No erroneous entites found")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Parse parallel data"
)
parser.add_argument("-error_type", type=str, default="", help=f'Type of errors; allowed:\n {allowed_error_types}')
parser.add_argument("-parallel", action=argparse.BooleanOptionalAction,
help="parse parallel data or not (error spans only by default)")
parser.add_argument("--sentencize", action=argparse.BooleanOptionalAction,
help="sentencize data or not")
parser.add_argument("--input_dir", type=str, default="realec_data",
help="Input directory (default: %(default)s)")
parser.add_argument("--output_dir", type=str, default="realec_parallel",
help="Output directory (default: %(default)s)")
args = parser.parse_args()
main(args)