-
Notifications
You must be signed in to change notification settings - Fork 1
/
squad_fi.py
104 lines (89 loc) · 4.17 KB
/
squad_fi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import json
import datasets
from datasets.tasks import QuestionAnsweringExtractive
# Human-readable dataset summary; passed as `description` to
# datasets.DatasetInfo in SquadV2._info().
# NOTE(review): the text starts mid-sentence ("combines ...") and describes
# SQuAD2.0 in general — presumably a leading "SQuAD2.0" subject was dropped;
# confirm the intended wording before changing this runtime string.
_DESCRIPTION = """\
combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers
to look similar to answerable ones. To do well on SQuAD2.0, systems must not only answer questions when possible, but
also determine when no answer is supported by the paragraph and abstain from answering.
"""
class SquadV2Config(datasets.BuilderConfig):
    """Builder configuration for the SQuAD v2 loader.

    Defines no options of its own: every keyword argument is handed
    unchanged to ``datasets.BuilderConfig``.
    """

    def __init__(self, **kwargs):
        """Create the configuration.

        Args:
            **kwargs: keyword arguments forwarded to the parent
                ``datasets.BuilderConfig`` constructor.
        """
        super().__init__(**kwargs)
class SquadV2(datasets.GeneratorBasedBuilder):
    """Dataset builder for SQuAD v2 style question answering data in Finnish.

    Reads SQuAD-format JSON files and yields one example per question, with
    the fields ``id``, ``title``, ``context``, ``question`` and ``answers``.
    """

    # Characters trimmed from both ends of every answer text.
    _ANSWER_STRIP_CHARS = " .,-:"

    BUILDER_CONFIGS = [
        SquadV2Config(
            name="squad_fi_clean",
            version=datasets.Version("5.0.0"),
            description="SQuAD plain text version 2 in Finnish",
        ),
    ]

    def _info(self):
        """Return the ``datasets.DatasetInfo`` describing features and task."""
        return datasets.DatasetInfo(
            # Description shown on the dataset page.
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "id": datasets.Value("string"),
                    "title": datasets.Value("string"),
                    "context": datasets.Value("string"),
                    "question": datasets.Value("string"),
                    # Parallel lists: answers["text"][i] belongs to
                    # answers["answer_start"][i].
                    "answers": datasets.features.Sequence(
                        {
                            "text": datasets.Value("string"),
                            "answer_start": datasets.Value("int32"),
                        }
                    ),
                }
            ),
            # No (input, target) tuple is declared for as_supervised=True.
            supervised_keys=None,
            # Homepage of the dataset for documentation.
            homepage="https://turkunlp.org/",
            task_templates=[
                QuestionAnsweringExtractive(
                    question_column="question",
                    context_column="context",
                    answers_column="answers",
                )
            ],
        )

    def _split_generators(self, dl_manager):
        """Return the train and validation ``SplitGenerator`` objects.

        Args:
            dl_manager: download manager supplied by the ``datasets``
                framework; unused here because the files are read from
                hard-coded relative paths rather than downloaded.
        """
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={"filepath": "squad2_fi/train-v2.0.json"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={"filepath": "squad2_fi/dev-v2.0.json"},
            ),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(id, example)`` pairs from a SQuAD-format JSON file.

        Answer texts are trimmed of the characters in
        ``_ANSWER_STRIP_CHARS`` on both ends. ``answer_start`` is advanced by
        the number of characters removed from the front, so that it keeps
        pointing at the first character of the emitted text inside the
        (unmodified) context.

        Args:
            filepath: path of the JSON file to read (UTF-8).

        Yields:
            Tuples of the question id and a dict with the keys ``title``,
            ``context``, ``question``, ``id`` and ``answers``.
        """
        strip_chars = self._ANSWER_STRIP_CHARS
        with open(filepath, encoding="utf-8") as f:
            squad = json.load(f)

        for article in squad["data"]:
            title = article.get("title", "")
            for paragraph in article["paragraphs"]:
                # Do not strip leading blank spaces from the context
                # (GH-2585): answer offsets index into it as-is.
                context = paragraph["context"]
                for qa in paragraph["qas"]:
                    id_ = qa["id"]
                    answer_starts = []
                    answer_texts = []
                    for answer in qa["answers"]:
                        raw_text = answer["text"]
                        left_trimmed = raw_text.lstrip(strip_chars)
                        # Shift the offset past whatever was cut off the front.
                        removed_prefix = len(raw_text) - len(left_trimmed)
                        answer_starts.append(answer["answer_start"] + removed_prefix)
                        answer_texts.append(left_trimmed.rstrip(strip_chars))

                    # Features currently used are "context", "question", and
                    # "answers"; the others are kept for future expansion.
                    yield id_, {
                        "title": title,
                        "context": context,
                        "question": qa["question"],
                        "id": id_,
                        "answers": {
                            "answer_start": answer_starts,
                            "text": answer_texts,
                        },
                    }