jsnli.py
import logging
import os

import datasets as ds

logger = logging.getLogger(__name__)
_CITATION = """\
- Takumi Yoshikoshi, Daisuke Kawahara, and Sadao Kurohashi. 2020. Multilingualization of a natural language inference dataset using machine translation. In Proceedings of the 244th Meeting of the IPSJ Special Interest Group on Natural Language Processing (2020.7.3). (In Japanese)
- Samuel R. Bowman, Gabor Angeli, Christopher Potts, and Christopher D. Manning. 2015. A large annotated corpus for learning natural language inference. In Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing (EMNLP).
- Peter Young, Alice Lai, Micah Hodosh, and Julia Hockenmaier. 2014. From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions. Transactions of the Association for Computational Linguistics, 2:67-78.
"""
_DESCRIPTION = """\
== Japanese SNLI (JSNLI) Dataset ==
A natural language inference dataset created by translating the SNLI corpus into Japanese.
The training data was built by machine-translating the original training set and then filtering it automatically.
The evaluation data was filtered by two-stage crowdsourcing, checking (1) that each sentence reads as natural Japanese and (2) that the label after translation still matches the original label.
"""
_HOMEPAGE = "https://nlp.ist.i.kyoto-u.ac.jp/?%E6%97%A5%E6%9C%AC%E8%AA%9ESNLI%28JSNLI%29%E3%83%87%E3%83%BC%E3%82%BF%E3%82%BB%E3%83%83%E3%83%88"
_LICENSE = """\
CC BY-SA 4.0
"""
_URL = "https://nlp.ist.i.kyoto-u.ac.jp/DLcounter/lime.cgi?down=https://nlp.ist.i.kyoto-u.ac.jp/nl-resource/JSNLI/jsnli_1.1.zip&name=JSNLI.zip"


class JSNLIDataset(ds.GeneratorBasedBuilder):
    VERSION = ds.Version("1.1.0")  # type: ignore

    BUILDER_CONFIGS = [
        ds.BuilderConfig(
            name="with-filtering",
            version=VERSION,  # type: ignore
            description="SNLI training data machine-translated into Japanese and then filtered with a BLEU score threshold of 0.1. Training BERT on this data achieved 93.0% accuracy. (533,005 pairs)",
        ),
        ds.BuilderConfig(
            name="without-filtering",
            version=VERSION,  # type: ignore
            description="SNLI training data machine-translated into Japanese, with no filtering applied. (548,014 pairs)",
        ),
    ]

    def _info(self) -> ds.DatasetInfo:
        # JSNLI keeps the three-way SNLI label scheme.
        features = ds.Features(
            {
                "premise": ds.Value("string"),
                "hypothesis": ds.Value("string"),
                "label": ds.ClassLabel(
                    names=["entailment", "neutral", "contradiction"]
                ),
            }
        )
        return ds.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager: ds.DownloadManager):
        # Download and unpack the JSNLI archive; the zip contains a
        # versioned directory (e.g. jsnli_1.1).
        jsnli_base_dir = dl_manager.download_and_extract(_URL)
        jsnli_dir = os.path.join(
            jsnli_base_dir, f"jsnli_{self.VERSION.major}.{self.VERSION.minor}"  # type: ignore
        )

        train_w_filtering_path = os.path.join(jsnli_dir, "train_w_filtering.tsv")
        train_wo_filtering_path = os.path.join(jsnli_dir, "train_wo_filtering.tsv")
        dev_path = os.path.join(jsnli_dir, "dev.tsv")

        # Match the config name exactly; a substring test is fragile because
        # "with-filtering" and "without-filtering" share a prefix.
        if self.config.name == "with-filtering":
            tng_path = train_w_filtering_path
        elif self.config.name == "without-filtering":
            tng_path = train_wo_filtering_path
        else:
            raise ValueError(f"Invalid config name: {self.config.name}")

        tng_gen_kwargs = {
            "tsv_path": tng_path,
        }
        val_gen_kwargs = {
            "tsv_path": dev_path,
        }

        # Both configs share the crowdsourced dev set as the validation split.
        return [
            ds.SplitGenerator(
                name=ds.Split.TRAIN,  # type: ignore
                gen_kwargs=tng_gen_kwargs,  # type: ignore
            ),
            ds.SplitGenerator(
                name=ds.Split.VALIDATION,  # type: ignore
                gen_kwargs=val_gen_kwargs,  # type: ignore
            ),
        ]

    def _generate_examples(  # type: ignore
        self,
        tsv_path: str,
    ):
        # Each line of the distributed TSV files is:
        # label<TAB>premise<TAB>hypothesis
        with open(tsv_path, "r", encoding="utf-8") as rf:
            for sentence_id, line in enumerate(rf):
                label, premise, hypothesis = line.rstrip("\n").split("\t")
                example_dict = {
                    "premise": premise,
                    "hypothesis": hypothesis,
                    "label": label,
                }
                yield sentence_id, example_dict
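

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original loader): load this
    # script with `datasets` and inspect a training example. Assumes a
    # `datasets` release that still supports script-based datasets; newer
    # versions may require trust_remote_code=True or drop script loading.
    dataset = ds.load_dataset(__file__, name="with-filtering")
    print(dataset)  # split names and sizes
    print(dataset["train"][0])  # one premise/hypothesis/label record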