-
Notifications
You must be signed in to change notification settings - Fork 191
/
test_sentence_split_mapper.py
87 lines (72 loc) · 3.05 KB
/
test_sentence_split_mapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import unittest
from data_juicer.core.data import NestedDataset as Dataset
from data_juicer.ops.mapper.sentence_split_mapper import SentenceSplitMapper
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
class SentenceSplitMapperTest(DataJuicerTestCaseBase):
    """Checks SentenceSplitMapper output on en, fr, pt and es samples."""

    def _run_helper(self, op, samples):
        """Apply ``op.process`` to ``samples`` and compare text to target.

        The samples are loaded into a dataset, mapped through ``op.process``
        with ``batch_size=2``, and every resulting row must have its
        ``'text'`` field equal to its ``'target'`` field.
        """
        mapped = Dataset.from_list(samples).map(op.process, batch_size=2)
        for row in mapped:
            self.assertEqual(row['text'], row['target'])

    def test_en_text(self):
        # English: the expected text has a newline between the two sentences.
        source = (
            'Smithfield employs 3,700 people at its plant in Sioux Falls, '
            'South Dakota. The plant slaughters 19,500 pigs a day — 5 '
            'percent of U.S. pork.'
        )
        expected = (
            'Smithfield employs 3,700 people at its plant in Sioux Falls, '
            'South Dakota.\nThe plant slaughters 19,500 pigs a day — 5 '
            'percent of U.S. pork.'
        )
        samples = [{'text': source, 'target': expected}]
        self._run_helper(SentenceSplitMapper('en'), samples)

    def test_fr_text(self):
        # French: the expected text has a newline between the two sentences.
        source = (
            'Smithfield emploie 3,700 personnes dans son usine de'
            ' Sioux Falls, dans le Dakota du Sud. L\'usine '
            'abat 19 500 porcs par jour, soit 5 % du porc américain.'
        )
        expected = (
            'Smithfield emploie 3,700 personnes dans son usine de'
            ' Sioux Falls, dans le Dakota du Sud.\nL\'usine '
            'abat 19 500 porcs par jour, soit 5 % du porc américain.'
        )
        samples = [{'text': source, 'target': expected}]
        self._run_helper(SentenceSplitMapper('fr'), samples)

    def test_pt_text(self):
        # Portuguese: the expected text has a newline between the sentences.
        source = (
            'A Smithfield emprega 3.700 pessoas em sua fábrica em '
            'Sioux Falls, Dakota do Sul. A fábrica '
            'abate 19.500 porcos por dia – 5% da carne suína dos EUA.'
        )
        expected = (
            'A Smithfield emprega 3.700 pessoas em sua fábrica em '
            'Sioux Falls, Dakota do Sul.\nA fábrica abate 19.500 '
            'porcos por dia – 5% da carne suína dos EUA.'
        )
        samples = [{'text': source, 'target': expected}]
        self._run_helper(SentenceSplitMapper('pt'), samples)

    def test_es_text(self):
        # Spanish: the expected text has a newline between the two sentences.
        source = (
            'Smithfield emplea a 3.700 personas en su planta de '
            'Sioux Falls, Dakota del Sur. La planta sacrifica 19.500 '
            'cerdos al día, el 5 por ciento de la carne de cerdo de EE.'
        )
        expected = (
            'Smithfield emplea a 3.700 personas en su planta de Sioux '
            'Falls, Dakota del Sur.\nLa planta sacrifica 19.500 cerdos '
            'al día, el 5 por ciento de la carne de cerdo de EE.'
        )
        samples = [{'text': source, 'target': expected}]
        self._run_helper(SentenceSplitMapper('es'), samples)
if __name__ == '__main__':
    # Run the test cases in this module when the file is executed directly.
    unittest.main()