-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: test_raw_text_preprocessor.py
67 lines (54 loc) · 2.65 KB
/
test_raw_text_preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
from tempfile import TemporaryDirectory, NamedTemporaryFile
from unittest import TestCase, main
import pandas as pd
from transformers import BertTokenizer
import data.raw_text_preprocessor as rp
# Tiny two-sentence corpus used as the fixture input throughout these tests.
# Note the deliberate leading/trailing whitespace and blank-ish second line,
# which read_and_strip_lines is expected to clean up.
TOY_RAW_TEXT_CORPUS = "This document is a summary.\n How does it work? "

# The corpus after reading and stripping: one cleaned string per line.
TOY_RAW_TEXT_CORPUS_LINES = [
    "This document is a summary.",
    "How does it work?",
]

# Expected lower-cased BERT wordpiece tokenization of each stripped line.
TOY_RAW_TEXT_CORPUS_TOKENS = [
    ["this", "document", "is", "a", "summary", "."],
    ["how", "does", "it", "work", "?"],
]
class TestRawTextPreprocessor(TestCase):
    """Unit tests for the helpers and class in data.raw_text_preprocessor."""

    @classmethod
    def setUpClass(cls):
        # Load the tokenizer once for the whole class; reads from a local
        # cache directory so the tests do not hit the network.
        cls.tokenizer = BertTokenizer.from_pretrained(
            './model_cache/bert-base-uncased')

    def test_read_and_strip_lines(self):
        """read_and_strip_lines returns stripped, non-empty lines of a file."""
        # BUGFIX: suffix was 'txt' (no dot), so the temp file name ended in
        # e.g. 'abc123txt' rather than carrying a real .txt extension.
        with NamedTemporaryFile(suffix='.txt') as txt_file:
            txt_file.write(TOY_RAW_TEXT_CORPUS.encode("utf-8"))
            txt_file.seek(0)
            result_lines = rp.read_and_strip_lines(txt_file.name)
            self.assertEqual(result_lines, TOY_RAW_TEXT_CORPUS_LINES)

    def test_tokenize_lines(self):
        """tokenize_lines lower-cases and wordpiece-tokenizes each line."""
        self.assertEqual(
            rp.tokenize_lines(TOY_RAW_TEXT_CORPUS_LINES, self.tokenizer),
            TOY_RAW_TEXT_CORPUS_TOKENS)

    def _make_preprocessor(self, cache_dir):
        """Build a RawTextPreprocessor over the toy corpus, caching in cache_dir."""
        return rp.RawTextPreprocessor(
            TOY_RAW_TEXT_CORPUS_LINES, self.tokenizer,
            corpus_cache_path=cache_dir)

    def test_get_sentences(self):
        """get_sentences yields one token-list row per corpus line."""
        # BUGFIX: the original chdir'd into the TemporaryDirectory and never
        # returned, leaving the process cwd pointing at a deleted directory
        # after cleanup (and breaking the rmtree itself on Windows). Restore
        # the cwd before the context manager removes the directory.
        original_cwd = os.getcwd()
        with TemporaryDirectory() as tmp_dir:
            os.chdir(tmp_dir)
            try:
                preprocessor = self._make_preprocessor(tmp_dir)
                sentences = preprocessor.get_sentences()
                expected_sentences = pd.DataFrame({
                    'sentence': [['this', 'document', 'is', 'a', 'summary', '.'],
                                 ['how', 'does', 'it', 'work', '?']]})
                pd.testing.assert_frame_equal(expected_sentences, sentences)
            finally:
                os.chdir(original_cwd)

    def test_get_tagged_tokens(self):
        """get_tagged_tokens flattens tokens with placeholder, untagged senses."""
        # Same cwd-restoration fix as test_get_sentences.
        original_cwd = os.getcwd()
        with TemporaryDirectory() as tmp_dir:
            os.chdir(tmp_dir)
            try:
                preprocessor = self._make_preprocessor(tmp_dir)
                tokens = preprocessor.get_tagged_tokens()
                expected_tokens = pd.DataFrame({
                    'token': ['this', 'document', 'is', 'a', 'summary', '.', 'how',
                              'does', 'it', 'work', '?'],
                    'sense': ['this_SENSE', 'document_SENSE', 'is_SENSE', 'a_SENSE',
                              'summary_SENSE', '._SENSE', 'how_SENSE', 'does_SENSE',
                              'it_SENSE', 'work_SENSE', '?_SENSE'],
                    'tagged_sense': [False, False, False, False, False, False, False,
                                     False, False, False, False]})
                pd.testing.assert_frame_equal(expected_tokens, tokens)
            finally:
                os.chdir(original_cwd)
# Allow running this test module directly (python test_raw_text_preprocessor.py);
# unittest.main() discovers and runs the TestCase classes defined above.
if __name__ == '__main__':
    main()