-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpymajka_test.py
executable file
·209 lines (168 loc) · 6.81 KB
/
pymajka_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
#!/usr/bin/python
# -*- coding: utf-8 -*-
# pylint: disable=R0904
""" Unit test suite for pymajka """
import unittest
import pymajka
import pexpect
class TestPyMajka(unittest.TestCase):
    """ Unit test suite for pymajka.Majka

    It requires an access to majka and to the dictionaries named by the
    ``MAJKA_DICT*`` constants below. Testing attempts to cover all major
    situations when this class can be used, including using several tokens
    in one "connection".
    """

    # Dictionary files handed to the pymajka.Majka constructor by the tests.
    MAJKA_DICT = "majka/majka.w-lt"
    MAJKA_DICT_W = "majka/majka.w"
    MAJKA_DICT_LW = "majka/majka.l-w"

    def test_constructor_basic(self):
        """ Test basic constructor with default dictionary type """
        pymajka.Majka(self.MAJKA_DICT)

    def test_constructor_wl(self):
        """ Test basic constructor with given dictionary type """
        pymajka.Majka(self.MAJKA_DICT, "wl")

    # The ``notest_`` prefix keeps the default unittest loader (which only
    # collects methods whose name starts with ``test``) from running the two
    # methods below; they are kept in the file but are effectively disabled.
    def notest_run_noexec(self):
        """ Test attempt to run file that exists but is not executable """
        with self.assertRaises(pexpect.ExceptionPexpect):
            pymajka.Majka(["/etc/passwd"])

    def notest_run_noexist(self):
        """ Test attempt to run file that does not exist """
        with self.assertRaises(pexpect.ExceptionPexpect):
            pymajka.Majka(["/no/such/file"])

    def test_raw_pes(self):
        """ Check if we obtain same raw output as expected for token "pes" """
        majka = pymajka.Majka(self.MAJKA_DICT)
        expected = [u'pes:k1gMnSc1', u'peso:k1gNnPc2']
        self.assertEqual(expected, majka.get_raw(u"pes"))

    def test_tuple_lt_pes(self):
        """ Check if we obtain formatted tuples as expected for token "pes" """
        majka = pymajka.Majka(self.MAJKA_DICT)
        result = majka.get_tuple(u"pes")
        self.assertEqual(2, len(result))
        # Only the second analysis (lemma, tag) is inspected here.
        self.assertEqual("peso", result[1][0])
        self.assertEqual("k1gNnPc2", result[1][1])

    def test_raw_dub(self):
        """ Check if we obtain same raw output as expected for token "dub" """
        majka = pymajka.Majka(self.MAJKA_DICT)
        expected = [u"dub:k1gInSc1", u"dub:k1gInSc4"]
        self.assertEqual(expected, majka.get_raw(u"dub"))

    def test_tuple_lt_dub(self):
        """ Check if we obtain formatted tuples as expected for token "dub" """
        majka = pymajka.Majka(self.MAJKA_DICT)
        result = majka.get_tuple(u"dub")
        self.assertEqual(2, len(result))
        self.assertEqual("dub", result[0][0])
        self.assertEqual("k1gInSc1", result[0][1])

    def test_raw_xyz(self):
        """ Test raw results of analyzing token that does not exist in Czech database """
        majka = pymajka.Majka(self.MAJKA_DICT)
        self.assertEqual([], majka.get_raw(u"xyz"))

    def test_tuple_lt_xyz(self):
        """ Test formatted tuples of analyzing token that does not exist in Czech database """
        majka = pymajka.Majka(self.MAJKA_DICT)
        result = majka.get_tuple(u"xyz")
        self.assertEqual(0, len(result))

    def test_tuple_w_pes(self):
        """ Check if we obtain formatted tuples as expected for token "pes" with different dictionary type """
        majka = pymajka.Majka(self.MAJKA_DICT_LW, "w")
        result = majka.get_tuple(u"pes")
        self.assertEqual(15, len(result))
        self.assertEqual("psa", result[1][0])

    def test_multiple(self):
        """ Test obtaining several analyses in one connection """
        majka = pymajka.Majka(self.MAJKA_DICT)
        # The first query yields nothing; the second one, issued on the same
        # object, must still return the regular analyses.
        result = majka.get_tuple(u"xyz")
        self.assertEqual(0, len(result))
        result = majka.get_tuple(u"dub")
        self.assertEqual(2, len(result))
        self.assertEqual("k1gInSc1", result[0][1])

    def test_diacritics(self):
        """ Test non-ascii string that is not a unicode literal - TypeError is expected """
        majka = pymajka.Majka(self.MAJKA_DICT)
        # NOTE(review): the literal below has no ``u`` prefix, so this test
        # presumably relies on Python 2, where it is a byte string - confirm
        # before running the suite under Python 3.
        with self.assertRaises(TypeError):
            majka.get_tuple("Ruská")

    def test_diacritics_unicode(self):
        """ Test non-ascii unicode string; it only has to be processed without an exception """
        majka = pymajka.Majka(self.MAJKA_DICT)
        majka.get_tuple(u"Ruská")

    def test_colon(self):
        """ Test colon which is normally a separator """
        majka = pymajka.Majka(self.MAJKA_DICT)
        result = majka.get_tuple(u":")
        self.assertEqual(0, len(result))

    def test_unicode_lemma(self):
        """ Test lemma returned for unicode string """
        majka = pymajka.Majka(self.MAJKA_DICT)
        result = majka.get_tuple(u"život")
        self.assertEqual(u"život", result[0][0])

    def test_preprocess_token(self):
        """ Test preprocessing of tokens e.g. unify i/y """

        class Ypsilon(pymajka.Majka):
            """ Majka variant that rewrites every y/Y/ý/Ý to i/I/í/Í in the token """

            def preprocess(self, token):
                return (token.replace(u"y", u"i")
                        .replace(u"Y", u"I")
                        .replace(u"ý", u"í")
                        .replace(u"Ý", u"Í"))

        majka = Ypsilon(self.MAJKA_DICT)
        self.assertEqual(u"pivo", majka.get_tuple(u"pyvo")[0][0])

    def test_capitalization(self):
        """ Test token which has first letter capital """

        class Capit(pymajka.Majka):
            """ Majka variant that capitalizes results of capitalized tokens """

            def postprocess(self, token, results):
                # We expect that it is used only with "w" dictionaries
                if token == token.capitalize():
                    return [(result[0].capitalize(),) for result in results]
                return results

        majka = Capit(self.MAJKA_DICT_W, "w")
        result = majka.get_tuple(u"Žirafy")
        self.assertEqual(u"Žirafy", result[0][0])

    def test_uppercase(self):
        """ Test token which is written in uppercase only """

        class Upper(pymajka.Majka):
            """ Majka variant that uppercases results of all-uppercase tokens """

            def postprocess(self, token, results):
                # We expect that it is used only with "w" dictionaries
                if token == token.upper():
                    return [(result[0].upper(),) for result in results]
                return results

        majka = Upper(self.MAJKA_DICT_W, "w")
        result = majka.get_tuple(u"ŽIRAFY")
        self.assertEqual(u"ŽIRAFY", result[0][0])

    def test_colon_in_text(self):
        """ Test token that contains colons (a URL); no analysis is expected """
        majka = pymajka.Majka(self.MAJKA_DICT)
        result = majka.get_tuple(u"http://www.streettrutnov.cz")
        self.assertEqual(0, len(result))

    def test_colon_w(self):
        """ Test colon token with the "w" dictionary type; no analysis is expected """
        majka = pymajka.Majka(self.MAJKA_DICT_W, "w")
        result = majka.get_tuple(u":")
        self.assertEqual(0, len(result))

    def test_colon_wl(self):
        """ Test colon token with the dictionary type left at its default """
        # NOTE(review): the name suggests the "wl" type, but no type is passed
        # explicitly, which makes this identical to test_colon - confirm that
        # the constructor default is the intended type.
        majka = pymajka.Majka(self.MAJKA_DICT)
        result = majka.get_tuple(u":")
        self.assertEqual(0, len(result))

    def test_colon_strange_char(self):
        """ Test a whitespace-only token; it only has to be processed without an exception """
        majka = pymajka.Majka(self.MAJKA_DICT)
        # NOTE(review): the literal shows up as a plain space here while the
        # test name hints at an unusual character - verify against the
        # original file that no special character was lost.
        majka.get_tuple(u" ")
class TestPyMajkaRepair(unittest.TestCase):
    """ Unit test suite for pymajka.MajkaRepair

    The tests use the "w" dictionary and, where given, the shared library
    named by ``MAJKA_Y_PATH``.
    """

    # NOTE(review): MAJKA_PATH is not referenced by any test in this class;
    # kept in case code outside this file reads it.
    MAJKA_PATH = "majka/majka"
    MAJKA_Y_PATH = "majka/libmajka-marx-y.so"
    MAJKA_DICT_W = "majka/majka.w"

    def test_constructor(self):
        """ Test constructor called with a dictionary and the library path """
        # NOTE(review): the library path is passed positionally here but as
        # ``library=`` in test_preprocess_token - confirm that the second
        # positional parameter of MajkaRepair really is the library.
        pymajka.MajkaRepair(self.MAJKA_DICT_W, self.MAJKA_Y_PATH)

    def test_upper(self):
        """ Test token which is written in uppercase only """
        majka = pymajka.MajkaRepair(self.MAJKA_DICT_W)
        result = majka.get_tuple(u"ŽIRAFY")
        self.assertEqual(u"ŽIRAFY", result[0][0])

    def test_capitalization(self):
        """ Test token which has first letter capital """
        majka = pymajka.MajkaRepair(self.MAJKA_DICT_W)
        result = majka.get_tuple(u"Žirafy")
        self.assertEqual(u"Žirafy", result[0][0])

    def test_preprocess_token(self):
        """ Test preprocessing of tokens e.g. unify i/y """

        class Ypsilon(pymajka.MajkaRepair):
            """ MajkaRepair variant that rewrites every y/Y/ý/Ý to i/I/í/Í in the token """

            def preprocess(self, token):
                return (token.replace(u"y", u"i")
                        .replace(u"Y", u"I")
                        .replace(u"ý", u"í")
                        .replace(u"Ý", u"Í"))

        majka = Ypsilon(self.MAJKA_DICT_W, library=self.MAJKA_Y_PATH)
        # Both the correct spelling and the i/y-confused one are expected to
        # come back as the correctly spelled, capitalized word.
        self.assertEqual(u"Žirafy", majka.get_tuple(u"Žirafy")[0][0])
        self.assertEqual(u"Žirafy", majka.get_tuple(u"Žirafi")[0][0])
# Run the whole test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()