Skip to content

Commit

Permalink
Bug fixes: now passes all test cases from the original Sastrawi …
Browse files Browse the repository at this point in the history
…functional test
  • Loading branch information
har07 committed Jan 10, 2016
1 parent a153716 commit 3a7ca0e
Show file tree
Hide file tree
Showing 6 changed files with 22 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ class PrecedenceAdjustmentSpecification(object):

def isSatisfiedBy(self, value):
regexRules = [
r'/^be(.*)lah$/',
r'/^be(.*)an$/',
r'/^me(.*)i$/',
r'/^di(.*)i$/',
r'/^pe(.*)i$/',
r'/^ter(.*)i$/',
r'^be(.*)lah$',
r'^be(.*)an$',
r'^me(.*)i$',
r'^di(.*)i$',
r'^pe(.*)i$',
r'^ter(.*)i$',
]

for rule in regexRules:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,5 @@ def visit(self, context):

def remove(self, word):
    """Remove a trailing inflectional possessive pronoun from ``word``.

    The pronouns handled are ``ku``, ``mu`` and ``nya``, optionally
    preceded by one or more hyphens (``-ku``, ``-mu``, ``-nya``).

    :param word: the word to strip.
    :return: ``word`` without the trailing pronoun, or ``word`` unchanged
        when it does not end in one of the pronouns.
    """
    # The pattern must be anchored with ``$``: an unanchored pattern removes
    # the first occurrence anywhere in the word (e.g. the leading "mu" of
    # "mukanya") instead of the suffix.
    return re.sub(r'-*(ku|mu|nya)$', '', word, count=1)

4 changes: 2 additions & 2 deletions src/Sastrawi/Stemmer/Filter/TextNormalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

def normalizeText(text):
    """Normalize ``text`` before stemming.

    The text is lower-cased, every character other than ``a-z``, ``0-9``,
    space and hyphen is replaced by a space, runs of spaces are collapsed
    to a single space, and leading/trailing whitespace is stripped.

    :param text: the raw input string.
    :return: the normalized string.
    """
    result = text.lower()
    # Python's ``re`` takes the bare pattern: PHP-style ``/.../im`` delimiters
    # would be matched as literal characters. No flags are passed either:
    # the input is already lower-cased, and IGNORECASE would let the
    # non-ASCII letters 'ı' and 'ſ' match ``[a-z]`` and slip through.
    result = re.sub(r'[^a-z0-9 -]', ' ', result)
    result = re.sub(r' +', ' ', result)

    return result.strip()

Expand Down
5 changes: 3 additions & 2 deletions src/Sastrawi/Stemmer/Stemmer.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,14 +62,15 @@ def stemPluralWord(self, plural):
suffixes = ['ku', 'mu', 'nya', 'lah', 'kah', 'tah', 'pun']
matches = re.match(r'^(.*)-(.*)$', words[0])
if suffix in suffixes and matches:
words[1] = words[1] + '-' + suffix
words[0] = matches.group(1)
words[1] = matches.group(2) + '-' + suffix

#berbalas-balasan -> balas
rootWord1 = self.stemSingularWord(words[0])
rootWord2 = self.stemSingularWord(words[1])

#meniru-nirukan -> tiru
if words[1] in self.dictionary and rootWord2 == words[1]:
if not self.dictionary.contains(words[1]) and rootWord2 == words[1]:
rootWord2 = self.stemSingularWord('me' + words[1])

if rootWord1 == rootWord2:
Expand Down
12 changes: 6 additions & 6 deletions tests/FunctionalTests/Stemmer/StemmerTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,9 @@ def getTestData(self):
data.append(['jualan', 'jual'])

# combination of suffixes
#data.append(['bukumukah', 'buku']) gagal karena -ku dianggap suffix dan dihilangkan
data.append(['bukumukah', 'buku']) #gagal karena -ku dianggap suffix dan dihilangkan
data.append(['miliknyalah', 'milik'])
#data.append(['kulitkupun', 'kulit']) gagal karena -ku dianggap suffix dan dihilangkan
data.append(['kulitkupun', 'kulit']) #gagal karena -ku dianggap suffix dan dihilangkan
data.append(['berikanku', 'beri'])
data.append(['sakitimu', 'sakit'])
data.append(['beriannya', 'beri'])
Expand Down Expand Up @@ -243,12 +243,12 @@ def getTestData(self):
data.append(['mengkritik', 'kritik'])

# CS adjusting rule precedence
#data.append(['bersekolah', 'sekolah']) gagal sekolah -> seko why?
data.append(['bersekolah', 'sekolah']) #gagal sekolah -> seko why?
data.append(['bertahan', 'tahan'])
#data.append(['mencapai', 'capai']) gagal mencapai -> capa
data.append(['mencapai', 'capai']) #gagal mencapai -> capa
data.append(['dimulai', 'mulai'])
#data.append(['petani', 'tani']) gagal petani -> petan
#data.append(['terabai', 'abai']) gagal terabai -> aba
data.append(['petani', 'tani']) #gagal petani -> petan
data.append(['terabai', 'abai']) #gagal terabai -> aba

# ECS
data.append(['mensyaratkan', 'syarat'])
Expand Down
7 changes: 4 additions & 3 deletions tests/Stemmer/StemmerFactoryTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,12 @@ def test_fungsional(self):
factory = StemmerFactory()
stemmer = factory.createStemmer()

sentence = 'kulitkupun'
sentence = 'malaikat-malaikat-Nya'
expected = 'malaikat'
output = stemmer.stem(sentence)

if output == sentence:
raise AssertionError('input sentence was not stemmed at all')
if output != expected:
raise AssertionError(str.format('output is {} instead of {}', output, expected))

def test_getWordsFromFile(self):
factory = StemmerFactory()
Expand Down

0 comments on commit 3a7ca0e

Please sign in to comment.