Skip to content

Commit

Permalink
feat: simplify core logic and make stripping punctuations optional
Browse files Browse the repository at this point in the history
  • Loading branch information
asdofindia committed Oct 22, 2023
1 parent f79a4b9 commit 6554853
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 39 deletions.
52 changes: 13 additions & 39 deletions libindic/normalizer/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,39 +20,27 @@

import codecs
import os
import re
import string


class Normalizer:

def __init__(self):
# Location of the rule table "normalizer_ml.rules", resolved relative to
# the directory that contains this module.
self.rules_file = os.path.join(
os.path.dirname(__file__), "normalizer_ml.rules")
# Start with an empty rule mapping.
# NOTE(review): this text is a flattened commit diff; this line appears to
# be superseded by a later `self.rulesDict = self.LoadRules()` assignment
# in the same change -- confirm against the real file.
self.rulesDict = dict()

def normalize(self, text):
    """Normalize *text* one line at a time.

    The input is split on newline characters, every piece is passed to
    ``self.normalize_line``, and the results are joined back together
    with newlines.
    """
    normalized_lines = map(self.normalize_line, text.split('\n'))
    return '\n'.join(normalized_lines)

def normalize_line(self, text):
    """Normalize a single line of text, word by word.

    The line is split on single spaces. Each word is passed through
    ``self.trim`` and then scanned for the longest suffix (starting no
    earlier than the third character) that is a key in the rule table;
    if one is found, that suffix is replaced by the rule's value.
    Words without a matching suffix are kept as trimmed.

    Args:
        text: The line to normalize.

    Returns:
        The normalized words joined by single spaces.
    """
    # Load the rule table once and reuse it; previously the rules were
    # reloaded through LoadRules() for every single line.
    if not getattr(self, "rulesDict", None):
        self.rulesDict = self.LoadRules()
    rules = self.rulesDict

    result = []
    for word in text.split(" "):
        word = self.trim(word)
        stemmed = word
        # Start at index 2 so at least two leading characters are kept;
        # scanning left to right tries the longest suffix first.
        for cut in range(2, len(word)):
            suffix = word[cut:]
            if suffix in rules:
                stemmed = word[:cut] + rules[suffix]
                break
        result.append(stemmed)
    return " ".join(result)
# Build a single alternation pattern that matches any rule key literally
# (each key is escaped so regex metacharacters in it carry no meaning).
# NOTE(review): alternatives are tried left to right in dict order, so a key
# that is a prefix of another key would win over the longer one -- confirm the
# rule table has no such overlaps, or sort the keys longest-first.
pattern = "|".join(map(re.escape, self.rulesDict.keys()))
self.regex = re.compile(pattern)
# Translation table that deletes every character in string.punctuation
# (ASCII punctuation only).
self.punctuation_remover = str.maketrans('', '', string.punctuation)

def normalize(self, text, keep_punctuations=False):
    """Apply the replacement rules to *text* and optionally strip punctuation.

    Every occurrence of a rule key matched by ``self.regex`` is replaced
    by the corresponding value from ``self.rulesDict``.

    Args:
        text: The string to normalize; it may span several lines.
        keep_punctuations: When true, return the text right after rule
            replacement. When false (the default), additionally delete
            the characters covered by ``self.punctuation_remover``.

    Returns:
        The normalized string.
    """
    rules = self.rulesDict

    def _replacement(match):
        matched = match.group(0)
        # An empty rule table compiles to an empty pattern, which matches
        # the empty string at every position; fall back to the matched
        # text instead of raising KeyError on a missing key.
        return rules.get(matched, matched)

    replaced = self.regex.sub(_replacement, text)
    if keep_punctuations:
        return replaced
    return replaced.translate(self.punctuation_remover)

def LoadRules(self):
rules_dict = dict()
Expand Down Expand Up @@ -87,20 +75,6 @@ def LoadRules(self):
rules_file.close()
return rules_dict

def trim(self, word):
    """Strip surrounding whitespace and trailing punctuation from *word*.

    Args:
        word: The token to clean up.

    Returns:
        The word without leading/trailing whitespace and without any
        trailing run of punctuation characters. The first character is
        always kept, even when it is itself punctuation.
    """
    # The comma used to be listed as the two-character string r'\,', which
    # can never equal a single character, so trailing commas were never
    # removed. It is listed as a plain ',' here.
    punctuations = "~!@#$%^&*()-+_={}|:;<>,.?"
    word = word.strip()
    # Only characters after the first are candidates for removal, so a
    # non-empty word never collapses to the empty string.
    return word[:1] + word[1:].rstrip(punctuations)

def process(self, form):
response = """
<h2>Normalizer</h2></hr>
Expand Down
8 changes: 8 additions & 0 deletions libindic/normalizer/tests/test_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,11 @@ def test_normalize(self):

# TODO make this work
# self.assertEqual(normalize("അവിൽപൊതി"), "അവില്‍പൊതി")

def test_multiline_string(self):
    """Check that normalize() maps a two-line input to the expected two-line output."""
    # Named ``text`` rather than ``input`` so the builtin is not shadowed.
    text = """കുഞ്ചൻ നമ്പ്യാർ
ചെണ്ടമേളം"""
    expected = """കുഞ്ചന്‍ നമ്പ്യാര്‍
ചെണ്ടമേളം"""
    actual = self.normalizer.normalize(text)
    self.assertEqual(actual, expected)

0 comments on commit 6554853

Please sign in to comment.