feat: simplify core logic and make punctuation stripping optional

libindic · Oct 22, 2023 · 6554853
1 parent f79a4b9
commit 6554853
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 39 deletions.
diff --git a/libindic/normalizer/core.py b/libindic/normalizer/core.py
@@ -20,39 +20,27 @@
 
 import codecs
 import os
+import re
+import string
 
 
 class Normalizer:
 
     def __init__(self):
         self.rules_file = os.path.join(
             os.path.dirname(__file__), "normalizer_ml.rules")
-        self.rulesDict = dict()
-
-    def normalize(self, text):
-        out = [self.normalize_line(line) for line in text.split('\n')]
-        return '\n'.join(out)
-
-    def normalize_line(self, text):
         self.rulesDict = self.LoadRules()
-        words = text.split(" ")
-        result = []
-        for word in words:
-            word = self.trim(word)
-            word_length = len(word)
-            suffix_pos_itr = 2
-            word_stemmed = ""
-            while suffix_pos_itr < word_length:
-                suffix = word[suffix_pos_itr:word_length]
-                if suffix in self.rulesDict:
-                    word_stemmed = word[
-                        0:suffix_pos_itr] + self.rulesDict[suffix]
-                    break
-                suffix_pos_itr = suffix_pos_itr + 1
-            if (word_stemmed == ""):
-                word_stemmed = word
-            result.append(word_stemmed)
-        return "  ".join(result)
+        pattern = "|".join(map(re.escape, self.rulesDict.keys()))
+        self.regex = re.compile(pattern)
+        self.punctuation_remover = str.maketrans('', '', string.punctuation)
+
+    def normalize(self, text, keep_punctuations=False):
+        replaced = self.regex.sub(
+            lambda match: self.rulesDict[match.group(0)], text
+        )
+        if keep_punctuations:
+            return replaced
+        return replaced.translate(self.punctuation_remover)
 
     def LoadRules(self):
         rules_dict = dict()
@@ -87,20 +75,6 @@ def LoadRules(self):
         rules_file.close()
         return rules_dict
 
-    def trim(self, word):
-        punctuations = ['~', '!', '@', '#', '$', '%', '^', '&', '*',
-                        '(', ')', '-', '+', '_', '=', '{', '}', '|',
-                        ':', ';', '<', '>', r'\,', '.', '?']
-        word = word.strip()
-        index = len(word) - 1
-        while index > 0:
-            if word[index] in punctuations:
-                word = word[0:index]
-            else:
-                break
-            index = index - 1
-        return word
-
     def process(self, form):
         response = """
             <h2>Normalizer</h2></hr>

diff --git a/libindic/normalizer/tests/test_normalizer.py b/libindic/normalizer/tests/test_normalizer.py
@@ -26,3 +26,11 @@ def test_normalize(self):
 
         # TODO make this work
         # self.assertEqual(normalize("അവിൽപൊതി"), "അവില്‍പൊതി")
+
+    def test_multiline_string(self):
+        input = """കുഞ്ചൻ നമ്പ്യാർ
+            ചെണ്ടമേളം"""
+        expected = """കുഞ്ചന്‍ നമ്പ്യാര്‍
+            ചെണ്ടമേളം"""
+        actual = self.normalizer.normalize(input)
+        self.assertEqual(actual, expected)