From b9bd3993aea9edf33b1dba3b07cf4de700e82e8b Mon Sep 17 00:00:00 2001
From: Kavya Manohar <sakhi.kavya@gmail.com>
Date: Sat, 17 Aug 2024 22:38:02 +0530
Subject: [PATCH] Add regex patterns for unwanted ZWNJ and ZWJ in words

---
 libindic/normalizer/core.py                  | 24 +++++++++++++++++---
 libindic/normalizer/rules/normalizer.ml.yaml |  6 +++++
 libindic/normalizer/tests/test_normalizer.py |  9 ++++++--
 3 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/libindic/normalizer/core.py b/libindic/normalizer/core.py
index 14778f1..478b4e5 100755
--- a/libindic/normalizer/core.py
+++ b/libindic/normalizer/core.py
@@ -21,6 +21,7 @@
 import yaml
 import os
 import string
+import re
 
 class Normalizer:
     def __init__(self, language_code):
@@ -32,12 +33,24 @@ def load_rules(self):
         rules_path = os.path.join(os.path.dirname(__file__), 'rules', f'normalizer.{self.language_code}.yaml')
         if not os.path.exists(rules_path):
             raise FileNotFoundError(f"Rules file for language '{self.language_code}' not found.")
-        
         with open(rules_path, 'r', encoding='utf-8') as file:
             rules = yaml.safe_load(file)
+        
+        # Compile regex patterns
+        if 'regex_patterns' in rules:
+            rules['compiled_regex'] = {}
+            for pattern, replacement in rules['regex_patterns'].items():
+                rules['compiled_regex'][re.compile(pattern, re.UNICODE)] = replacement
+        
         return rules
     
-    def normalize(self, input_text, keep_punctuations=False,normalize_chillus=True, normalize_vowelsigns=True, normalize_typos=True, normalize_alternateforms=True):
+    def apply_regex_patterns(self, text):
+        """Return text after applying every compiled regex substitution, in rule order."""
+        for regex, substitute in (self.rules['compiled_regex'].items() if 'compiled_regex' in self.rules else ()):
+            text = regex.sub(substitute, text)
+        return text
+    
+    def normalize(self, input_text, keep_punctuations=False, normalize_chillus=True, normalize_vowelsigns=True, normalize_typos=True, normalize_alternateforms=True, apply_regex=True):
         if normalize_chillus and 'normalize_chillus' in self.rules:
             for key, value in self.rules['normalize_chillus'].items():
                 input_text = input_text.replace(key, value)
@@ -45,13 +58,18 @@ def normalize(self, input_text, keep_punctuations=False,normalize_chillus=True,
         if normalize_vowelsigns and 'normalize_vowelsigns' in self.rules:
             for key, value in self.rules['normalize_vowelsigns'].items():
                 input_text = input_text.replace(key, value)
-
+        
         if normalize_typos and 'normalize_typos' in self.rules:
             for key, value in self.rules['normalize_typos'].items():
                 input_text = input_text.replace(key, value)
+        
         if normalize_alternateforms and 'normalize_alternateforms' in self.rules:
             for key, value in self.rules['normalize_alternateforms'].items():
                 input_text = input_text.replace(key, value)
+        
+        if apply_regex and 'regex_patterns' in self.rules:
+            input_text = self.apply_regex_patterns(input_text)
+        
         if keep_punctuations:
             return input_text
         return input_text.translate(self.punctuation_remover)
diff --git a/libindic/normalizer/rules/normalizer.ml.yaml b/libindic/normalizer/rules/normalizer.ml.yaml
index 6706594..c431070 100755
--- a/libindic/normalizer/rules/normalizer.ml.yaml
+++ b/libindic/normalizer/rules/normalizer.ml.yaml
@@ -31,3 +31,9 @@ normalize_alternateforms:
   "അധ്യാപ": "അദ്ധ്യാപ"
   "ൎ": "ർ"
   "ു്": "്"
+
+regex_patterns:
+  '([^\s]+)‌(\s)': '\1\2'  # Remove ZWNJ at the end of a word; the following whitespace (space, tab or newline) is kept as-is
+  '([^\s]+)‌$': '\1'    # Remove ZWNJ at the end of the string
+  '(ൺ|ൻ|ർ|ൽ|ൾ|ൿ|ൔ|ൕ|ൖ)‌': '\1'        # Remove ZWNJ after any of the chillu characters
+  '\u200D': ''                    # Remove all ZWJ characters
\ No newline at end of file
diff --git a/libindic/normalizer/tests/test_normalizer.py b/libindic/normalizer/tests/test_normalizer.py
index 39141d7..be79f2a 100644
--- a/libindic/normalizer/tests/test_normalizer.py
+++ b/libindic/normalizer/tests/test_normalizer.py
@@ -6,7 +6,6 @@
 from .. import Normalizer
 normalize = Normalizer('ml').normalize
 
-
 class MalayalamNormalizerTest(TestCase):
 
     def setUp(self):
@@ -46,7 +45,13 @@ def test_normalize(self):
         self.assertEqual(normalize('കാൎത്തുമ്പി'), 'കാർത്തുമ്പി')
         self.assertEqual(normalize('ഭാൎയ്യ'), 'ഭാര്യ')
         self.assertEqual(normalize('എൻ്റെ കമ്പ്യൂട്ടറിനു് എന്റെ ഭാഷ.'), 'എന്റെ കമ്പ്യൂട്ടറിന് എന്റെ ഭാഷ')
-
+        
+        # Regex pattern for ZWJ and ZWNJ Removal
+        self.assertEqual(normalize('അവൻ‌ വന്നു'), 'അവൻ വന്നു')
+        self.assertEqual(normalize('അവൻ‌'), 'അവൻ')
+        self.assertEqual(normalize('കൺ‌മണി'), 'കൺമണി')
+        self.assertEqual(normalize('ഹാർഡ്‌വെയർ‌'), 'ഹാർഡ്‌വെയർ')
+        self.assertEqual(normalize('കാല്‍‍പനികം'), 'കാൽപനികം')
 
     def test_multiline_string(self):
         expected = """കുഞ്ചൻ നമ്പ്യാർ