From b8e0daf9db55b8d3e1e292986f9de6e936fbd96f Mon Sep 17 00:00:00 2001
From: Kavya Manohar <sakhi.kavya@gmail.com>
Date: Sun, 18 Aug 2024 22:47:43 +0530
Subject: [PATCH] =?UTF-8?q?Add=20rules=20to=20insert=20chillus=20=E0=B5=BC?=
 =?UTF-8?q?,=20=E0=B5=BE=20and=20remove=20zwnj?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 libindic/normalizer/core.py                  |  2 ++
 libindic/normalizer/rules/normalizer.ml.yaml | 13 +++++++++----
 libindic/normalizer/tests/test_normalizer.py | 14 +++++++++++++-
 3 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/libindic/normalizer/core.py b/libindic/normalizer/core.py
index 478b4e5..cab970f 100755
--- a/libindic/normalizer/core.py
+++ b/libindic/normalizer/core.py
@@ -40,6 +40,8 @@ def load_rules(self):
         if 'regex_patterns' in rules:
             rules['compiled_regex'] = {}
             for pattern, replacement in rules['regex_patterns'].items():
+                # Replace {PUNCTUATION} placeholder with actual punctuation characters
+                pattern = pattern.replace('{PUNCTUATION}', re.escape(string.punctuation))
                 rules['compiled_regex'][re.compile(pattern, re.UNICODE)] = replacement
         
         return rules
diff --git a/libindic/normalizer/rules/normalizer.ml.yaml b/libindic/normalizer/rules/normalizer.ml.yaml
index c431070..aa25dee 100755
--- a/libindic/normalizer/rules/normalizer.ml.yaml
+++ b/libindic/normalizer/rules/normalizer.ml.yaml
@@ -33,7 +33,12 @@ normalize_alternateforms:
   "ു്": "്"
 
 regex_patterns:
-  '([^\s]+)‌\s': '\1 '  # Remove ZWNJ at the end of words followed by space
-  '([^\s]+)‌$': '\1'    # Remove ZWNJ at the end of the string
-  '(ൺ|ൻ|ർ|ൽ|ൾ|ൿ|ൔ|ൕ|ൖ)‌': '\1'        # Remove ZWNJ after any of the chillu characters
-  '\u200D': ''                    # Remove all ZWJ characters
\ No newline at end of file
+  '([^\s]+)‌([{PUNCTUATION}\s])': '\1\2'  # Remove ZWNJ at the end of words followed by ASCII punctuation or space.
+  # The {PUNCTUATION} placeholder is expanded to the escaped ASCII punctuation characters in core.py, where the regexes are compiled.
+  '([^\s]+)‌$': '\1'  # Remove ZWNJ at the end of the string
+  '(ൺ|ൻ|ർ|ൽ|ൾ|ൿ|ൔ|ൕ|ൖ)‌': '\1'  # Remove ZWNJ after any of the chillu characters
+  '\u200D': ''  # Remove all ZWJ characters
+  'ര്(?!$)(?!യ)(?![\s{PUNCTUATION}])': 'ർ'  # Replace ര് with ർ when not at word end, string end and not followed by യ
+  'റ്(?!$)(?!റ)(?![\s{PUNCTUATION}])': 'ർ'  # Replace റ് with ർ when not at word end, string end and not followed by റ
+  'ള്(?!$)(?!ള)(?![\s{PUNCTUATION}])': 'ൾ'  # Replace ള് with ൾ when not at word end, string end and not followed by ള
+
diff --git a/libindic/normalizer/tests/test_normalizer.py b/libindic/normalizer/tests/test_normalizer.py
index be79f2a..791aead 100644
--- a/libindic/normalizer/tests/test_normalizer.py
+++ b/libindic/normalizer/tests/test_normalizer.py
@@ -46,12 +46,24 @@ def test_normalize(self):
         self.assertEqual(normalize('ഭാൎയ്യ'), 'ഭാര്യ')
         self.assertEqual(normalize('എൻ്റെ കമ്പ്യൂട്ടറിനു് എന്റെ ഭാഷ.'), 'എന്റെ കമ്പ്യൂട്ടറിന് എന്റെ ഭാഷ')
         
-        # Regex pattern for ZWJ and ZWNJ Removal
+        # Regex pattern for ZWJ and ZWNJ Removal, Chillu insertion
         self.assertEqual(normalize('അവൻ‌ വന്നു'), 'അവൻ വന്നു')
+        self.assertEqual(normalize('അവൻ‌. വന്നു'), 'അവൻ വന്നു')
         self.assertEqual(normalize('അവൻ‌'), 'അവൻ')
         self.assertEqual(normalize('കൺ‌മണി'), 'കൺമണി')
         self.assertEqual(normalize('ഹാർഡ്‌വെയർ‌'), 'ഹാർഡ്‌വെയർ')
         self.assertEqual(normalize('കാല്‍‍പനികം'), 'കാൽപനികം')
+        self.assertEqual(normalize('അവര്ക്ക്'), 'അവർക്ക്')
+        self.assertEqual(normalize('അവര്'), 'അവര്')
+        self.assertEqual(normalize('ആര്യ '), 'ആര്യ ')
+        self.assertEqual(normalize('സര്വകലാശാല '), 'സർവകലാശാല ')
+        self.assertEqual(normalize('നമ്പറുള്പ്പെടെ'), 'നമ്പറുൾപ്പെടെ')
+        self.assertEqual(normalize('വള്ളിച്ചെടി'), 'വള്ളിച്ചെടി')
+        self.assertEqual(normalize('കാറ്ഡ്'), 'കാർഡ്')
+        self.assertEqual(normalize('കാറ്'), 'കാറ്')
+        self.assertEqual(normalize('കാറ് '), 'കാറ് ')
+
+
 
     def test_multiline_string(self):
         expected = """കുഞ്ചൻ നമ്പ്യാർ