From b8e0daf9db55b8d3e1e292986f9de6e936fbd96f Mon Sep 17 00:00:00 2001 From: Kavya Manohar Date: Sun, 18 Aug 2024 22:47:43 +0530 Subject: [PATCH] =?UTF-8?q?Add=20rules=20to=20insert=20chillus=20=E0=B5=BC?= =?UTF-8?q?,=20=E0=B5=BE=20and=20remove=20zwnj?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- libindic/normalizer/core.py | 2 ++ libindic/normalizer/rules/normalizer.ml.yaml | 13 +++++++++---- libindic/normalizer/tests/test_normalizer.py | 14 +++++++++++++- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/libindic/normalizer/core.py b/libindic/normalizer/core.py index 478b4e5..cab970f 100755 --- a/libindic/normalizer/core.py +++ b/libindic/normalizer/core.py @@ -40,6 +40,8 @@ def load_rules(self): if 'regex_patterns' in rules: rules['compiled_regex'] = {} for pattern, replacement in rules['regex_patterns'].items(): + # Replace {PUNCTUATION} placeholder with actual punctuation characters + pattern = pattern.replace('{PUNCTUATION}', re.escape(string.punctuation)) rules['compiled_regex'][re.compile(pattern, re.UNICODE)] = replacement return rules diff --git a/libindic/normalizer/rules/normalizer.ml.yaml b/libindic/normalizer/rules/normalizer.ml.yaml index c431070..aa25dee 100755 --- a/libindic/normalizer/rules/normalizer.ml.yaml +++ b/libindic/normalizer/rules/normalizer.ml.yaml @@ -33,7 +33,12 @@ normalize_alternateforms: "ു്": "്" regex_patterns: - '([^\s]+)‌\s': '\1 ' # Remove ZWNJ at the end of words followed by space - '([^\s]+)‌$': '\1' # Remove ZWNJ at the end of the string - '(ൺ|ൻ|ർ|ൽ|ൾ|ൿ|ൔ|ൕ|ൖ)‌': '\1' # Remove ZWNJ after any of the chillu characters - '\u200D': '' # Remove all ZWJ characters \ No newline at end of file + '([^\s]+)‌([{PUNCTUATION}\s])': '\1\2' # Remove ZWNJ at the end of words followed by ASCII punctuation or space. + # The PUNCTUATIONS are defined in core.py where regex is compiled. 
+ '([^\s]+)‌$': '\1' # Remove ZWNJ at the end of the string + '(ൺ|ൻ|ർ|ൽ|ൾ|ൿ|ൔ|ൕ|ൖ)‌': '\1' # Remove ZWNJ after any of the chillu characters + '\u200D': '' # Remove all ZWJ characters + 'ര്(?!$)(?!യ)(?![\s{PUNCTUATION}])': 'ർ' # Replace ര് with ർ when not at word end, string end and not followed by യ + 'റ്(?!$)(?!റ)(?![\s{PUNCTUATION}])': 'ർ' # Replace റ് with ർ when not at word end, string end and not followed by റ + 'ള്(?!$)(?!ള)(?![\s{PUNCTUATION}])': 'ൾ' # Replace ള് with ൾ when not at word end, string end and not followed by ള + diff --git a/libindic/normalizer/tests/test_normalizer.py b/libindic/normalizer/tests/test_normalizer.py index be79f2a..791aead 100644 --- a/libindic/normalizer/tests/test_normalizer.py +++ b/libindic/normalizer/tests/test_normalizer.py @@ -46,12 +46,24 @@ def test_normalize(self): self.assertEqual(normalize('ഭാൎയ്യ'), 'ഭാര്യ') self.assertEqual(normalize('എൻ്റെ കമ്പ്യൂട്ടറിനു് എന്റെ ഭാഷ.'), 'എന്റെ കമ്പ്യൂട്ടറിന് എന്റെ ഭാഷ') - # Regex pattern for ZWJ and ZWNJ Removal + # Regex pattern for ZWJ and ZWNJ Removal, Chillu insertion self.assertEqual(normalize('അവൻ‌ വന്നു'), 'അവൻ വന്നു') + self.assertEqual(normalize('അവൻ‌. വന്നു'), 'അവൻ വന്നു') self.assertEqual(normalize('അവൻ‌'), 'അവൻ') self.assertEqual(normalize('കൺ‌മണി'), 'കൺമണി') self.assertEqual(normalize('ഹാർഡ്‌വെയർ‌'), 'ഹാർഡ്‌വെയർ') self.assertEqual(normalize('കാല്‍‍പനികം'), 'കാൽപനികം') + self.assertEqual(normalize('അവര്ക്ക്'), 'അവർക്ക്') + self.assertEqual(normalize('അവര്'), 'അവര്') + self.assertEqual(normalize('ആര്യ '), 'ആര്യ ') + self.assertEqual(normalize('സര്വകലാശാല '), 'സർവകലാശാല ') + self.assertEqual(normalize('നമ്പറുള്പ്പെടെ'), 'നമ്പറുൾപ്പെടെ') + self.assertEqual(normalize('വള്ളിച്ചെടി'), 'വള്ളിച്ചെടി') + self.assertEqual(normalize('കാറ്ഡ്'), 'കാർഡ്') + self.assertEqual(normalize('കാറ്'), 'കാറ്') + self.assertEqual(normalize('കാറ് '), 'കാറ് ') + + def test_multiline_string(self): expected = """കുഞ്ചൻ നമ്പ്യാർ