From b9bd3993aea9edf33b1dba3b07cf4de700e82e8b Mon Sep 17 00:00:00 2001 From: Kavya Manohar Date: Sat, 17 Aug 2024 22:38:02 +0530 Subject: [PATCH] Add regex patterns for unwanted ZWNJ and ZWJ in words --- libindic/normalizer/core.py | 24 +++++++++++++++++--- libindic/normalizer/rules/normalizer.ml.yaml | 6 +++++ libindic/normalizer/tests/test_normalizer.py | 9 ++++++-- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/libindic/normalizer/core.py b/libindic/normalizer/core.py index 14778f1..478b4e5 100755 --- a/libindic/normalizer/core.py +++ b/libindic/normalizer/core.py @@ -21,6 +21,7 @@ import yaml import os import string +import re class Normalizer: def __init__(self, language_code): @@ -32,12 +33,24 @@ def load_rules(self): rules_path = os.path.join(os.path.dirname(__file__), 'rules', f'normalizer.{self.language_code}.yaml') if not os.path.exists(rules_path): raise FileNotFoundError(f"Rules file for language '{self.language_code}' not found.") - with open(rules_path, 'r', encoding='utf-8') as file: rules = yaml.safe_load(file) + + # Compile regex patterns + if 'regex_patterns' in rules: + rules['compiled_regex'] = {} + for pattern, replacement in rules['regex_patterns'].items(): + rules['compiled_regex'][re.compile(pattern, re.UNICODE)] = replacement + return rules - def normalize(self, input_text, keep_punctuations=False,normalize_chillus=True, normalize_vowelsigns=True, normalize_typos=True, normalize_alternateforms=True): + def apply_regex_patterns(self, text): + if 'compiled_regex' in self.rules: + for pattern, replacement in self.rules['compiled_regex'].items(): + text = pattern.sub(replacement, text) + return text + + def normalize(self, input_text, keep_punctuations=False, normalize_chillus=True, normalize_vowelsigns=True, normalize_typos=True, normalize_alternateforms=True, apply_regex=True): if normalize_chillus and 'normalize_chillus' in self.rules: for key, value in self.rules['normalize_chillus'].items(): input_text = input_text.replace(key, value) @@ -45,13 +58,18 @@ def normalize(self, input_text, keep_punctuations=False,normalize_chillus=True, if normalize_vowelsigns and 'normalize_vowelsigns' in self.rules: for key, value in self.rules['normalize_vowelsigns'].items(): input_text = input_text.replace(key, value) - + if normalize_typos and 'normalize_typos' in self.rules: for key, value in self.rules['normalize_typos'].items(): input_text = input_text.replace(key, value) + if normalize_alternateforms and 'normalize_alternateforms' in self.rules: for key, value in self.rules['normalize_alternateforms'].items(): input_text = input_text.replace(key, value) + + if apply_regex and 'regex_patterns' in self.rules: + input_text = self.apply_regex_patterns(input_text) + if keep_punctuations: return input_text return input_text.translate(self.punctuation_remover) diff --git a/libindic/normalizer/rules/normalizer.ml.yaml b/libindic/normalizer/rules/normalizer.ml.yaml index 6706594..c431070 100755 --- a/libindic/normalizer/rules/normalizer.ml.yaml +++ b/libindic/normalizer/rules/normalizer.ml.yaml @@ -31,3 +31,9 @@ normalize_alternateforms: "അധ്യാപ": "അദ്ധ്യാപ" "ൎ": "ർ" "ു്": "്" + +regex_patterns: + '([^\s]+)‌\s': '\1 ' # Remove ZWNJ at the end of words followed by space + '([^\s]+)‌$': '\1' # Remove ZWNJ at the end of the string + '(ൺ|ൻ|ർ|ൽ|ൾ|ൿ|ൔ|ൕ|ൖ)‌': '\1' # Remove ZWNJ after any of the chillu characters + '\u200D': '' # Remove all ZWJ characters \ No newline at end of file diff --git a/libindic/normalizer/tests/test_normalizer.py b/libindic/normalizer/tests/test_normalizer.py index 39141d7..be79f2a 100644 --- a/libindic/normalizer/tests/test_normalizer.py +++ b/libindic/normalizer/tests/test_normalizer.py @@ -6,7 +6,6 @@ from .. import Normalizer normalize = Normalizer('ml').normalize - class MalayalamNormalizerTest(TestCase): def setUp(self): @@ -46,7 +45,13 @@ def test_normalize(self): self.assertEqual(normalize('കാൎത്തുമ്പി'), 'കാർത്തുമ്പി') self.assertEqual(normalize('ഭാൎയ്യ'), 'ഭാര്യ') self.assertEqual(normalize('എൻ്റെ കമ്പ്യൂട്ടറിനു് എന്റെ ഭാഷ.'), 'എന്റെ കമ്പ്യൂട്ടറിന് എന്റെ ഭാഷ') - + + # Regex pattern for ZWJ and ZWNJ Removal + self.assertEqual(normalize('അവൻ‌ വന്നു'), 'അവൻ വന്നു') + self.assertEqual(normalize('അവൻ‌'), 'അവൻ') + self.assertEqual(normalize('കൺ‌മണി'), 'കൺമണി') + self.assertEqual(normalize('ഹാർഡ്‌വെയർ‌'), 'ഹാർഡ്‌വെയർ') + self.assertEqual(normalize('കാല്‍‍പനികം'), 'കാൽപനികം') def test_multiline_string(self): expected = """കുഞ്ചൻ നമ്പ്യാർ