Skip to content

Commit

Permalink
Add rules to insert chillus ർ, ൾ and remove zwnj
Browse files Browse the repository at this point in the history
  • Loading branch information
kavyamanohar committed Aug 18, 2024
1 parent 17b3031 commit b8e0daf
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 5 deletions.
2 changes: 2 additions & 0 deletions libindic/normalizer/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ def load_rules(self):
if 'regex_patterns' in rules:
rules['compiled_regex'] = {}
for pattern, replacement in rules['regex_patterns'].items():
# Replace {PUNCTUATION} placeholder with actual punctuation characters
pattern = pattern.replace('{PUNCTUATION}', re.escape(string.punctuation))
rules['compiled_regex'][re.compile(pattern, re.UNICODE)] = replacement

return rules
Expand Down
13 changes: 9 additions & 4 deletions libindic/normalizer/rules/normalizer.ml.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,12 @@ normalize_alternateforms:
"ു്": ""

regex_patterns:
'([^\s]+)‌\s': '\1 ' # Remove ZWNJ at the end of words followed by space
'([^\s]+)‌$': '\1' # Remove ZWNJ at the end of the string
'(ൺ|ൻ|ർ|ൽ|ൾ|ൿ|ൔ|ൕ|ൖ)‌': '\1' # Remove ZWNJ after any of the chillu characters
'\u200D': '' # Remove all ZWJ characters
'([^\s]+)‌([{PUNCTUATION}\s])': '\1\2' # Remove ZWNJ at the end of words followed by ASCII punctuation or space.
# The {PUNCTUATION} placeholder is expanded to the ASCII punctuation characters in core.py, where the regex is compiled.
'([^\s]+)‌$': '\1' # Remove ZWNJ at the end of the string
'(ൺ|ൻ|ർ|ൽ|ൾ|ൿ|ൔ|ൕ|ൖ)‌': '\1' # Remove ZWNJ after any of the chillu characters
'\u200D': '' # Remove all ZWJ characters
# Chillu insertion: a consonant + virama that is NOT at the end of the string,
# NOT followed by whitespace or ASCII punctuation, and NOT followed by the
# listed exception consonant is rewritten to the corresponding atomic chillu.
'ര്(?!$)(?!യ)(?![\s{PUNCTUATION}])': 'ർ' # Replace ര് with ർ when not at word end, string end and not followed by യ
'റ്(?!$)(?!റ)(?![\s{PUNCTUATION}])': 'ർ' # Replace റ് with ർ when not at word end, string end and not followed by റ
'ള്(?!$)(?!ള)(?![\s{PUNCTUATION}])': 'ൾ' # Replace ള് with ൾ when not at word end, string end and not followed by ള

14 changes: 13 additions & 1 deletion libindic/normalizer/tests/test_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,12 +46,24 @@ def test_normalize(self):
self.assertEqual(normalize('ഭാൎയ്യ'), 'ഭാര്യ')
self.assertEqual(normalize('എൻ്റെ കമ്പ്യൂട്ടറിനു് എന്റെ ഭാഷ.'), 'എന്റെ കമ്പ്യൂട്ടറിന് എന്റെ ഭാഷ')

# Regex pattern for ZWJ and ZWNJ Removal
# Regex pattern for ZWJ and ZWNJ Removal, Chillu insertion
self.assertEqual(normalize('അവൻ‌ വന്നു'), 'അവൻ വന്നു')
self.assertEqual(normalize('അവൻ‌. വന്നു'), 'അവൻ വന്നു')
self.assertEqual(normalize('അവൻ‌'), 'അവൻ')
self.assertEqual(normalize('കൺ‌മണി'), 'കൺമണി')
self.assertEqual(normalize('ഹാർഡ്‌വെയർ‌'), 'ഹാർഡ്‌വെയർ')
self.assertEqual(normalize('കാല്‍‍പനികം'), 'കാൽപനികം')
self.assertEqual(normalize('അവര്ക്ക്'), 'അവർക്ക്')
self.assertEqual(normalize('അവര്'), 'അവര്')
self.assertEqual(normalize('ആര്യ '), 'ആര്യ ')
self.assertEqual(normalize('സര്വകലാശാല '), 'സർവകലാശാല ')
self.assertEqual(normalize('നമ്പറുള്പ്പെടെ'), 'നമ്പറുൾപ്പെടെ')
self.assertEqual(normalize('വള്ളിച്ചെടി'), 'വള്ളിച്ചെടി')
self.assertEqual(normalize('കാറ്ഡ്'), 'കാർഡ്')
self.assertEqual(normalize('കാറ്'), 'കാറ്')
self.assertEqual(normalize('കാറ് '), 'കാറ് ')



def test_multiline_string(self):
expected = """കുഞ്ചൻ നമ്പ്യാർ
Expand Down

0 comments on commit b8e0daf

Please sign in to comment.