From 06af6b596ab531ea02a30ba48fb3f6f9fd6092e1 Mon Sep 17 00:00:00 2001 From: Kavya Manohar Date: Thu, 22 Aug 2024 21:47:33 +0530 Subject: [PATCH] Clean up rules and test cases for common errors --- libindic/normalizer/rules/normalizer.ml.yaml | 6 +++--- libindic/normalizer/tests/test_normalizer.py | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/libindic/normalizer/rules/normalizer.ml.yaml b/libindic/normalizer/rules/normalizer.ml.yaml index 84bb1ce..c85c8d1 100755 --- a/libindic/normalizer/rules/normalizer.ml.yaml +++ b/libindic/normalizer/rules/normalizer.ml.yaml @@ -33,9 +33,9 @@ common_mistakes: # For common mistakes '([^\s]+)‌$': '\1' # Remove ZWNJ at the end of the string '(ൺ|ൻ|ർ|ൽ|ൾ|ൿ|ൔ|ൕ|ൖ)‌': '\1' # Remove ZWNJ after any of the chillu characters '\u200D': '' # Remove all ZWJ characters - 'ര്(?!$)(?!യ)(?![\s{PUNCTUATION}])': 'ർ' # Replace ര് with ർ when not at word end, string end and not followed by യ - 'റ്(?!$)(?!റ)(?![\s{PUNCTUATION}])': 'ർ' # Replace ര് with ർ when not at word end, string end and not followed by യ - 'ള്(?!$)(?!ള)(?![\s{PUNCTUATION}])': 'ൾ' # Replace ള് with ൾ when not at word end, string end and not followed by ള + 'ര്(?![\s{PUNCTUATION}]|യ|$)': 'ർ' # Replace ര് with ർ when not at word end, string end and not followed by യ + 'റ്(?![\s{PUNCTUATION}\u200c]|യ|വ|ല|ര|റ|$)': 'ർ' # Replace റ് with ർ when not at word end, string end and not followed by റ, ര, വ, ല, യ + 'ള്(?![\s{PUNCTUATION}]|ള|$)': 'ൾ' # Replace ള് with ൾ when not at word end, string end and not followed by ള 'ദു:ഖ': 'ദുഃഖ' # Common Mistake 'നമ:': 'നമഃ' # Remove all ZWJ characters 'ററ': 'റ്റ' # To correct പൂമ്പാററ. 
Fails for കണ്ടംപററി diff --git a/libindic/normalizer/tests/test_normalizer.py b/libindic/normalizer/tests/test_normalizer.py index b11eff9..5cafdc5 100644 --- a/libindic/normalizer/tests/test_normalizer.py +++ b/libindic/normalizer/tests/test_normalizer.py @@ -1,6 +1,5 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- - from testtools import TestCase from .. import Normalizer @@ -46,6 +45,9 @@ def test_normalize(self): self.assertEqual(normalize('അവൻ‌'), 'അവൻ') self.assertEqual(normalize('കൺ‌മണി'), 'കൺമണി') self.assertEqual(normalize('ഹാർഡ്‌വെയർ‌'), 'ഹാർഡ്‌വെയർ') + self.assertEqual(normalize('സോഫ്റ്റ്‍വെയർ'), 'സോഫ്റ്റ്വെയർ') # soft_ware written with a ZWJ before "ware"; the ZWJ gets removed. + self.assertEqual(normalize('ആറ്റ്‌ലി'), 'ആറ്റ്‌ലി') + self.assertEqual(normalize('ഇൻസ്റ്റിറ്റ്യൂട്ട്'), 'ഇൻസ്റ്റിറ്റ്യൂട്ട്') self.assertEqual(normalize('കാല്‍‍പനികം'), 'കാൽപനികം') self.assertEqual(normalize('അവര്ക്ക്'), 'അവർക്ക്') self.assertEqual(normalize('അവര്'), 'അവര്') @@ -57,6 +59,7 @@ def test_normalize(self): self.assertEqual(normalize('കാറ്'), 'കാറ്') self.assertEqual(normalize('കാറ് '), 'കാറ് ') self.assertEqual(normalize('പൂമ്പാററ'), 'പൂമ്പാറ്റ') + self.assertEqual(normalize('കാറ്റ്'), 'കാറ്റ്') self.assertEqual(normalize('ദു:ഖത്തിന്റെ'), 'ദുഃഖത്തിന്റെ') self.assertEqual(normalize('ദു:ഖത്തിന്റെ', keep_punctuations=True), 'ദുഃഖത്തിന്റെ')