diff --git a/README.md b/README.md index a053fb2..65e8d51 100644 --- a/README.md +++ b/README.md @@ -38,12 +38,18 @@ Output: Normalized unicode text >>> result = normalizer.normalize('ദു:ഖത്തിന്റെ') >>> print(result) >> ദുഃഖത്തിന്റെ ->>> result = normalizer.normalize('പൌരൻ!!', keep_punctuations=True) +>>> result = normalizer.normalize('പൌരൻ!!', remove_punctuations=False) >>> print(result) >>> പൗരൻ!! ->>> result = normalizer.normalize('ദു:ഖത്തിന്റെ', keep_punctuations=True) +>>> result = normalizer.normalize('ദു:ഖത്തിന്റെ', remove_punctuations=False) # This is considered a mistake and not a real punctuation >>> print(result) >>> ദുഃഖത്തിന്റെ +>>> result = normalizer.normalize('ഇ–മെയിൽ', remove_punctuations=False) # Punctuation not removed. But normalized to ASCII +>>> print(result) +>>> ഇ-മെയിൽ +>>> result = normalizer.normalize('ഇ–മെയിൽ') # Punctuation removed in two steps. Normalized to ASCII and then removed. +>>> print(result) +>>> ഇമെയിൽ ``` ## Running tests diff --git a/libindic/normalizer/rules/normalizer.ml.yaml b/libindic/normalizer/rules/normalizer.ml.yaml index f0405e1..61b4f67 100755 --- a/libindic/normalizer/rules/normalizer.ml.yaml +++ b/libindic/normalizer/rules/normalizer.ml.yaml @@ -28,19 +28,6 @@ normalize_alternateforms: "ൎ": "ർ" "ു്": "്" - # text=text.replace('„', r'"') - # text=text.replace('“', r'"') - # text=text.replace('”', r'"') - # text=text.replace('–', r'-') - # text=text.replace('—', r' - ') - # text=text.replace('´', r"'") - # text=text.replace('‘', r"'") - # text=text.replace('‚', r"'") - # text=text.replace('’', r"'") - # text=text.replace("''", r'"') - # text=text.replace('´´', r'"') - # text=text.replace('…', r'...') - common_mistakes: # Regex patterns for common mistakes in Malayalam raw corpus, ASR/OCR outputs '(“|”)': '"' # Replace Unicode left/right double quotes with ASCII double quotes "(‘|’)": "'" # Replace Unicode left/right single quotes with ASCII single quotes/apostraphe diff --git a/libindic/normalizer/tests/test_normalizer.py b/libindic/normalizer/tests/test_normalizer.py index ea39461..a077b3d 100644 --- a/libindic/normalizer/tests/test_normalizer.py +++ b/libindic/normalizer/tests/test_normalizer.py @@ -71,6 +71,7 @@ def test_normalize(self): self.assertEqual(normalize('“ആൻറി', remove_punctuations=True), 'ആന്റി') # This happens by dafault self.assertEqual(normalize('അമ്മ’'), 'അമ്മ') self.assertEqual(normalize('അമ്മ’', remove_punctuations=False), "അമ്മ'") + self.assertEqual(normalize('ഇ–മെയിൽ', remove_punctuations=False), "ഇ-മെയിൽ")