Skip to content

Commit

Permalink
Update README
Browse files Browse the repository at this point in the history
Add examples of removing punctuation and converting to ASCII punctuation in README
  • Loading branch information
kavyamanohar committed Aug 26, 2024
1 parent 02366fc commit ba6c5c0
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 15 deletions.
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,18 @@ Output: Normalized unicode text
>>> result = normalizer.normalize('ദു:ഖത്തിന്റെ')
>>> print(result)
>> ദുഃഖത്തിന്റെ
>>> result = normalizer.normalize('പൌരൻ!!', keep_punctuations=True)
>>> result = normalizer.normalize('പൌരൻ!!', remove_punctuations=False)
>>> print(result)
>>> പൗരൻ!!
>>> result = normalizer.normalize('ദു:ഖത്തിന്റെ', keep_punctuations=True)
>>> result = normalizer.normalize('ദു:ഖത്തിന്റെ', remove_punctuations=False) # This is considered a mistake and not a real punctuation
>>> print(result)
>>> ദുഃഖത്തിന്റെ
>>> result = normalizer.normalize('ഇ–മെയിൽ', remove_punctuations=False) # Punctuation is not removed, but it is normalized to ASCII
>>> print(result)
>>> ഇ-മെയിൽ
>>> result = normalizer.normalize('ഇ–മെയിൽ') # Punctuation removed in two steps. Normalized to ASCII and then removed.
>>> print(result)
>>> ഇമെയിൽ
```

## Running tests
Expand Down
13 changes: 0 additions & 13 deletions libindic/normalizer/rules/normalizer.ml.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,6 @@ normalize_alternateforms:
"": ""
"ു്": ""

# text=text.replace('„', r'"')
# text=text.replace('“', r'"')
# text=text.replace('”', r'"')
# text=text.replace('–', r'-')
# text=text.replace('—', r' - ')
# text=text.replace('´', r"'")
# text=text.replace('‘', r"'")
# text=text.replace('‚', r"'")
# text=text.replace('’', r"'")
# text=text.replace("''", r'"')
# text=text.replace('´´', r'"')
# text=text.replace('…', r'...')

common_mistakes: # Regex patterns for common mistakes in Malayalam raw corpus, ASR/OCR outputs
'(“|”)': '"' # Replace Unicode left/right double quotes with ASCII double quotes
"(‘|’)": "'" # Replace Unicode left/right single quotes with ASCII single quotes/apostrophe
Expand Down
1 change: 1 addition & 0 deletions libindic/normalizer/tests/test_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ def test_normalize(self):
self.assertEqual(normalize('“ആൻറി', remove_punctuations=True), 'ആന്റി') # This happens by default
self.assertEqual(normalize('അമ്മ’'), 'അമ്മ')
self.assertEqual(normalize('അമ്മ’', remove_punctuations=False), "അമ്മ'")
self.assertEqual(normalize('ഇ–മെയിൽ', remove_punctuations=False), "ഇ-മെയിൽ")



Expand Down

0 comments on commit ba6c5c0

Please sign in to comment.