Skip to content

Commit

Permalink
Merge pull request #23 from kavyamanohar/master
Browse files Browse the repository at this point in the history
Expand rules and unit test coverage
  • Loading branch information
kavyamanohar committed May 21, 2024
2 parents 5206fd3 + d3d0d54 commit b9eabc4
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 8 deletions.
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,14 @@ LibIndic's normalizer module may be used to normalize the text to a canonical
format to handle inconsistencies in text. Right now, it supports
the Malayalam language only.

## Features

- Removes punctuation by default
- Converts chillus written with zero-width joiners into atomic chillu characters
- Normalization of vowel signs
- Corrects some common typos in Malayalam (needs thorough review)
- Alternate spelling normalizations

## Installation

### Directly from git
Expand Down Expand Up @@ -55,3 +63,4 @@ OK
flake8 --max-complexity 10 libindic
```

22 changes: 18 additions & 4 deletions libindic/normalizer/rules/normalizer_ml.rules
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
# This is a comment
$remove_punctuation=true
$filter_lang=ml_IN
ൻറ=ന്റ
ന്‍പ=മ്പ
ററ=റ്റ
റ്‍=ർ
# Chillu normalization to atomic chillus
ണ്‍=ൺ
ന്‍=ൻ
ര്‍=ർ
ല്‍=ൽ
ള്‍=ൾ
ക്‍=ൿ

# Vowel sign normalizations
െെ=ൈ
ൊ=ൊ
ാെ=ൊ
Expand All @@ -22,3 +21,18 @@ $filter_lang=ml_IN
ഇൗ=ഈ
ഉൗ=ഊ
ഒൗ=ഔ

# Common Typo Corrections
ൻറ=ന്റ
ന്‍പ=മ്പ
ററ=റ്റ
റ്‍=ർ
ദു:ഖ=ദുഃഖ
നമ:=നമഃ

# Alternate written forms
ൎയ്യ=ര്യ #ഭാൎയ്യ, സൂൎയ്യൻ
അധ്യാപ=അദ്ധ്യാപ
ൎ=ർ
ൽപ=ല്പ

27 changes: 23 additions & 4 deletions libindic/normalizer/tests/test_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,35 @@ def setUp(self):
def test_normalize(self):
self.assertEqual(self.normalizer.normalize(u'പൂമ്പാററ'), u'പൂമ്പാറ്റ')

# ൺൻർൽൾൿ are atomic chillus and should get
# converted to ണ്‍ന്‍ര്‍ല്‍ള്‍ക്‍ respectively
# The chillus (ണ്‍ന്‍ര്‍ല്‍ള്‍ക്‍) formed with zero-width joiners are to be
# replaced with atomic chillus (ൺൻർൽൾൿ).

self.assertEqual(self.normalizer.normalize(u'അവിൽ'), u'അവിൽ')
self.assertEqual(self.normalizer.normalize(u'രമണൻ'), u'രമണൻ')
self.assertEqual(self.normalizer.normalize(u'അവൾ'), u'അവൾ')
self.assertEqual(self.normalizer.normalize(u'ശ്രാവൺ'), u'ശ്രാവൺ')

# TODO make this work
# self.assertEqual(normalize("അവിൽപൊതി"), "അവില്‍പൊതി")
# Multiple normalizations in a single word
self.assertEqual(normalize('കര്‍ണൻ'), 'കർണൻ')

# ൊ=ൊ, ാെ=ൊ,ോ=ോ,ാേ=ോ: Vowel sign normalizations
self.assertEqual(normalize('അവില്‍പാെതി'), 'അവിൽപൊതി')
self.assertEqual(normalize('കാേടതി'), 'കോടതി')
self.assertEqual(normalize('കോടതി'), 'കോടതി')

# Remove punctuation
self.assertEqual(normalize('1-ാം'), '1ാം')
self.assertEqual(normalize('1-ാം', keep_punctuations=True), '1-ാം')

# Common Typos
self.assertEqual(normalize('പൂമ്പാററ'), 'പൂമ്പാറ്റ')
self.assertEqual(normalize('ദു:ഖത്തിന്റെ'), 'ദുഃഖത്തിന്റെ')
self.assertEqual(normalize('ദു:ഖത്തിന്റെ', keep_punctuations=True),
'ദുഃഖത്തിന്റെ')

# Alternate Spellings
self.assertEqual(normalize('കാൎത്തുമ്പി'), 'കാർത്തുമ്പി')
self.assertEqual(normalize('ഭാൎയ്യ'), 'ഭാര്യ')

def test_multiline_string(self):
expected = """കുഞ്ചൻ നമ്പ്യാർ
Expand Down
3 changes: 3 additions & 0 deletions test-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,6 @@ testrepository

# for running tests in multiple python versions
tox

# for core library
importlib_resources

0 comments on commit b9eabc4

Please sign in to comment.