Skip to content

Commit

Permalink
Update README with more usage examples, removed rule to correct ൻറ
Browse files Browse the repository at this point in the history
  • Loading branch information
kavyamanohar committed Aug 26, 2024
1 parent 62628ee commit d7c558b
Show file tree
Hide file tree
Showing 4 changed files with 4 additions and 11 deletions.
10 changes: 3 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ Output: Normalized unicode text
>>> from libindic.normalizer import Normalizer
>>> normalizer = Normalizer("ml")
>>> result = normalizer.normalize('ദു:ഖത്തിന്റെ')
>>> result = normalizer.normalize('ഇ–മെയിൽ ദു:ഖത്തിന്റെ ൊന്നിലോ പാൻറ് 2011 സര്വകലാശാല അവള്‍ അവില്‍പാെതി ഹാർഡ്‌വെയർ‌ അവര്ക്ക് കാറ്ഡ് നമ്പറുള്പ്പെടെ പൌരൻ കൺ്മഷി “ഭാൎയ്യ”')
>>> print(result)
>> ദുഃഖത്തിന്റെ
>> ഇമെയിൽ ദുഃഖത്തിന്റെ ഒന്നിലോ പാന്റ് 2011 സർവകലാശാല അവൾ അവിൽപൊതി ഹാർഡ്‌വെയർ അവർക്ക് കാർഡ് നമ്പറുൾപ്പെടെ പൗരൻ കൺമഷി ഭാര്യ
>>> result = normalizer.normalize('പൌരൻ!!', remove_punctuations=False)
>>> print(result)
>> പൗരൻ!!
Expand Down Expand Up @@ -66,10 +66,6 @@ Sample output:
coverage run --source=libindic -m unittest discover -s libindic
.
----------------------------------------------------------------------
Ran 1 test in 0.001s
OK
flake8 --max-complexity 10 libindic
Ran 2 tests in 0.014s
```

1 change: 0 additions & 1 deletion libindic/normalizer/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ def load_rules(self):
# Replace {PUNCTUATION} placeholder with actual punctuation characters
pattern = pattern.replace('{PUNCTUATION}', re.escape(string.punctuation))
rules['compiled_regex'][re.compile(pattern, re.UNICODE)] = replacement

return rules

def apply_regex_patterns(self, text):
Expand Down
1 change: 0 additions & 1 deletion libindic/normalizer/rules/normalizer.ml.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,6 @@ common_mistakes: # Regex patterns for common mistakes in Malayalam raw corpus, A
'പക്ഷെ': 'പക്ഷേ'
'ൻറും' : 'ന്റും'
'ൻറ്': 'ന്റ്'
'ൻറി' : 'ന്റി'
'ുൻപോൾ' : 'ുമ്പോൾ'


3 changes: 1 addition & 2 deletions libindic/normalizer/tests/test_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,7 @@ def test_normalize(self):
self.assertEqual(normalize('ൌന്നത്യം'), 'ഔന്നത്യം')
self.assertEqual(normalize('പാൻറ്'), 'പാന്റ്')
self.assertEqual(normalize('കൺ്മഷി'), 'കൺമഷി')
self.assertEqual(normalize('“ആൻറി”', remove_punctuations=False), '"ആന്റി"')
self.assertEqual(normalize('“ആൻറി', remove_punctuations=True), 'ആന്റി') # This happens by default
self.assertEqual(normalize('“ആൻസി”', remove_punctuations=False), '"ആൻസി"')
self.assertEqual(normalize('അമ്മ’'), 'അമ്മ')
self.assertEqual(normalize('അമ്മ’', remove_punctuations=False), "അമ്മ'")
self.assertEqual(normalize('ഇ–മെയിൽ', remove_punctuations=False), "ഇ-മെയിൽ")
Expand Down

0 comments on commit d7c558b

Please sign in to comment.