Skip to content

Commit

Permalink
Merge pull request #25 from kavyamanohar/master
Browse files Browse the repository at this point in the history
Add regex patterns for removing unwanted ZWNJ and ZWJ in Malayalam text
  • Loading branch information
kavyamanohar committed Aug 17, 2024
2 parents fde4b71 + b9bd399 commit fc7730f
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 5 deletions.
24 changes: 21 additions & 3 deletions libindic/normalizer/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import yaml
import os
import string
import re

class Normalizer:
def __init__(self, language_code):
Expand All @@ -32,26 +33,43 @@ def load_rules(self):
rules_path = os.path.join(os.path.dirname(__file__), 'rules', f'normalizer.{self.language_code}.yaml')
if not os.path.exists(rules_path):
raise FileNotFoundError(f"Rules file for language '{self.language_code}' not found.")

with open(rules_path, 'r', encoding='utf-8') as file:
rules = yaml.safe_load(file)

# Compile regex patterns
if 'regex_patterns' in rules:
rules['compiled_regex'] = {}
for pattern, replacement in rules['regex_patterns'].items():
rules['compiled_regex'][re.compile(pattern, re.UNICODE)] = replacement

return rules

def normalize(self, input_text, keep_punctuations=False,normalize_chillus=True, normalize_vowelsigns=True, normalize_typos=True, normalize_alternateforms=True):
def apply_regex_patterns(self, text):
    """Apply every precompiled regex substitution to ``text``.

    The substitutions are read from ``self.rules['compiled_regex']``, a
    mapping of compiled pattern -> replacement string. They are applied
    one after another in the mapping's iteration order, each working on
    the output of the previous one.

    Args:
        text: The string to rewrite.

    Returns:
        The rewritten string, or ``text`` unchanged when the rules contain
        no ``'compiled_regex'`` entry.
    """
    substitutions = self.rules.get('compiled_regex', {})
    result = text
    for regex, repl in substitutions.items():
        result = regex.sub(repl, result)
    return result

def normalize(
    self,
    input_text,
    keep_punctuations=False,
    normalize_chillus=True,
    normalize_vowelsigns=True,
    normalize_typos=True,
    normalize_alternateforms=True,
    apply_regex=True,
):
    """Normalize ``input_text`` using the rule groups loaded in ``self.rules``.

    Each rule group is a mapping of literal substring -> replacement and is
    applied with ``str.replace``. Groups run in a fixed order: chillus,
    vowel signs, typos, alternate forms. A group is skipped when its flag
    is false or when ``self.rules`` has no entry for it.

    Args:
        input_text: The string to normalize.
        keep_punctuations: When true, skip the final ``str.translate`` pass
            through ``self.punctuation_remover``.
        normalize_chillus: Apply the ``'normalize_chillus'`` group.
        normalize_vowelsigns: Apply the ``'normalize_vowelsigns'`` group.
        normalize_typos: Apply the ``'normalize_typos'`` group.
        normalize_alternateforms: Apply the ``'normalize_alternateforms'``
            group.
        apply_regex: Run ``self.apply_regex_patterns`` after the literal
            replacements, provided the rules contain ``'regex_patterns'``.

    Returns:
        The normalized string.
    """
    # Order matters: later groups operate on the output of earlier ones.
    stages = (
        ('normalize_chillus', normalize_chillus),
        ('normalize_vowelsigns', normalize_vowelsigns),
        ('normalize_typos', normalize_typos),
        ('normalize_alternateforms', normalize_alternateforms),
    )
    for section, enabled in stages:
        if not enabled or section not in self.rules:
            continue
        for source, target in self.rules[section].items():
            input_text = input_text.replace(source, target)

    # Regex substitutions run only after all literal replacements.
    if apply_regex and 'regex_patterns' in self.rules:
        input_text = self.apply_regex_patterns(input_text)

    if not keep_punctuations:
        input_text = input_text.translate(self.punctuation_remover)
    return input_text
6 changes: 6 additions & 0 deletions libindic/normalizer/rules/normalizer.ml.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,9 @@ normalize_alternateforms:
"അധ്യാപ": "അദ്ധ്യാപ"
"": ""
"ു്": ""

# Regex substitutions, applied in the order listed.
# Keys are single-quoted so YAML keeps the backslashes literal; the regex
# engine then interprets \u200C (ZWNJ) and \u200D (ZWJ) itself. Explicit
# escapes are used because the raw characters are invisible in most editors.
regex_patterns:
  # Remove ZWNJ at the end of a word. The lookahead leaves the following
  # whitespace untouched, so newlines and tabs are not rewritten as spaces.
  '([^\s]+)\u200C(?=\s)': '\1'
  # Remove ZWNJ at the end of the string.
  '([^\s]+)\u200C$': '\1'
  # Remove ZWNJ after any of the chillu characters.
  '(ൺ|ൻ|ർ|ൽ|ൾ|ൿ|ൔ|ൕ|ൖ)\u200C': '\1'
  # Remove all ZWJ characters.
  '\u200D': ''
9 changes: 7 additions & 2 deletions libindic/normalizer/tests/test_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from .. import Normalizer
normalize = Normalizer('ml').normalize


class MalayalamNormalizerTest(TestCase):

def setUp(self):
Expand Down Expand Up @@ -46,7 +45,13 @@ def test_normalize(self):
self.assertEqual(normalize('കാൎത്തുമ്പി'), 'കാർത്തുമ്പി')
self.assertEqual(normalize('ഭാൎയ്യ'), 'ഭാര്യ')
self.assertEqual(normalize('എൻ്റെ കമ്പ്യൂട്ടറിനു് എന്റെ ഭാഷ.'), 'എന്റെ കമ്പ്യൂട്ടറിന് എന്റെ ഭാഷ')


# Regex pattern for ZWJ and ZWNJ Removal
self.assertEqual(normalize('അവൻ‌ വന്നു'), 'അവൻ വന്നു')
self.assertEqual(normalize('അവൻ‌'), 'അവൻ')
self.assertEqual(normalize('കൺ‌മണി'), 'കൺമണി')
self.assertEqual(normalize('ഹാർഡ്‌വെയർ‌'), 'ഹാർഡ്‌വെയർ')
self.assertEqual(normalize('കാല്‍‍പനികം'), 'കാൽപനികം')

def test_multiline_string(self):
expected = """കുഞ്ചൻ നമ്പ്യാർ
Expand Down

0 comments on commit fc7730f

Please sign in to comment.