Skip to content

Commit

Permalink
Add rules to replace unicode quotes with ASCII quotes; EMDASH and ENDA…
Browse files Browse the repository at this point in the history
…SH with hyphen
  • Loading branch information
kavyamanohar committed Aug 26, 2024
1 parent 940e46e commit 02366fc
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 15 deletions.
8 changes: 4 additions & 4 deletions libindic/normalizer/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def apply_regex_patterns(self, text):
text = pattern.sub(replacement, text)
return text

def normalize(self, input_text, keep_punctuations=False, normalize_chillus=True, normalize_vowelsigns=True, normalize_alternateforms=True, correct_commonmistakes = True):
def normalize(self, input_text, remove_punctuations=True, normalize_chillus=True, normalize_vowelsigns=True, normalize_alternateforms=True, correct_commonmistakes = True):
if normalize_chillus and 'normalize_chillus' in self.rules:
for key, value in self.rules['normalize_chillus'].items():
input_text = input_text.replace(key, value)
Expand All @@ -68,6 +68,6 @@ def normalize(self, input_text, keep_punctuations=False, normalize_chillus=True,
if correct_commonmistakes and 'common_mistakes' in self.rules:
input_text = self.apply_regex_patterns(input_text)

if keep_punctuations:
return input_text
return input_text.translate(self.punctuation_remover)
if remove_punctuations:
return input_text.translate(self.punctuation_remover) # Removes only ASCII punctuations
return input_text
26 changes: 23 additions & 3 deletions libindic/normalizer/rules/normalizer.ml.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
normalize_chillus:
"ണ്‍": ""
"ണ്‍": "" # ZWJ chillus to atomic chillus
"ന്‍": ""
"ര്‍": ""
"ല്‍": ""
Expand Down Expand Up @@ -28,11 +28,31 @@ normalize_alternateforms:
"": ""
"ു്": ""

common_mistakes: # Regex patterns for common mistakes
# text=text.replace('„', r'"')
# text=text.replace('“', r'"')
# text=text.replace('”', r'"')
# text=text.replace('–', r'-')
# text=text.replace('—', r' - ')
# text=text.replace('´', r"'")
# text=text.replace('‘', r"'")
# text=text.replace('‚', r"'")
# text=text.replace('’', r"'")
# text=text.replace("''", r'"')
# text=text.replace('´´', r'"')
# text=text.replace('…', r'...')

common_mistakes: # Regex patterns for common mistakes in Malayalam raw corpus, ASR/OCR outputs
'(“|”)': '"' # Replace Unicode left/right double quotes with ASCII double quotes
"(‘|’)": "'" # Replace Unicode left/right single quotes with ASCII single quotes/apostrophe
"(–|—’)": "-" # Replace ENDASH and EMDASH with ASCII hyphen. NOTE(review): the pattern only matches EMDASH when it is followed by ’ — the stray ’ looks unintended; confirm
'': '...' # Replace Ellipsis with ASCII dots
'(\u00A0|\u200B)': ' ' # Replace NO_BREAK_SPACE='\u00A0' and ZERO_WIDTH_SPACE='\u200B' with normal space
'\uFEFF|\uFFFE|\u2060|\u00AD' : '' # Remove BYTE_ORDER_MARK='\uFEFF', BYTE_ORDER_MARK_2='\uFFFE', WORD_JOINER='\u2060', SOFT_HYPHEN='\u00AD'
'([^\s]+)\u200c([{PUNCTUATION}\s])': '\1\2' # Remove ZWNJ at the end of words followed by ASCII punctuation or space.
# The PUNCTUATIONS are defined in core.py where regex is compiled.
'([^\s]+)\u200c$': '\1' # Remove ZWNJ at the end of the string
'(ൺ|ൻ|ർ|ൽ|ൾ|ൿ|ൔ|ൕ|ൖ)\u200c': '\1' # Remove ZWNJ after any of the chillu characters
'(ൺ|ൻ|ർ|ൽ|ൾ|ൿ|ൔ|ൕ|ൖ)്': '\1' # Remove virama after any of the chillu characters
'\u200D': '' # Remove all ZWJ characters
'\u200B': '' # Remove all Zero Width space characters
'\u00AD': '' # Remove all soft hyphen characters
Expand All @@ -59,7 +79,7 @@ common_mistakes: # Regex patterns for common mistakes
'പക്ഷെ': 'പക്ഷേ'
'ൻറും' : 'ന്റും'
'ൻറ്': 'ന്റ്'
'ൻറിൽ' : 'ന്റിൽ'
'ൻറി' : 'ന്റി'
'ുൻപോൾ' : 'ുമ്പോൾ'


19 changes: 11 additions & 8 deletions libindic/normalizer/tests/test_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,12 @@ def test_normalize(self):
self.assertEqual(normalize('അവില്‍പാെതി'), 'അവിൽപൊതി')
self.assertEqual(normalize('കാേടതി'), 'കോടതി')
self.assertEqual(normalize('കോടതി'), 'കോടതി')
self.assertEqual(normalize('പൌരൻ!!', keep_punctuations=True), 'പൗരൻ!!')
self.assertEqual(normalize('പൌരൻ!!', remove_punctuations=False), 'പൗരൻ!!')


# # Remove punctuations
self.assertEqual(normalize('1-ാം'), '1ാം')
self.assertEqual(normalize('1-ാം', keep_punctuations=True), '1-ാം')
self.assertEqual(normalize('1-ാം', remove_punctuations=False), '1-ാം')

# Alternate Spellings
self.assertEqual(normalize('കാൎത്തുമ്പി'), 'കാർത്തുമ്പി')
Expand Down Expand Up @@ -60,21 +60,24 @@ def test_normalize(self):
self.assertEqual(normalize('കാറ് '), 'കാറ് ')
self.assertEqual(normalize('പൂമ്പാററ'), 'പൂമ്പാറ്റ')
self.assertEqual(normalize('കാറ്റ്'), 'കാറ്റ്')
self.assertEqual(normalize('ദു:ഖത്തിന്റെ'), 'ദുഃഖത്തിന്റെ')
self.assertEqual(normalize('ദു:ഖത്തിന്റെ', keep_punctuations=True),
self.assertEqual(normalize('ദു:ഖത്തിന്റെ'), 'ദുഃഖത്തിന്റെ')
self.assertEqual(normalize('ദു:ഖത്തിന്റെ', remove_punctuations=False),
'ദുഃഖത്തിന്റെ')
self.assertEqual(normalize(' ൊന്നിലോ'), ' ഒന്നിലോ')
self.assertEqual(normalize('ൌന്നത്യം'), 'ഔന്നത്യം')
self.assertEqual(normalize('പാൻറ്'), 'പാന്റ്')


self.assertEqual(normalize('കൺ്മഷി'), 'കൺമഷി')
self.assertEqual(normalize('“ആൻറി”', remove_punctuations=False), '"ആന്റി"')
self.assertEqual(normalize('“ആൻറി', remove_punctuations=True), 'ആന്റി') # This happens by default
self.assertEqual(normalize('അമ്മ’'), 'അമ്മ')
self.assertEqual(normalize('അമ്മ’', remove_punctuations=False), "അമ്മ'")



def test_multiline_string(self):
expected = """കുഞ്ചൻ നമ്പ്യാർ
ചെണ്ടമേളം"""
ചെണ്ടമേളം"""
input = """കുഞ്ചന്‍ നമ്പ്യാര്‍
ചെണ്ടമേളം"""
ചെണ്ടമേളം"""
actual = self.normalizer.normalize(input)
self.assertEqual(actual, expected)

0 comments on commit 02366fc

Please sign in to comment.