Skip to content

Commit

Permalink
Merge pull request #24 from kavyamanohar/master
Browse files Browse the repository at this point in the history
Adding provision to keep language specific rules
  • Loading branch information
kavyamanohar committed Aug 15, 2024
2 parents b9eabc4 + 19f49c6 commit fde4b71
Show file tree
Hide file tree
Showing 8 changed files with 91 additions and 148 deletions.
16 changes: 8 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
# LibIndic Normalizer

LibIndic's normalizer module may be used to normalize the text to a canonical
format to handle inconsistencies in text. Right now, it supports
Malayalam language only.
format to handle inconsistencies in text. It applies normalization rules based on the language code. Right now, it supports the Malayalam language only.

## Features

Expand Down Expand Up @@ -31,18 +30,19 @@ Note: Prefer using virtualenv for installation as the library is in experimental

## Usage
```
Input: Unicode text...
Input: Unicode text
Output: Normalized unicode text
>>> from libindic.normalizer import Normalizer
>>> normalizer = Normalizer()
>>> result = normalizer.normalize(u'പൂമ്പാററ')
>>> normalizer = Normalizer("ml")
>>> result = normalizer.normalize('ദു:ഖത്തിന്റെ')
>>> print(result)
പൂമ്പാറ്റ
ദുഃഖത്തിന്റെ
>>> result = normalizer.normalize('പൌരൻ!!', keep_punctuations=True)
>>> print(result)
പൗരൻ!!
```

For more details read the [docs](http://indicstemmer.rtfd.org/)

## Running tests
To run tests,

Expand Down
6 changes: 3 additions & 3 deletions libindic/normalizer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# -*- coding: utf-8 -*-
__all__ = ["Normalizer", "getInstance"]
from .core import Normalizer, getInstance
from .core import Normalizer

__all__ = ['Normalizer']
123 changes: 31 additions & 92 deletions libindic/normalizer/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,101 +18,40 @@
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#

import re
import yaml
import os
import string
from importlib_resources import files


class Normalizer:

def __init__(self):
self.rules_file = files("libindic.normalizer.rules").joinpath(
"normalizer_ml.rules"
)
self.rulesDict = self.LoadRules()
pattern = "|".join(map(re.escape, self.rulesDict.keys()))
self.regex = re.compile(pattern)
def __init__(self, language_code):
    """Set up a normalizer for the language named by ``language_code``.

    Stores the language code, builds the translation table used to strip
    punctuation, and loads that language's rules through ``load_rules()``.
    """
    self.language_code = language_code
    # Maps every character of string.punctuation to None, i.e. deletes it
    # when passed to str.translate().
    self.punctuation_remover = str.maketrans(
        dict.fromkeys(string.punctuation)
    )
    self.rules = self.load_rules()

def normalize(self, text, keep_punctuations=False):
replaced = self.regex.sub(
lambda match: self.rulesDict[match.group(0)], text
)
def load_rules(self):
    """Load the normalization rules for ``self.language_code``.

    The rules are read from ``rules/normalizer.<language_code>.yaml`` in
    the directory that contains this module.

    Returns:
        The parsed YAML document, or an empty dict when the rules file
        contains no document.

    Raises:
        FileNotFoundError: if no rules file exists for the language code.
    """
    rules_path = os.path.join(
        os.path.dirname(__file__),
        'rules',
        f'normalizer.{self.language_code}.yaml',
    )
    if not os.path.exists(rules_path):
        raise FileNotFoundError(
            f"Rules file for language '{self.language_code}' not found."
        )

    with open(rules_path, 'r', encoding='utf-8') as file:
        rules = yaml.safe_load(file)

    # safe_load() yields None for an empty file; callers run membership
    # tests (``'section' in rules``) on the result, which would raise
    # TypeError on None, so hand back an empty mapping instead.
    return {} if rules is None else rules

def normalize(self, input_text, keep_punctuations=False,normalize_chillus=True, normalize_vowelsigns=True, normalize_typos=True, normalize_alternateforms=True):
if normalize_chillus and 'normalize_chillus' in self.rules:
for key, value in self.rules['normalize_chillus'].items():
input_text = input_text.replace(key, value)

if normalize_vowelsigns and 'normalize_vowelsigns' in self.rules:
for key, value in self.rules['normalize_vowelsigns'].items():
input_text = input_text.replace(key, value)

if normalize_typos and 'normalize_typos' in self.rules:
for key, value in self.rules['normalize_typos'].items():
input_text = input_text.replace(key, value)
if normalize_alternateforms and 'normalize_alternateforms' in self.rules:
for key, value in self.rules['normalize_alternateforms'].items():
input_text = input_text.replace(key, value)
if keep_punctuations:
return replaced
return replaced.translate(self.punctuation_remover)

def LoadRules(self):
rules_dict = dict()
line = []
line_number = 0
rules_file = self.rules_file.open()
while True:
line_number = line_number + 1
text_raw = rules_file.readline()
try:
text = text_raw.decode('utf-8')
except (AttributeError, UnicodeEncodeError):
text = text_raw
if text == "":
break
if text[0] == '#':
continue # this is a comment - ignore
text = text.split("#")[0] # remove the comment part of the line
line = text.strip() # remove unwanted space
if (line == ""):
continue
if (len(line.split("=")) != 2):
print(
"[Error] Syntax Error in the Rules. Line number: ",
line_number)
print("Line: " + text)
continue
lhs = line.split("=")[0].strip()
rhs = line.split("=")[1].strip()
rules_dict[lhs] = rhs
rules_file.close()
return rules_dict

def process(self, form):
response = """
<h2>Normalizer</h2></hr>
<p>Enter the text for normalizing in the below text area.
Language of each word will be detected.
You can give the text in any language and even with mixed language
</p>
<form action="" method="post">
<textarea cols='100' rows='25' name='input_text' id='id1'>\
%s\
</textarea>
<input type="submit" id="Stem" value="Normalize" name="action" \
style="width:12em;"/>
<input type="reset" value="Clear" style="width:12em;"/>
</br>
</form>
"""
if ('input_text' in form):
text = form['input_text'].value.decode('utf-8')
response = response % text
result_dict = self.normalize(text)
response = response + "<h2>Normalized Result</h2></hr>"
response = response + \
"<table class=\"table1\"><tr><th>Word</th>\
<th>Normalized form</th></tr>"
for key in result_dict:
response = response + "<tr><td>" + key + \
"</td><td>" + result_dict[key] + "</td></tr>"
response = response + "</table>"
else:
response = response % ""
return response

def get_module_name(self):
return "Normalizer"

def get_info(self):
return "Malayalam Normalizer(Experimental)"


def getInstance():
return Normalizer()
return input_text
return input_text.translate(self.punctuation_remover)
6 changes: 6 additions & 0 deletions libindic/normalizer/rules/normalizer.en.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
normalize_alternateforms:
colour: color
favourite: favorite
theatre: theater
centre: center

33 changes: 33 additions & 0 deletions libindic/normalizer/rules/normalizer.ml.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
normalize_chillus:
"ണ്‍": "ൺ"
"ന്‍": "ൻ"
"ര്‍": "ർ"
"ല്‍": "ൽ"
"ള്‍": "ൾ"
"ക്‍": "ൿ"

normalize_vowelsigns:
"െെ": ""
"ൊ": ""
"ാെ": ""
"ോ": ""
"ാേ": ""
"ൌ": ""
"ൗെ": ""
"": ""
"െഎ": ""
"ഇൗ": ""
"ഉൗ": ""
"ഒൗ": ""

normalize_typos:
"ദു:ഖ": "ദുഃഖ"
"നമ:": "നമഃ"
"ററ": "റ്റ" # Fails for കണ്ടംപററി

normalize_alternateforms:
"ൻ്റ": "ന്റ"
"ൎയ്യ": "ര്യ"
"അധ്യാപ": "അദ്ധ്യാപ"
"": ""
"ു്": ""
38 changes: 0 additions & 38 deletions libindic/normalizer/rules/normalizer_ml.rules

This file was deleted.

14 changes: 8 additions & 6 deletions libindic/normalizer/tests/test_normalizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,20 @@
from testtools import TestCase

from .. import Normalizer
normalize = Normalizer().normalize
normalize = Normalizer('ml').normalize


class MalayalamNormalizerTest(TestCase):

def setUp(self):
super(MalayalamNormalizerTest, self).setUp()
self.normalizer = Normalizer()
self.normalizer = Normalizer('ml')

def test_normalize(self):
self.assertEqual(self.normalizer.normalize(u'പൂമ്പാററ'), u'പൂമ്പാറ്റ')

# The chillus (ണ്‍ന്‍ര്‍ല്‍ള്‍ക്‍) defined by zero width joiners to be
# replaced with atomic chillus (ൺൻർൽൾൿ).

self.assertEqual(self.normalizer.normalize(u'അവിൽ'), u'അവിൽ')
self.assertEqual(self.normalizer.normalize(u'അവില്‍'), u'അവിൽ')
self.assertEqual(self.normalizer.normalize(u'രമണൻ'), u'രമണൻ')
self.assertEqual(self.normalizer.normalize(u'അവൾ'), u'അവൾ')
self.assertEqual(self.normalizer.normalize(u'ശ്രാവൺ'), u'ശ്രാവൺ')
Expand All @@ -31,8 +29,10 @@ def test_normalize(self):
self.assertEqual(normalize('അവില്‍പാെതി'), 'അവിൽപൊതി')
self.assertEqual(normalize('കാേടതി'), 'കോടതി')
self.assertEqual(normalize('കോടതി'), 'കോടതി')
self.assertEqual(normalize('പൌരൻ!!', keep_punctuations=True), 'പൗരൻ!!')


# Remove punctuations
# Remove punctuation
self.assertEqual(normalize('1-ാം'), '1ാം')
self.assertEqual(normalize('1-ാം', keep_punctuations=True), '1-ാം')

Expand All @@ -45,6 +45,8 @@ def test_normalize(self):
# Alternate Spellings
self.assertEqual(normalize('കാൎത്തുമ്പി'), 'കാർത്തുമ്പി')
self.assertEqual(normalize('ഭാൎയ്യ'), 'ഭാര്യ')
self.assertEqual(normalize('എൻ്റെ കമ്പ്യൂട്ടറിനു് എന്റെ ഭാഷ.'), 'എന്റെ കമ്പ്യൂട്ടറിന് എന്റെ ഭാഷ')


def test_multiline_string(self):
expected = """കുഞ്ചൻ നമ്പ്യാർ
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ classifiers = [
"Programming Language :: Python",
]
dependencies = [
"importlib-resources"
"importlib-resources",
"pyyaml"
]

[project.urls]
Expand Down

0 comments on commit fde4b71

Please sign in to comment.