Skip to content

Commit

Permalink
fix: simplify algorithm with three-pass strategy
Browse files Browse the repository at this point in the history
first pass: transpose prebase to end of postbase
second pass: translate to unicode
third pass: normalize

Fixes #10
  • Loading branch information
asdofindia committed Oct 22, 2023
1 parent 8688ab1 commit 2a7b231
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 64 deletions.
96 changes: 36 additions & 60 deletions libindic/payyans/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,17 @@
import codecs # കൊടച്ചക്രം
import os # ശീലക്കുട
from libindic.normalizer import Normalizer
from .reader import Reader


'''
പയ്യന്റെ ക്ലാസ് ഉന്നതകുലമാകുന്നു. ച്ചാല്‍ ആഢ്യന്‍ തന്നെ.
ഏ ക്ലാസ് പയ്യന്‍...!
'''

# Unicode values that ASCII2Unicode treats as "prebase": ASCII keys mapped to
# one of these are held back and emitted after the letter that follows them.
prebase_letters = ["േ", "ൈ", "്ര", "െ"]
# Unicode values that ASCII2Unicode treats as "postbase": when one of these
# follows a letter, any held-back prebase key is emitted after it instead of
# directly after the letter.
postbase_letters = ["ാ", "ി", "ീ", "ു", "ൂ", "ൃ", "ൗ", "ം", "ഃ", "്യ", "്വ"]


class Payyans():

Expand Down Expand Up @@ -100,59 +104,40 @@ def Unicode2ASCII(self, unicode_text, font):
return ascii_text

def ASCII2Unicode(self, ascii_text, font):
    """Convert text typed in a legacy ASCII font encoding to Unicode.

    The conversion runs in three passes:

    1. Transpose: keys whose mapped value is a prebase sign are held back
       and re-emitted after the following letter (and after a postbase
       key, if one follows that letter).
    2. Translate: every single-character key of the font map is replaced
       by its mapped Unicode value.
    3. Normalize: the translated text is passed through
       ``self.normalizer``.

    Args:
        ascii_text: Text in the ASCII encoding of *font*.
        font: Font name; rules are loaded from ``maps/<font>.map`` located
            next to this module.

    Returns:
        The converted Unicode string.
    """
    self.direction = "a2u"
    self.mapping_filename = os.path.join(os.path.dirname(__file__),
                                         'maps', font + ".map")
    self.rulesDict = self.LoadRules()

    # Sets keep the per-character membership tests below O(1).
    prebase_ascii_letters = {
        k for k, v in self.rulesDict.items() if v in prebase_letters
    }
    postbase_ascii_letters = {
        k for k, v in self.rulesDict.items() if v in postbase_letters
    }

    # First pass: pick up each prebase key that runs ahead of its letter
    # and carry it along until the letter (plus postbase) has been emitted.
    reader = Reader(ascii_text)
    pieces = []
    prebase = ""
    while reader.has_more_char():
        letter = reader.next_char()
        if letter in prebase_ascii_letters:
            # A later prebase key is placed in front of earlier ones.
            prebase = letter + prebase
        elif letter in postbase_ascii_letters:
            pieces.append(letter + prebase)
            prebase = ""
        else:
            pieces.append(letter)
            # If a postbase key follows, keep holding the prebase so it
            # lands after that key; otherwise flush it now. At end of
            # input peek_next() is None, which is never in the set.
            if reader.peek_next() not in postbase_ascii_letters:
                pieces.append(prebase)
                prebase = ""
    # Flush a prebase key that was never followed by a letter.
    pieces.append(prebase)
    transposed_text = "".join(pieces)

    # Second pass: plain Malayalam. str.maketrans only accepts
    # single-character keys, so multi-character map keys are left out.
    translator = str.maketrans(
        {k: v for k, v in self.rulesDict.items() if len(k) == 1}
    )
    unicode_text = transposed_text.translate(translator)

    # Third pass: join everything up properly.
    unicode_text = self.normalizer.normalize(unicode_text)

    return unicode_text  # converted and handed back

def getVowelSign(self, vowel_letter, vowel_sign_letter):
Expand All @@ -177,12 +162,7 @@ def isPrebase(self, letter):
"തരികിട തരികിടോ ധീംതരികിട" (തരികിട തരികിടയാല്‍)
എന്നു പയ്യന്റെ ഗുരു പയ്യഗുരു പയ്യെ മൊഴിഞ്ഞിട്ടുണ്ടു്.
'''
unicode_letter = letter.encode('utf-8')
prebase_letters = ["േ", "ൈ", "ൊ", "ോ", "ൌ", "്ര", "െ"]
if (unicode_letter in prebase_letters):
return True # "ഇതു സത്യം... അ...സത്യം.... അസത്യം...!"
else:
return False
return letter in prebase_letters

def isPostbase(self, letter):
'''
Expand All @@ -192,11 +172,7 @@ def isPostbase(self, letter):
വ്യഞ്ജനം+പോസ്റ്റ്-ബേസ് കഴിഞ്ഞേ പ്രീ-ബേസ് ചേര്‍ക്കാവൂ!
ഹൊ, പയ്യന്‍ പാണിനീശിഷ്യനാണ്!!
'''
unicode_letter = letter.encode('utf-8')
if ((unicode_letter == "്യ") | (unicode_letter == "്വ")):
return True
else:
return False
return letter in postbase_letters

def LoadRules(self):
'''
Expand Down
2 changes: 1 addition & 1 deletion libindic/payyans/maps/karthika.map
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ ss=ൈ
sm=ൊ
tm=ോ
su=ൌ
v=്
v=്
u=ൗ
¡=ക്ക
¡=ക്ക
Expand Down
19 changes: 19 additions & 0 deletions libindic/payyans/reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
class Reader():
    """Forward-only cursor over a string with one-character lookahead.

    The cursor starts just before the first character (index -1).
    ``next_char`` advances it; ``peek_next`` looks at the following
    character without moving.
    """

    def __init__(self, text):
        """Wrap *text* and place the cursor before its first character."""
        self.text = text
        self.len = len(text)
        # Index of the character most recently returned by next_char().
        self.current_letter = -1

    def has_more_char(self, n=1):
        """Return True if a character exists *n* positions past the cursor."""
        return self.len > self.current_letter + n

    def next_char(self):
        """Advance the cursor and return that character, or None at the end."""
        if not self.has_more_char():
            return None
        self.current_letter += 1
        return self.text[self.current_letter]

    def peek_next(self):
        """Return the next character without advancing, or None at the end."""
        if not self.has_more_char():
            return None
        return self.text[self.current_letter + 1]
11 changes: 8 additions & 3 deletions libindic/payyans/tests/test_payyans.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@ def test_ascii2unicode(self):
"aebmfw", "ambili"), u"മലയാളം")

def test_unicode2ascii(self):
    """Unicode2ASCII encodes a Malayalam word for the 'ambili' font."""
    # One assertion is enough: the block previously checked the same
    # input/output pair twice.
    self.assertEqual(
        self.payyans.Unicode2ASCII("മലയാളം", "ambili"), "aebmfw")

def test_double_swaras(self):
    """ASCII2Unicode handles words with two prebase signs ('karthika')."""
    inputs = ["ss{U", "t{]aw", "kvss{XWX"]
    expected = ["ഡ്രൈ", "പ്രേമം", "സ്ത്രൈണത"]
    for ascii_word, unicode_word in zip(inputs, expected):
        # subTest reports every failing word instead of stopping at the
        # first one.
        with self.subTest(ascii_word=ascii_word):
            actual = self.payyans.ASCII2Unicode(ascii_word, "karthika")
            self.assertEqual(actual, unicode_word)

0 comments on commit 2a7b231

Please sign in to comment.