Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/plural singular pt #37

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 28 additions & 4 deletions lingua_franca/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,30 @@
import re


def singularize(word, lang=None):
    """Return the singular form of ``word`` for the requested language.

    Args:
        word (str): the word to convert.
        lang (str, optional): language code; it is reduced to a primary
            language code with ``get_primary_lang_code``.

    Returns:
        str: the output of the language-specific singularizer for "en" and
            "pt". For any other language the unsupported language is logged
            and ``word`` is returned unchanged.
    """
    primary_code = get_primary_lang_code(lang)

    if primary_code == "en":
        singular = singularize_en(word)
    elif primary_code == "pt":
        singular = singularize_pt(word)
    else:
        # TODO: Other languages
        _log_unsupported_language(primary_code, ['en', 'pt'])
        singular = word
    return singular


def pluralize(word, lang=None):
    """Return the plural form of ``word`` for the requested language.

    Args:
        word (str): the word to convert.
        lang (str, optional): language code; it is reduced to a primary
            language code with ``get_primary_lang_code``.

    Returns:
        str: the output of the language-specific pluralizer for "en" and
            "pt". For any other language the unsupported language is logged
            and ``word`` is returned unchanged.
    """
    primary_code = get_primary_lang_code(lang)

    if primary_code == "en":
        plural = pluralize_en(word)
    elif primary_code == "pt":
        plural = pluralize_pt(word)
    else:
        # TODO: Other languages
        _log_unsupported_language(primary_code, ['en', 'pt'])
        plural = word
    return plural


def _translate_word(name, lang):
""" Helper to get word tranlations

Expand All @@ -63,7 +87,7 @@ def _translate_word(name, lang):

lang_code = get_full_lang_code(lang)

filename = resolve_resource_file(join("text", lang_code, name+".word"))
filename = resolve_resource_file(join("text", lang_code, name + ".word"))
if filename:
# open the file
try:
Expand Down Expand Up @@ -141,7 +165,7 @@ def _number_strings(self, number, lang):
x_in_x000 = self.lang_config[lang]['number'].get(str(int(
number % 10000 / 1000))) or str(int(number % 10000 / 1000))
x0_in_x000 = self.lang_config[lang]['number'].get(str(int(
number % 10000 / 1000)*10)) or str(int(number % 10000 / 1000)*10)
number % 10000 / 1000) * 10)) or str(int(number % 10000 / 1000) * 10)
x_in_0x00 = self.lang_config[lang]['number'].get(str(int(
number % 1000 / 100)) or str(int(number % 1000 / 100)))

Expand Down Expand Up @@ -242,7 +266,7 @@ def year_format(self, dt, lang, bc):


date_time_format = DateTimeFormat(os.path.join(os.path.dirname(__file__),
'res/text'))
'res/text'))


def nice_number(number, lang=None, speech=True, denominators=None):
Expand Down Expand Up @@ -520,7 +544,7 @@ def nice_duration(duration, lang=None, speech=True):
out += str(hours) + ":"
if minutes < 10 and (hours > 0 or days > 0):
out += "0"
out += str(minutes)+":"
out += str(minutes) + ":"
if seconds < 10:
out += "0"
out += str(seconds)
Expand Down
55 changes: 55 additions & 0 deletions lingua_franca/lang/common_data_pt.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from lingua_franca.lang.parse_common import invert_dict

# Indefinite articles ["um", "uma", "uns", "umas"] cannot be suppressed:
# in PT, "um cavalo" means "a horse" or "one horse".

Expand All @@ -18,6 +20,59 @@
_MALE_DETERMINANTS_PT = ["o", "os", "este", "estes", "esse", "esses"]
# Feminine determinants, in singular/plural pairs: a/as, esta/estas,
# essa/essas.
_FEMALE_DETERMINANTS_PT = ["a", "as", "esta", "estas", "essa", "essas"]

# constants used for singularize / pluralize
# Portuguese vowels, plain and accented (tilde, acute, grave, circumflex).
# The circumflex forms matter for words such as "você" or "avô", which end
# in a vowel and therefore take a plain "s" in the plural.
_VOWELS_PT = ["a", "ã", "á", "à", "â",
              "e", "é", "è", "ê",
              "i", "ì", "í",
              "o", "ó", "ò", "õ", "ô",
              "u", "ú", "ù"]

# Words and multi-word expressions treated as invariant: the pt
# singularize / pluralize helpers test membership in this list before
# applying any other rule.
# NOTE(review): "menos" is listed twice; harmless for membership tests.
_INVARIANTS_PT = ["ontem", "depressa", "ali", "além", "sob", "por", "contra", "desde", "entre",
"até", "perante", "porém", "contudo", "todavia", "entretanto", "senão", "portanto",
"oba", "eba", "exceto", "excepto", "apenas", "menos", "também", "inclusive", "aliás",
"que", "onde", "isto", "isso", "aquilo", "algo", "alguém", "nada", "ninguém", "tudo", "cada",
"outrem", "quem", "mais", "menos", "demais",
# NOTE: some words are omitted because their behaviour depends on the POS tag
# NOTE: the multi-word expressions below are also invariant
"ou melhor", "isto é", "por exemplo", "a saber", "digo", "ou seja",
"por assim dizer", "com efeito", "ou antes"]

# Irregular plurals, keyed by singular form; looked up by the pt pluralizer
# before any suffix rule is applied.
# NOTE(review): "cais" is the plural of both "cais" and "cal", so the two
# entries share a value — check how the inverted (plural -> singular) dict
# resolves that collision.
_PLURAL_EXCEPTIONS_PT = {
"cânon": "cânones",
"cós": "coses",  # "cós" (unchanged word) is also valid
"cais": "cais",
"xis": "xis",
"mal": "males",
"cônsul": "cônsules",
"mel": "méis",  # "meles" also valid
"fel": "féis",  # "feles" also valid
"cal": "cais",  # "cales" also valid
"aval": "avais",  # "avales" also valid
"mol": "móis",  # "moles" also valid
"real": "réis",
"fax": "faxes",
"cálix": "cálices",
"índex": "índices",
"apêndix": "apêndices",
"hélix": "hélices",
"hálux": "háluces",
"códex": "códices",
"fénix": "fénixes",  # "fénix" also valid
"til": "tis",  # "tiles" also valid
"pão": "pães",
"cão": "cães",
"alemão": "alemães",
"balão": "balões",
"anão": "anões",
"dez": "dez",
"três": "três",
"seis": "seis"
}

# Lookup table derived from _PLURAL_EXCEPTIONS_PT via invert_dict.
# NOTE(review): presumably invert_dict swaps keys and values, giving a
# plural -> singular mapping — confirm against parse_common.
# In general, words that end with "s" in singular form should be added to
# the exceptions dict.
_SINGULAR_EXCEPTIONS_PT = invert_dict(_PLURAL_EXCEPTIONS_PT)

# constants for number handling
_NUMBERS_PT = {
"zero": 0,
"um": 1,
Expand Down
13 changes: 11 additions & 2 deletions lingua_franca/lang/format_en.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from lingua_franca.lang.format_common import convert_to_mixed_fraction
from lingua_franca.lang.common_data_en import _NUM_STRING_EN, \
_FRACTION_STRING_EN, _LONG_SCALE_EN, _SHORT_SCALE_EN
import inflection


def nice_number_en(number, speech, denominators):
Expand Down Expand Up @@ -144,7 +145,7 @@ def pronounce_number_en(num, places=2, short_scale=True, scientific=False):
if _num[3:4] == '0':
last = number_names[int(_num[2:4])]
else:
second = number_names[int(_num[2:3])*10]
second = number_names[int(_num[2:3]) * 10]
last = second + " " + number_names[int(_num[3:4])]
return first + " " + last
# exception used to catch any unforseen edge cases
Expand Down Expand Up @@ -214,7 +215,7 @@ def _long_scale(n):
# plus one as we skip 'thousand'
# (and 'hundred', but this is excluded by index value)
number = number.replace(',', '')
number += " " + hundreds[i+1]
number += " " + hundreds[i + 1]
res.append(number)
return ", ".join(reversed(res))

Expand Down Expand Up @@ -318,3 +319,11 @@ def nice_time_en(dt, speech=True, use_24hour=False, use_ampm=False):
speak += " a.m."

return speak


def singularize_en(word):
    """Return the singular form of an English word.

    Delegates to ``inflection.singularize``.
    """
    singular_form = inflection.singularize(word)
    return singular_form


def pluralize_en(word):
    """Return the plural form of an English word.

    Delegates to ``inflection.pluralize``.
    """
    plural_form = inflection.pluralize(word)
    return plural_form
64 changes: 63 additions & 1 deletion lingua_franca/lang/format_pt.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,69 @@

from lingua_franca.lang.format_common import convert_to_mixed_fraction
from lingua_franca.lang.common_data_pt import _FRACTION_STRING_PT, \
_NUM_STRING_PT
_NUM_STRING_PT, _VOWELS_PT, _PLURAL_EXCEPTIONS_PT, _SINGULAR_EXCEPTIONS_PT, _INVARIANTS_PT


def singularize_pt(word):
    """Return the singular form of a Portuguese word.

    Invariant words are returned unchanged and known irregular plurals are
    looked up in ``_SINGULAR_EXCEPTIONS_PT``; everything else goes through
    simple suffix rules.

    The word is assumed to be plural: nothing here can verify that, so a
    singular word ending in "s" may be wrongly mutated unless it is listed
    in the exceptions dict.

    Args:
        word (str): the word to singularize, assumed to be in plural form.

    Returns:
        str: the singular form, or ``word`` itself when no rule applies.
    """
    if word in _INVARIANTS_PT:
        # _INVARIANTS_PT is a plain list: membership is the whole lookup
        return word
    if word in _SINGULAR_EXCEPTIONS_PT:
        return _SINGULAR_EXCEPTIONS_PT[word]
    # TODO implement is_plural helper
    # Only the trailing suffix is rewritten: str.rstrip() would strip a *set*
    # of characters ("países" -> "paí") and str.replace() would touch every
    # occurrence in the word.
    # Order matters: "ões" / "ães" must be tried before the generic "es" / "s".
    suffix_rules = (
        ("is", "il"),
        ("ões", "ão"),
        ("ães", "ão"),
        ("es", ""),
        ("s", ""),
    )
    for plural_ending, singular_ending in suffix_rules:
        if word.endswith(plural_ending):
            return word[:-len(plural_ending)] + singular_ending
    return word


def pluralize_pt(word):
    """Return the plural form of a Portuguese word.

    Invariant words (and empty input) are returned unchanged and known
    irregular plurals are looked up in ``_PLURAL_EXCEPTIONS_PT``; everything
    else goes through suffix rules based on the word ending.

    Args:
        word (str): the word to pluralize, assumed to be in singular form.

    Returns:
        str: the plural form, or ``word`` itself when no rule applies.
    """
    if not word or word in _INVARIANTS_PT:
        # _INVARIANTS_PT is a plain list: membership is the whole lookup
        return word
    if word in _PLURAL_EXCEPTIONS_PT:
        return _PLURAL_EXCEPTIONS_PT[word]
    if word.endswith("x"):
        return word
    if word.endswith("s"):
        # if word is an oxytone, add "es", else word remains unchanged
        # this check is overly simplified but should work 99% of the time
        # https://en.wikipedia.org/wiki/Oxytone
        # Slicing (rather than word[-2] / word[-3]) keeps one- and two-letter
        # words from raising IndexError.
        if any(letter in _VOWELS_PT for letter in word[-3:-1]):
            return word + "es"
        return word
    if word.endswith("ão"):
        # can end with "ãos", "ães" or "ões"; most times they are all valid,
        # the other times the word is hopefully in the exceptions dict
        # TODO check if numeric, then it's always "ões"
        return word + "s"
    if word[-1] in _VOWELS_PT:
        # if word ends with a vowel add an "s"
        return word + "s"
    if word.endswith(("r", "z", "n")):
        return word + "es"
    if word.endswith(("al", "el", "ol", "ul")):
        # drop the final "l" only, then add "is"
        return word[:-1] + "is"
    if word.endswith("il"):
        return word[:-1] + "s"
    if word.endswith("m"):
        return word[:-1] + "ns"
    # foreign words that have been "unportuguesified" have an "s" added
    # simple check is looking for endings that don't exist in portuguese
    if word.endswith(("w", "y", "k", "t")):
        return word + "s"
    return word


def nice_number_pt(number, speech, denominators):
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
python-dateutil==2.6.0
python-dateutil==2.6.0
inflection
36 changes: 33 additions & 3 deletions test/test_format.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from lingua_franca.format import pronounce_number
from lingua_franca.format import date_time_format
from lingua_franca.format import join_list
from lingua_franca.format import singularize, pluralize

NUMBERS_FIXTURE_EN = {
1.435634: '1.436',
Expand Down Expand Up @@ -186,12 +187,12 @@ def test_auto_scientific_notation(self):
"power of negative one hundred "
"and fifty")
# value is platform dependent so better not use in tests?
#self.assertEqual(
# self.assertEqual(
# pronounce_number(sys.float_info.min), "two point two two times "
# "ten to the power of "
# "negative three hundred "
# "and eight")
#self.assertEqual(
# self.assertEqual(
# pronounce_number(sys.float_info.max), "one point seven nine "
# "times ten to the power of"
# " three hundred and eight")
Expand Down Expand Up @@ -519,7 +520,7 @@ def test_nice_year(self):
self.assertTrue(len(nice_year(dt, lang=lang)) > 0)
# Looking through the date sequence can be helpful

# print(nice_year(dt, lang=lang))
# print(nice_year(dt, lang=lang))

def test_nice_duration(self):
self.assertEqual(nice_duration(1), "one second")
Expand Down Expand Up @@ -556,5 +557,34 @@ def test_join(self):
self.assertEqual(join_list([1, "b", 3, "d"], "or"), "1, b, 3 or d")


class TestInflection(unittest.TestCase):
    """Checks ``singularize`` / ``pluralize`` called without a language."""

    def test_singularize(self):
        """Each plural input yields the expected singular form."""
        expectations = [
            ("posts", "post"),
            ("octopi", "octopus"),
            ("sheep", "sheep"),
            # test already singular
            ("word", "word"),
            # test garbage
            ("CamelOctopi", "CamelOctopus"),
        ]
        for given, expected in expectations:
            self.assertEqual(singularize(given), expected)

    def test_pluralize(self):
        """Each singular input yields the expected plural form."""
        expectations = [
            ("post", "posts"),
            ("octopus", "octopi"),
            ("sheep", "sheep"),
            # test already plural
            ("words", "words"),
            # irregular plurals
            ("person", "people"),
            ("man", "men"),
            ("human", "humans"),
            ('child', 'children'),
            ('sex', 'sexes'),
            ('move', 'moves'),
            ('cow', 'kine'),
            ('zombie', 'zombies'),
            # test garbage
            ("CamelOctopus", "CamelOctopi"),
        ]
        for given, expected in expectations:
            self.assertEqual(pluralize(given), expected)


# Run the test suite when this file is executed directly.
if __name__ == "__main__":
    unittest.main()