From 3e7dd61a2869fb70c9d41d2bdffeaaa545da60c1 Mon Sep 17 00:00:00 2001 From: jarbasai Date: Thu, 14 Jul 2022 00:10:51 +0100 Subject: [PATCH] get_plural_form_en/pt port https://github.com/MycroftAI/lingua-franca/pull/36 + https://github.com/MycroftAI/lingua-franca/pull/37 add pt pluralizations.json add tests --- lingua_franca/format.py | 4 +- lingua_franca/lang/common_data_pt.py | 57 +++++++++++++ lingua_franca/lang/format_en.py | 20 ++++- lingua_franca/lang/format_pt.py | 84 ++++++++++++++++++- .../res/text/pt-pt/pluralizations.json | 24 ++++++ requirements/requirements.txt | 3 +- test/unittests/test_format_en.py | 58 ++++++++----- test/unittests/test_format_pt.py | 31 +++++++ test/unittests/test_format_sl.py | 22 +++-- 9 files changed, 270 insertions(+), 33 deletions(-) create mode 100644 lingua_franca/res/text/pt-pt/pluralizations.json diff --git a/lingua_franca/format.py b/lingua_franca/format.py index 8cfcbbe8..35bc51a1 100755 --- a/lingua_franca/format.py +++ b/lingua_franca/format.py @@ -636,7 +636,7 @@ def get_plural_category(amount, type=PluralCategory.CARDINAL, lang=""): raise FunctionNotLocalizedError("This function has not been implemented in the specified language.") -@localized_function() +@localized_function(run_own_code_on=[FunctionNotLocalizedError]) def get_plural_form(word, amount, type=PluralCategory.CARDINAL, lang=""): """ Get plural form of the specified word for the specified amount. @@ -651,3 +651,5 @@ def get_plural_form(word, amount, type=PluralCategory.CARDINAL, lang=""): Returns: (str): Pluralized word. """ + warn(RuntimeWarning("Pluralization has not been implemented in the specified language. Word unchanged")) + return word diff --git a/lingua_franca/lang/common_data_pt.py b/lingua_franca/lang/common_data_pt.py index 3d214009..0afe4476 100644 --- a/lingua_franca/lang/common_data_pt.py +++ b/lingua_franca/lang/common_data_pt.py @@ -1,3 +1,6 @@ +from lingua_franca.lang.parse_common import invert_dict + + _FUNCTION_NOT_IMPLEMENTED_WARNING = "esta função não foi implementada em 'pt'" # Undefined articles ["um", "uma", "uns", "umas"] can not be supressed, @@ -20,6 +23,60 @@ _MALE_DETERMINANTS_PT = ["o", "os", "este", "estes", "esse", "esses"] _FEMALE_DETERMINANTS_PT = ["a", "as", "estas", "estas", "essa", "essas"] + +# constants used for singularize / pluralize +_VOWELS_PT = ["a", "ã", "á", "à", + "e", "é", "è", + "i", "ì", "í", + "o", "ó", "ò", "õ", + "u", "ú", "ù"] + +_INVARIANTS_PT = ["ontem", "depressa", "ali", "além", "sob", "por", "contra", "desde", "entre", + "até", "perante", "porém", "contudo", "todavia", "entretanto", "senão", "portanto", + "oba", "eba", "exceto", "excepto", "apenas", "menos", "também", "inclusive", "aliás", + "que", "onde", "isto", "isso", "aquilo", "algo", "alguém", "nada", "ninguém", "tudo", "cada", + "outrem", "quem", "mais", "menos", "demais", + # NOTE some words ommited because it depends on POS_TAG + # NOTE these multi word expressions are also invariant + "ou melhor", "isto é", "por exemplo", "a saber", "digo", "ou seja", + "por assim dizer", "com efeito", "ou antes"] + +_PLURAL_EXCEPTIONS_PT = { + "cânon": "cânones", + "cós": "coses", # cós (unchanged word) is also valid + "cais": "cais", + "xis": "xis", + "mal": "males", + "cônsul": "cônsules", + "mel": "méis", # "meles" also valid + "fel": "féis", # "feles" also valid + "cal": "cais", # "cales" also valid + "aval": "avais", # "avales also valid + "mol": "móis", # "moles also valid + "real": "réis", + "fax": "faxes", + "cálix": "cálices", + "índex": "índices", + "apêndix": "apêndices", + "hélix": "hélices", + "hálux": "háluces", + "códex": "códices", + "fénix": "fénixes", # "fénix" also valid + "til": "tis", # "tiles" also valid + "pão": "pães", + "cão": "cães", + "alemão": "alemães", + "balão": "balões", + "anão": "anões", + "dez": "dez", + "três": "três", + "seis": "seis" +} + +# in general words that end with "s" in singular form should be added bellow +_SINGULAR_EXCEPTIONS_PT = invert_dict(_PLURAL_EXCEPTIONS_PT) + +# constants for number handling _NUMBERS_PT = { "zero": 0, "um": 1, diff --git a/lingua_franca/lang/format_en.py b/lingua_franca/lang/format_en.py index 8ab3c837..1ee58a06 100644 --- a/lingua_franca/lang/format_en.py +++ b/lingua_franca/lang/format_en.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # - +import inflection from lingua_franca.lang.format_common import convert_to_mixed_fraction, PluralCategory, PluralAmount from lingua_franca.lang.common_data_en import _NUM_STRING_EN, \ _FRACTION_STRING_EN, _LONG_SCALE_EN, _SHORT_SCALE_EN, _SHORT_ORDINAL_EN, _LONG_ORDINAL_EN @@ -411,3 +411,21 @@ def get_plural_category_en(amount, type=PluralCategory.CARDINAL): else: return ValueError("Argument \"type\" must be cardinal|ordinal|range") + + +def get_plural_form_en(word, amount, type=PluralCategory.CARDINAL): + """ + Get plural form of the specified word for the specified amount. + + Args: + word(str): Word to be pluralized. + amount(int or float or pair or list): The amount that is used to + determine the category. If type is range, it must contain + the start and end numbers. + type(str): Either cardinal (default), ordinal or range. + Returns: + (str): Pluralized word. + """ + if amount == 1: + return inflection.singularize(word) + return inflection.pluralize(word) diff --git a/lingua_franca/lang/format_pt.py b/lingua_franca/lang/format_pt.py index 7c8107ed..94fa6dfc 100644 --- a/lingua_franca/lang/format_pt.py +++ b/lingua_franca/lang/format_pt.py @@ -14,9 +14,9 @@ # limitations under the License. # -from lingua_franca.lang.format_common import convert_to_mixed_fraction +from lingua_franca.lang.format_common import convert_to_mixed_fraction, PluralCategory, PluralAmount from lingua_franca.lang.common_data_pt import _FRACTION_STRING_PT, \ - _NUM_STRING_PT + _NUM_STRING_PT, _VOWELS_PT, _PLURAL_EXCEPTIONS_PT, _SINGULAR_EXCEPTIONS_PT, _INVARIANTS_PT def nice_number_pt(number, speech, denominators=range(1, 21)): @@ -221,3 +221,83 @@ def nice_time_pt(dt, speech=True, use_24hour=False, use_ampm=False): elif hour != 0 and hour != 12: speak += " da noite" return speak + + +def _singularize_pt(word): + if word in _INVARIANTS_PT: + return word + if word in _SINGULAR_EXCEPTIONS_PT: + return _SINGULAR_EXCEPTIONS_PT[word] + # TODO implement is_plural helper + # can not ensure word is in plural, assuming it is, + # if in singular form it might in some cases be wrongly mutated + # in general words that end with "s" in singular form should be added to exceptions dict + if word.endswith("is"): + return word.rstrip("is") + "il" + if word.endswith("ões"): + return word.replace("ões", "ão") + if word.endswith("ães"): + return word.replace("ães", "ão") + if word.endswith("es"): + return word.rstrip("es") + if word.endswith("s"): + return word.rstrip("s") + return word + + +def _pluralize_pt(word): + if word in _INVARIANTS_PT: + return word + if word in _PLURAL_EXCEPTIONS_PT: + return _PLURAL_EXCEPTIONS_PT[word] + if word.endswith("x"): + return word + if word.endswith("s"): + # TODO - this will catch too many words, need a better check + #if word[-2] in _VOWELS_PT or word[-3] in _VOWELS_PT: + # if word is an oxytone, add "es", else word remains unchanged + # https://en.wikipedia.org/wiki/Oxytone + # return word + "es" + return word + if word.endswith("ão"): + # crap, can either end with "ãos", "aẽs" or "ões", most times they are all valid + # the other times lets hope the word is in exceptions dict + # TODO check if numeric, then it's always "ões" + return word + "s" + if word[-1] in _VOWELS_PT: + # if word ends with a vowel add an "s" + return word + 's' + for ending in ["r", "z", "n"]: + if word.endswith(ending): + return word + "es" + for ending in ["al", "el", "ol", "ul"]: + if word.endswith(ending): + return word.rstrip("l") + "is" + if word.endswith("il"): + return word.rstrip("l") + "s" + if word.endswith("m"): + return word.rstrip("m") + "ns" + # foreign words that have been "unportuguesified" have an "s" added + # simple check is looking for endings that don't exist in portuguese + for ending in ["w", "y", "k", "t"]: + if word.endswith(ending): + return word + "s" + return word + + +def get_plural_form_pt(word, amount, type=PluralCategory.CARDINAL): + """ + Get plural form of the specified word for the specified amount. + + Args: + word(str): Word to be pluralized. + amount(int or float or pair or list): The amount that is used to + determine the category. If type is range, it must contain + the start and end numbers. + type(str): Either cardinal (default), ordinal or range. + Returns: + (str): Pluralized word. + """ + if amount == 1: + return _singularize_pt(word) + return _pluralize_pt(word) diff --git a/lingua_franca/res/text/pt-pt/pluralizations.json b/lingua_franca/res/text/pt-pt/pluralizations.json new file mode 100644 index 00000000..1bc016ff --- /dev/null +++ b/lingua_franca/res/text/pt-pt/pluralizations.json @@ -0,0 +1,24 @@ +{ + "day": { + "one": "dia", + "other": "dias" + }, + "hour": { + "one": "hora", + "other": "horas" + }, + "minute": { + "one": "minuto", + "other": "minutos" + }, + "second": { + "one": "segundo", + "other": "segundos" + }, + "and": { + "one": "e" + }, + "or": { + "one": "ou" + } +} diff --git a/requirements/requirements.txt b/requirements/requirements.txt index 5d943a5d..54c34efd 100644 --- a/requirements/requirements.txt +++ b/requirements/requirements.txt @@ -1,2 +1,3 @@ python-dateutil~=2.6 -rapidfuzz \ No newline at end of file +rapidfuzz +inflection \ No newline at end of file diff --git a/test/unittests/test_format_en.py b/test/unittests/test_format_en.py index 41306947..a5871b4f 100644 --- a/test/unittests/test_format_en.py +++ b/test/unittests/test_format_en.py @@ -13,30 +13,21 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import unittest import datetime import sys +import unittest + # TODO either write a getter for lingua_franca.internal._SUPPORTED_LANGUAGES, # or make it public somehow -from lingua_franca import load_language, unload_language, set_default_lang, \ - get_primary_lang_code, get_active_langs, get_supported_langs -from lingua_franca.internal import UnsupportedLanguageError -from lingua_franca.format import nice_number -from lingua_franca.format import nice_time -from lingua_franca.format import nice_date -from lingua_franca.format import nice_date_time +from lingua_franca import load_language, unload_language, set_default_lang +from lingua_franca.format import get_plural_category +from lingua_franca.format import join_list, get_plural_form from lingua_franca.format import nice_duration -from lingua_franca.format import nice_number, get_plural_category +from lingua_franca.format import nice_number from lingua_franca.format import nice_time -from lingua_franca.format import nice_year -from lingua_franca.format import nice_duration -from lingua_franca.format import pronounce_number -from lingua_franca.format import date_time_format -from lingua_franca.format import join_list from lingua_franca.format import pronounce_lang -from lingua_franca.time import default_timezone, set_default_tz, now_local, \ - to_local - +from lingua_franca.format import pronounce_number +from lingua_franca.time import default_timezone def setUpModule(): @@ -80,7 +71,6 @@ def tearDownModule(): class TestNiceNumberFormat(unittest.TestCase): - tmp_var = None def set_tmp_var(self, val): @@ -372,7 +362,7 @@ def test_ordinals(self): class TestNiceDateFormat(unittest.TestCase): def test_convert_times(self): - dt = datetime.datetime(2017, 1, 31, + dt = datetime.datetime(2017, 1, 31, 13, 22, 3, tzinfo=default_timezone()) # Verify defaults haven't changed @@ -495,7 +485,6 @@ def test_convert_times(self): self.assertEqual(nice_time(dt), "quarter to two") - def test_nice_duration(self): self.assertEqual(nice_duration(1), "one second") self.assertEqual(nice_duration(3), "three seconds") @@ -576,5 +565,34 @@ def test_range_numbers(self): self.assertEqual(get_plural_category((0, 2), type="range"), "other") +class TestInflection(unittest.TestCase): + def test_singularize(self): + self.assertEqual(get_plural_form("posts", 1), "post") + self.assertEqual(get_plural_form("octopi", 1), "octopus") + self.assertEqual(get_plural_form("sheep", 1), "sheep") + # test already singular + self.assertEqual(get_plural_form("word", 1), "word") + # test garbage + self.assertEqual(get_plural_form("CamelOctopi", 1), "CamelOctopus") + + def test_pluralize(self): + self.assertEqual(get_plural_form("post", 2), "posts") + self.assertEqual(get_plural_form("octopus", 3), "octopi") + self.assertEqual(get_plural_form("sheep", 4), "sheep") + # test already plural + self.assertEqual(get_plural_form("words", 5), "words") + # irregular verbs + self.assertEqual(get_plural_form("person", 6), "people") + self.assertEqual(get_plural_form("man", 2), "men") + self.assertEqual(get_plural_form("human", 3), "humans") + self.assertEqual(get_plural_form('child', 4), 'children') + self.assertEqual(get_plural_form('sex', 2), 'sexes') + self.assertEqual(get_plural_form('move', 3), 'moves') + self.assertEqual(get_plural_form('cow', 4), 'kine') + self.assertEqual(get_plural_form('zombie', 5), 'zombies') + # test garbage + self.assertEqual(get_plural_form("CamelOctopus", 6), "CamelOctopi") + + if __name__ == "__main__": unittest.main() diff --git a/test/unittests/test_format_pt.py b/test/unittests/test_format_pt.py index 61c94406..c45bbbc6 100644 --- a/test/unittests/test_format_pt.py +++ b/test/unittests/test_format_pt.py @@ -20,6 +20,7 @@ from lingua_franca import load_language, unload_language, set_default_lang from lingua_franca.format import nice_time from lingua_franca.format import pronounce_number +from lingua_franca.format import get_plural_form from lingua_franca.time import default_timezone @@ -309,5 +310,35 @@ def test_minutes_past_hour(self): "onze e um quarto da noite") +class TestInflection(unittest.TestCase): + def test_singularize(self): + self.assertEqual(get_plural_form("homems", 1), "homem") + self.assertEqual(get_plural_form("cavalos", 1), "cavalo") + self.assertEqual(get_plural_form("ovelhas", 1), "ovelha") + # test already singular + self.assertEqual(get_plural_form("palavra", 1), "palavra") + # test garbage + self.assertEqual(get_plural_form("gerubicios", 1), "gerubicio") + + def test_pluralize(self): + self.assertEqual(get_plural_form("poste", 2), "postes") + self.assertEqual(get_plural_form("polvo", 3), "polvos") + self.assertEqual(get_plural_form("ovelha", 4), "ovelhas") + # test already plural + self.assertEqual(get_plural_form("palavras", 5), "palavras") + self.assertEqual(get_plural_form("ovelhas", 3), "ovelhas") + # irregular/invariant verbs + self.assertEqual(get_plural_form("anão", 6), "anões") + self.assertEqual(get_plural_form("alemão", 2), "alemães") + self.assertEqual(get_plural_form("apêndix", 3), "apêndices") + self.assertEqual(get_plural_form('três', 4), 'três') + self.assertEqual(get_plural_form('seis', 2), 'seis') + self.assertEqual(get_plural_form('ontem', 3), 'ontem') + self.assertEqual(get_plural_form('depressa', 4), 'depressa') + self.assertEqual(get_plural_form('contra', 5), 'contra') + # test garbage + self.assertEqual(get_plural_form("gerubicio", 6), "gerubicios") + + if __name__ == "__main__": unittest.main() diff --git a/test/unittests/test_format_sl.py b/test/unittests/test_format_sl.py index e7ecc042..416ea3e8 100644 --- a/test/unittests/test_format_sl.py +++ b/test/unittests/test_format_sl.py @@ -13,28 +13,34 @@ # See the License for the specific language governing permissions and # limitations under the License. # -import json -import datetime import ast +import datetime +import json import sys import unittest -from lingua_franca import get_default_lang, set_default_lang -from lingua_franca.format import nice_number -from lingua_franca.format import nice_time +from lingua_franca import get_default_lang, set_default_lang, load_language, unload_language +from lingua_franca.format import date_time_format +from lingua_franca.format import join_list from lingua_franca.format import nice_date from lingua_franca.format import nice_date_time -from lingua_franca.format import nice_year from lingua_franca.format import nice_duration from lingua_franca.format import nice_number, get_plural_category from lingua_franca.format import nice_time from lingua_franca.format import nice_year from lingua_franca.format import pronounce_number -from lingua_franca.format import date_time_format -from lingua_franca.format import join_list from lingua_franca.time import default_timezone +def setUpModule(): + load_language("sl-si") + set_default_lang("sl") + + +def tearDownModule(): + unload_language("sl") + + NUMBERS_FIXTURE_SL = { 1.435634: '1.436', 2: '2',