From e8ccad1151b9cb8f6f88a1e1a3cc177c5da67b46 Mon Sep 17 00:00:00 2001
From: jarbasal
Date: Fri, 4 Oct 2019 01:20:59 +0100
Subject: [PATCH] improve get_gender PT

---
Reviewer notes (text between the "---" marker and the diffstat is not part
of the commit message):

- get_gender_pt() now lower-cases its inputs and, for each occurrence of
  `word` in `text` that is not the first token, checks the preceding token
  against _MALE_DETERMINANTS_PT / _FEMALE_DETERMINANTS_PT. If no determinant
  matches it tries the _GENDERS_PT lookup (also with trailing "s" stripped),
  then the _FEMALE_ENDINGS_PT / _MALE_ENDINGS_PT suffix lists, and returns
  None when nothing matches.
- _FEMALE_DETERMINANTS_PT listed "estas" twice and omitted the singular
  "esta"; the duplicate is replaced by "esta" in this revision. The blob
  hash on the "index" line of common_data_pt.py predates that correction.
- Line breaks and indentation were restored from a whitespace-collapsed
  copy of this patch. Hunk line counts match the headers, but context-line
  indentation and blank-line placement are a best-effort reconstruction;
  apply with whitespace tolerance if a hunk is rejected.

 lingua_franca/lang/common_data_pt.py | 15 ++++++++++
 lingua_franca/lang/parse_pt.py       | 44 +++++++++++++++++++---------
 test/test_parse_pt.py                | 18 ++++++++++--
 3 files changed, 60 insertions(+), 17 deletions(-)

diff --git a/lingua_franca/lang/common_data_pt.py b/lingua_franca/lang/common_data_pt.py
index 6a0befd3..ca99ba3d 100644
--- a/lingua_franca/lang/common_data_pt.py
+++ b/lingua_franca/lang/common_data_pt.py
@@ -3,6 +3,21 @@
 
 _ARTICLES_PT = ["o", "a", "os", "as"]
 
+# word rules for gender
+_FEMALE_ENDINGS_PT = ["a", "as"]
+_MALE_ENDINGS_PT = ["o", "os"]
+
+# special cases, word lookup for words not covered by above rule
+_GENDERS_PT = {
+    "mulher": "f",
+    "mulheres": "f",
+    "homem": "m"
+}
+
+# context rules for gender
+_MALE_DETERMINANTS_PT = ["o", "os", "este", "estes", "esse", "esses"]
+_FEMALE_DETERMINANTS_PT = ["a", "as", "esta", "estas", "essa", "essas"]
+
 _NUMBERS_PT = {
     "zero": 0,
     "um": 1,
diff --git a/lingua_franca/lang/parse_pt.py b/lingua_franca/lang/parse_pt.py
index a677e73d..04771377 100644
--- a/lingua_franca/lang/parse_pt.py
+++ b/lingua_franca/lang/parse_pt.py
@@ -25,7 +25,8 @@
 from dateutil.relativedelta import relativedelta
 from lingua_franca.lang.parse_common import is_numeric, look_for_fractions
 from lingua_franca.lang.common_data_pt import _FRACTION_STRING_PT, \
-    _ARTICLES_PT, _NUMBERS_PT
+    _ARTICLES_PT, _NUMBERS_PT, _FEMALE_DETERMINANTS_PT, _FEMALE_ENDINGS_PT,\
+    _MALE_DETERMINANTS_PT, _MALE_ENDINGS_PT, _GENDERS_PT
 
 
 def isFractional_pt(input_str):
@@ -1122,18 +1123,33 @@ def pt_pruning(text, symbols=True, accents=True, agressive=True):
     return text
 
 
-def get_gender_pt(word, raw_string=""):
-    word = word.rstrip("s")
-    gender = None
-    words = raw_string.split(" ")
+def get_gender_pt(word, text=""):
+    # parse gender taking context into account
+    word = word.lower()
+    words = text.lower().split(" ")
     for idx, w in enumerate(words):
         if w == word and idx != 0:
-            previous = words[idx - 1]
-            gender = get_gender_pt(previous)
-            break
-    if not gender:
-        if word[-1] == "a":
-            gender = "f"
-        if word[-1] == "o" or word[-1] == "e":
-            gender = "m"
-    return gender
+            # in portuguese usually the previous word (a determinant)
+            # assigns gender to the next word
+            previous = words[idx - 1].lower()
+            if previous in _MALE_DETERMINANTS_PT:
+                return "m"
+            elif previous in _FEMALE_DETERMINANTS_PT:
+                return "f"
+
+    # get gender using only the individual word
+    # see if this word has the gender defined
+    if word in _GENDERS_PT:
+        return _GENDERS_PT[word]
+    singular = word.rstrip("s")
+    if singular in _GENDERS_PT:
+        return _GENDERS_PT[singular]
+    # in portuguese the last vowel usually defines the gender of a word
+    # the gender of the determinant takes precedence over this rule
+    for end_str in _FEMALE_ENDINGS_PT:
+        if word.endswith(end_str):
+            return "f"
+    for end_str in _MALE_ENDINGS_PT:
+        if word.endswith(end_str):
+            return "m"
+    return None
diff --git a/test/test_parse_pt.py b/test/test_parse_pt.py
index f07b91dc..114c095d 100644
--- a/test/test_parse_pt.py
+++ b/test/test_parse_pt.py
@@ -27,6 +27,7 @@ class TestNormalize(unittest.TestCase):
     """
         Test cases for Portuguese parsing
     """
+
     def test_articles_pt(self):
         self.assertEqual(normalize(u"isto é o teste", lang="pt",
                                    remove_articles=True),
@@ -244,15 +245,26 @@ def test_extractdatetime_default_pt(self):
             anchor, lang='pt-pt', default_time=default)
         self.assertEqual(default, res[0].time())
 
+
+class TestExtractGender(unittest.TestCase):
     def test_gender_pt(self):
+        # words with well defined grammatical gender rules
         self.assertEqual(get_gender("vaca", lang="pt"), "f")
         self.assertEqual(get_gender("cavalo", lang="pt"), "m")
         self.assertEqual(get_gender("vacas", lang="pt"), "f")
-        self.assertEqual(get_gender("boi", "o boi come erva", lang="pt"), "m")
+
+        # words specifically defined in a lookup dictionary
+        self.assertEqual(get_gender("homem", lang="pt"), "m")
+        self.assertEqual(get_gender("mulher", lang="pt"), "f")
+        self.assertEqual(get_gender("homems", lang="pt"), "m")
+        self.assertEqual(get_gender("mulheres", lang="pt"), "f")
+
+        # words where gender rules do not work but context does
         self.assertEqual(get_gender("boi", lang="pt"), None)
-        self.assertEqual(get_gender("homem", "estes homem come merda",
+        self.assertEqual(get_gender("boi", "o boi come erva", lang="pt"), "m")
+        self.assertEqual(get_gender("homem", "este homem come bois",
                                     lang="pt"), "m")
-        self.assertEqual(get_gender("ponte", lang="pt"), "m")
+        self.assertEqual(get_gender("ponte", lang="pt"), None)
         self.assertEqual(get_gender("ponte", "essa ponte caiu",
                                     lang="pt"), "f")
 