From da873acbdd24e07926e617750a317f4739fbe932 Mon Sep 17 00:00:00 2001 From: Rafal Janicki Date: Mon, 2 Nov 2020 22:16:21 +0100 Subject: [PATCH] Polish language support --- lingua_franca/format.py | 7 +- lingua_franca/internal.py | 5 +- lingua_franca/lang/common_data_pl.py | 377 ++++++ lingua_franca/lang/format_pl.py | 351 ++++++ lingua_franca/lang/parse_pl.py | 1403 +++++++++++++++++++++ lingua_franca/parse.py | 6 +- lingua_franca/res/text/pl-pl/and.word | 1 + lingua_franca/res/text/pl-pl/day.word | 1 + lingua_franca/res/text/pl-pl/days.word | 1 + lingua_franca/res/text/pl-pl/hour.word | 1 + lingua_franca/res/text/pl-pl/hours.word | 1 + lingua_franca/res/text/pl-pl/minute.word | 1 + lingua_franca/res/text/pl-pl/minutes.word | 1 + lingua_franca/res/text/pl-pl/or.word | 1 + lingua_franca/res/text/pl-pl/second.word | 1 + lingua_franca/res/text/pl-pl/seconds.word | 1 + test/test_format_pl.py | 348 +++++ test/test_parse_pl.py | 537 ++++++++ 18 files changed, 3037 insertions(+), 7 deletions(-) create mode 100644 lingua_franca/lang/common_data_pl.py create mode 100644 lingua_franca/lang/format_pl.py create mode 100644 lingua_franca/lang/parse_pl.py create mode 100644 lingua_franca/res/text/pl-pl/and.word create mode 100644 lingua_franca/res/text/pl-pl/day.word create mode 100644 lingua_franca/res/text/pl-pl/days.word create mode 100644 lingua_franca/res/text/pl-pl/hour.word create mode 100644 lingua_franca/res/text/pl-pl/hours.word create mode 100644 lingua_franca/res/text/pl-pl/minute.word create mode 100644 lingua_franca/res/text/pl-pl/minutes.word create mode 100644 lingua_franca/res/text/pl-pl/or.word create mode 100644 lingua_franca/res/text/pl-pl/second.word create mode 100644 lingua_franca/res/text/pl-pl/seconds.word create mode 100755 test/test_format_pl.py create mode 100644 test/test_parse_pl.py diff --git a/lingua_franca/format.py b/lingua_franca/format.py index bcf4caef..fcbd34af 100755 --- a/lingua_franca/format.py +++ b/lingua_franca/format.py @@ -28,13 
+28,15 @@ populate_localized_function_dict, get_active_langs, \ get_full_lang_code, get_default_lang, get_default_loc, \ is_supported_full_lang, _raise_unsupported_language, \ - UnsupportedLanguageError, NoneLangWarning, InvalidLangWarning + UnsupportedLanguageError, NoneLangWarning, InvalidLangWarning, \ + FunctionNotLocalizedError _REGISTERED_FUNCTIONS = ("nice_number", "nice_time", "pronounce_number", - "nice_response") + "nice_response", + "nice_duration") populate_localized_function_dict("format", langs=get_active_langs()) @@ -367,6 +369,7 @@ def nice_year(dt, lang=None, bc=False): return date_time_format.year_format(dt, full_code, bc) +@localized_function(run_own_code_on=[FunctionNotLocalizedError]) def nice_duration(duration, lang=None, speech=True): """ Convert duration in seconds to a nice spoken timespan diff --git a/lingua_franca/internal.py b/lingua_franca/internal.py index e1e0e292..d21fa1f5 100644 --- a/lingua_franca/internal.py +++ b/lingua_franca/internal.py @@ -6,11 +6,11 @@ from warnings import warn _SUPPORTED_LANGUAGES = ("cs", "da", "de", "en", "es", "fr", "hu", - "it", "nl", "pt", "sv") + "it", "nl", "pl", "pt", "sv") _SUPPORTED_FULL_LOCALIZATIONS = ("cs-cz", "da-dk", "de-de", "en-au", "en-us", "es-es", "fr-fr", "hu-hu", "it-it", "nl-nl", - "pt-pt", "ru-ru", "sv-se", "tr-tr") + "pl-pl", "pt-pt", "ru-ru", "sv-se", "tr-tr") _DEFAULT_FULL_LANG_CODES = {'cs': 'cs-cz', 'da': 'da-dk', @@ -21,6 +21,7 @@ 'hu': 'hu-hu', 'it': 'it-it', 'nl': 'nl-nl', + 'pl': 'pl-pl', 'pt': 'pt-pt', 'ru': 'ru-ru', 'sv': 'sv-se', diff --git a/lingua_franca/lang/common_data_pl.py b/lingua_franca/lang/common_data_pl.py new file mode 100644 index 00000000..97cc37d8 --- /dev/null +++ b/lingua_franca/lang/common_data_pl.py @@ -0,0 +1,377 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
from collections import OrderedDict


# Cardinal numbers: value -> Polish word (units, teens, tens, hundreds).
_NUM_STRING_PL = {
    0: 'zero',
    1: 'jeden',
    2: 'dwa',
    3: 'trzy',
    4: 'cztery',
    5: 'pięć',
    6: 'sześć',
    7: 'siedem',
    8: 'osiem',
    9: 'dziewięć',
    10: 'dziesięć',
    11: 'jedenaście',
    12: 'dwanaście',
    13: 'trzynaście',
    14: 'czternaście',
    15: 'piętnaście',
    16: 'szesnaście',
    17: 'siedemnaście',
    18: 'osiemnaście',
    19: 'dziewiętnaście',
    20: 'dwadzieścia',
    30: 'trzydzieści',
    40: 'czterdzieści',
    50: 'pięćdziesiąt',
    60: 'sześćdziesiąt',
    70: 'siedemdziesiąt',
    80: 'osiemdziesiąt',
    90: 'dziewięćdziesiąt',
    100: 'sto',
    200: 'dwieście',
    300: 'trzysta',
    400: 'czterysta',
    500: 'pięćset',
    600: 'sześćset',
    700: 'siedemset',
    800: 'osiemset',
    900: 'dziewięćset',
}


# Fraction denominators: value -> feminine singular form ("jedna trzecia").
_FRACTION_STRING_PL = {
    1: 'jedna',
    2: 'druga',
    3: 'trzecia',
    4: 'czwarta',
    5: 'piąta',
    6: 'szósta',
    7: 'siódma',
    8: 'ósma',
    9: 'dziewiąta',
    10: 'dziesiąta',
    11: 'jedenasta',
    12: 'dwunasta',
    13: 'trzynasta',
    14: 'czternasta',
    15: 'piętnasta',
    16: 'szesnasta',
    17: 'siedemnasta',
    18: 'osiemnasta',
    19: 'dziewiętnasta',
    20: 'dwudziesta',
    30: 'trzydziesta',
    40: 'czterdziesta',
    50: 'pięćdziesiąta',
    60: 'sześćdziesiąta',
    70: 'siedemdziesiąta',
    80: 'osiemdziesiąta',
    90: 'dziewięćdziesiąta',
    100: 'setna',
    200: 'dwusetna',
    300: 'trzysetna',
    400: 'czterysetna',
    500: 'pięćsetna',
    600: 'sześćsetna',
    700: 'siedemsetna',
    800: 'osiemsetna',
    900: 'dziewięćsetna',
    1000: 'tysięczna',
}

# Hundreds and powers of one thousand: value -> Polish scale word.
#
# NOTE: Python float literals above roughly 1.8e308 evaluate to `inf`, so
# every entry from 1e309 onwards shares the single key `inf` and only the
# last of those values survives in the mapping. The entries are left in
# place so the resulting key set is unchanged - TODO(review): decide whether
# they should become integer keys or be removed.
_SHORT_SCALE_PL = OrderedDict([
    (100, 'sto'),
    (200, 'dwieście'),
    (300, 'trzysta'),
    (400, 'czterysta'),
    (500, 'pięćset'),
    (600, 'sześćset'),
    (700, 'siedemset'),
    (800, 'osiemset'),
    (900, 'dziewięćset'),
    (1000, 'tysiąc'),
    (1000000, 'milion'),
    (1e9, "miliard"),
    (1e12, 'bilion'),
    (1e15, "biliard"),
    (1e18, "trylion"),
    # was "sekstilion" (a leftover of English "sextillion"); the ordinal
    # table below already uses "tryliardowy" for the same value
    (1e21, "tryliard"),
    (1e24, "kwadrylion"),
    (1e27, "kwadryliard"),
    (1e30, "kwintylion"),
    (1e33, "kwintyliard"),
    (1e36, "sekstylion"),
    (1e39, "sekstyliard"),
    (1e42, "septylion"),
    (1e45, "septyliard"),
    (1e48, "oktylion"),
    (1e51, "oktyliard"),
    (1e54, "nonilion"),
    (1e57, "noniliard"),
    (1e60, "decylion"),
    (1e63, "decyliard"),
    (1e66, "undecylion"),
    (1e69, "undecyliard"),
    (1e72, "duodecylion"),
    (1e75, "duodecyliard"),
    (1e78, "tredecylion"),
    (1e81, "tredecyliard"),
    (1e84, "kwartyduodecylion"),
    (1e87, "kwartyduodecyliard"),
    (1e90, "kwintyduodecylion"),
    (1e93, "kwintyduodecyliard"),
    (1e96, "seksdecylion"),
    (1e99, "seksdecyliard"),
    (1e102, "septydecylion"),
    (1e105, "septydecyliard"),
    (1e108, "oktodecylion"),
    (1e111, "oktodecyliard"),
    (1e114, "nondecylion"),
    (1e117, "nondecyliard"),
    (1e120, "wigintylion"),
    (1e123, "wigintyliard"),
    (1e153, "quinquagintylion"),
    (1e183, "trycyliard"),
    (1e213, "septuagintylion"),
    (1e243, "kwadragiliard"),
    (1e273, "nonagintylion"),
    (1e303, "centezylion"),
    (1e306, "uncentylion"),
    (1e309, "duocentylion"),
    (1e312, "trescentylion"),
    (1e333, "decicentylion"),
    (1e336, "undecicentylion"),
    (1e363, "viginticentylion"),
    (1e366, "unviginticentylion"),
    (1e393, "trigintacentylion"),
    (1e423, "quadragintacentylion"),
    (1e453, "quinquagintacentylion"),
    (1e483, "sexagintacentylion"),
    (1e513, "septuagintacentylion"),
    (1e543, "octogintacentylion"),
    (1e573, "nonagintacentylion"),
    (1e603, "centyliard"),
    (1e903, "trecentylion"),
    (1e1203, "quadringentylion"),
    (1e1503, "quingentylion"),
    (1e1803, "sescentylion"),
    (1e2103, "septingentylion"),
    (1e2403, "octingentylion"),
    (1e2703, "nongentylion"),
    (1e3003, "milinylion")
])


# Ordinals up to one thousand: value -> masculine nominative form.
_ORDINAL_BASE_PL = {
    1: 'pierwszy',
    2: 'drugi',
    3: 'trzeci',
    4: 'czwarty',
    5: 'piąty',
    6: 'szósty',
    7: 'siódmy',
    8: 'ósmy',
    9: 'dziewiąty',
    10: 'dziesiąty',
    11: 'jedenasty',
    12: 'dwunasty',
    13: 'trzynasty',
    14: 'czternasty',
    15: 'piętnasty',
    16: 'szesnasty',
    17: 'siedemnasty',
    18: 'osiemnasty',
    19: 'dziewiętnasty',
    20: 'dwudziesty',
    30: 'trzydziesty',
    40: "czterdziesty",
    50: "pięćdziesiąty",
    60: "sześćdziesiąty",
    70: "siedemdziesiąty",
    80: "osiemdziesiąty",
    90: "dziewięćdziesiąty",
    1e2: "setny",
    1e3: "tysięczny"
}


# Ordinals for the large scales; the base ordinals are merged in below.
_SHORT_ORDINAL_PL = {
    1e6: "milionowy",
    1e9: "miliardowy",
    1e12: "bilionowy",
    1e15: "biliardowy",
    1e18: "trylionowy",
    1e21: "tryliardowy",
    1e24: "kwadrylionowy",
    1e27: "kwadryliardowy",
    1e30: "kwintylionowy",
    1e33: "kwintyliardowy",
    1e36: "sekstylionowy",
    1e42: "septylionowy",
    1e48: "oktylionowy",
    1e54: "nonylionowy",
    1e60: "decylionowy"
    # TODO: fill in the scales missing above 1e33 (1e39, 1e45, 1e51, 1e57)
    # and everything above 1e60
}
_SHORT_ORDINAL_PL.update(_ORDINAL_BASE_PL)

# Alternative ordinal forms: inflected "-ej" forms for 1-9 and combining
# "-o" prefixes for 10-90.
_ALT_ORDINALS_PL = {
    1: 'pierwszej',
    2: 'drugiej',
    3: 'trzeciej',
    4: 'czwartej',
    5: 'piątej',
    6: 'szóstej',
    7: 'siódmej',
    8: 'ósmej',
    9: 'dziewiątej',
    10: 'dziesięcio',
    11: 'jedenasto',
    12: 'dwunasto',
    13: 'trzynasto',
    14: 'czternasto',
    15: 'piętnasto',
    16: 'szesnasto',
    17: 'siedemnasto',
    18: 'osiemnasto',
    19: 'dziewiętnasto',
    20: 'dwudziesto',
    30: 'trzydziesto',
    40: 'czterdziesto',
    50: 'pięćdziesięcio',
    60: 'sześćdziesięcio',
    70: 'siedemdziesięcio',
    80: 'osiemdziesięcio',
    90: 'dziewięćdziesięcio',
}

# Polish time-unit word -> English unit name (plural, timedelta-style).
_TIME_UNITS_CONVERSION = {
    'mikrosekund': 'microseconds',
    'mikrosekundy': 'microseconds',
    'milisekund': 'milliseconds',
    'milisekundy': 'milliseconds',
    'sekunda': 'seconds',
    'sekundy': 'seconds',
    'sekund': 'seconds',
    'minuta': 'minutes',
    'minuty': 'minutes',
    'minut': 'minutes',
    'godzina': 'hours',
    'godziny': 'hours',
    'godzin': 'hours',
    'dzień': 'days',
    'dni': 'days',
    'tydzień': 'weeks',
    'tygodni': 'weeks',
    'tygodnie': 'weeks',
    'tygodniu': 'weeks',
}

# Inflected Polish time-unit word -> its base (nominative singular) form.
_TIME_UNITS_NORMALIZATION = {
    'mikrosekunda': 'mikrosekunda',
    'mikrosekundę': 'mikrosekunda',
    'mikrosekund': 'mikrosekunda',
    'mikrosekundy': 'mikrosekunda',
    'milisekunda': 'milisekunda',
    'milisekundę': 'milisekunda',
    'milisekund': 'milisekunda',
    'milisekundy': 'milisekunda',
    'sekunda': 'sekunda',
    'sekundę': 'sekunda',
    'sekundy': 'sekunda',
    'sekund': 'sekunda',
    'minuta': 'minuta',
    'minutę': 'minuta',
    'minut': 'minuta',
    'minuty': 'minuta',
    'godzina': 'godzina',
    'godzinę': 'godzina',
    'godzin': 'godzina',
    'godziny': 'godzina',
    'dzień': 'dzień',
    'dni': 'dzień',
    'tydzień': 'tydzień',
    'tygodni': 'tydzień',
    'tygodnie': 'tydzień',
    'tygodniu': 'tydzień',
    'miesiąc': 'miesiąc',
    'miesiące': 'miesiąc',
    'miesięcy': 'miesiąc',
    'rok': 'rok',
    'lata': 'rok',
    'lat': 'rok',
    'dekada': 'dekada',
    'dekad': 'dekada',
    'dekady': 'dekada',
    'dekadę': 'dekada',
    'wiek': 'wiek',
    'wieki': 'wiek',
    'milenia': 'milenia',
    'milenium': 'milenia',
}

# Polish month name (nominative and genitive) -> English month name.
_MONTHS_TO_EN = {
    'styczeń': 'January',
    'stycznia': 'January',
    'luty': 'February',
    'lutego': 'February',
    'marzec': 'March',
    'marca': 'March',
    'kwiecień': 'April',
    'kwietnia': 'April',
    'maj': 'May',
    'maja': 'May',
    'czerwiec': 'June',
    'czerwca': 'June',
    'lipiec': 'July',
    'lipca': 'July',
    'sierpień': 'August',
    'sierpnia': 'August',
    'wrzesień': 'September',
    'września': 'September',
    'październik': 'October',
    'października': 'October',
    'listopad': 'November',
    'listopada': 'November',
    'grudzień': 'December',
    'grudnia': 'December',
}

# Polish weekday name (several cases) -> weekday index, Monday == 0.
_DAYS_TO_EN = {
    'poniedziałek': 0,
    'poniedziałku': 0,
    'wtorek': 1,
    'wtorku': 1,
    'środa': 2,
    'środę': 2,
    'środy': 2,
    'środzie': 2,
    'czwartek': 3,
    'czwartku': 3,
    'piątek': 4,
    'piątku': 4,
    'sobota': 5,
    'sobotę': 5,
    'soboty': 5,
    'sobocie': 5,
    'niedziela': 6,
    'niedzieli': 6
}
def pronounce_number_pl(num, places=2, short_scale=True, scientific=False,
                        ordinals=False, scientific_run=False):
    """
    Convert a number to its spoken Polish equivalent.

    For example, 5.2 is returned as 'pięć przecinek dwa'.

    Args:
        num (float or int): the number to pronounce
        places (int): maximum decimal places to speak
        short_scale (bool): kept for signature parity; it is only forwarded
            to the recursive calls, scale names always come from
            _SHORT_SCALE_PL
        scientific (bool): pronounce in scientific notation
        ordinals (bool): pronounce in ordinal form ("pierwszy" instead of
            "jeden")
        scientific_run (bool): set by the recursive calls that build a
            scientific-notation phrase; switches ordinal words to the
            inflected forms used inside "do ... potęgi"
    Returns:
        (str): The pronounced number
    """
    # deal with infinity
    if num == float("inf"):
        return "nieskończoność"
    elif num == float("-inf"):
        return "minus nieskończoność"
    if scientific:
        number = '%E' % num
        n, power = number.replace("+", "").split("E")
        power = int(power)
        if power != 0:
            if ordinals:
                # This handles negatives of powers separately from the normal
                # handling since each call disables the scientific flag
                return '{}{} razy dziesięć do {}{} potęgi'.format(
                    'minus ' if float(n) < 0 else '',
                    pronounce_number_pl(
                        abs(float(n)), places, short_scale, False,
                        ordinals=False, scientific_run=True),
                    'minus ' if power < 0 else '',
                    pronounce_number_pl(
                        abs(power), places, short_scale, False,
                        ordinals=True, scientific_run=True))
            else:
                # This handles negatives of powers separately from the normal
                # handling since each call disables the scientific flag
                return '{}{} razy dziesięć do potęgi {}{}'.format(
                    'minus ' if float(n) < 0 else '',
                    pronounce_number_pl(
                        abs(float(n)), places, short_scale, False),
                    'minus ' if power < 0 else '',
                    pronounce_number_pl(abs(power), places, short_scale, False))

    number_names = _NUM_STRING_PL.copy()
    number_names.update(_SHORT_SCALE_PL)

    digits = [number_names[n] for n in range(0, 20)]
    if ordinals:
        tens = [_SHORT_ORDINAL_PL[n] for n in range(10, 100, 10)]
    else:
        tens = [number_names[n] for n in range(10, 100, 10)]
    # only the length of this list is used (upper bound on the group index)
    hundreds = [_SHORT_SCALE_PL[n] for n in _SHORT_SCALE_PL.keys()]

    # deal with negatives
    result = ""
    if num < 0:
        result = "minus "
    num = abs(num)

    # check for a direct match
    if num in number_names and not ordinals:
        result += number_names[num]
    else:
        def _sub_thousand(n, ordinals=False, iteration=0):
            # Pronounce one three-digit group (0..999).
            assert 0 <= n <= 999

            if iteration > 0 and n in _ALT_ORDINALS_PL and ordinals:
                return _ALT_ORDINALS_PL[n]
            elif n in _SHORT_ORDINAL_PL and ordinals:
                return _SHORT_ORDINAL_PL[n] if not scientific_run \
                    else _ALT_ORDINALS_PL[n]
            if n <= 19:
                return digits[n] if not scientific_run or not ordinals\
                    else digits[n][:-1] + "ej"
            elif n <= 99:
                q, r = divmod(n, 10)
                tens_text = tens[q - 1]
                if scientific_run:
                    tens_text = tens_text[:-1] + "ej"
                return tens_text + (" " + _sub_thousand(r, ordinals) if r
                                    else "")
            else:
                q, r = divmod(n, 100)
                digit_name = digits[q]
                if q*100 in _NUM_STRING_PL:
                    digit_name = _NUM_STRING_PL[q*100]

                return digit_name + (
                    " " + _sub_thousand(r, ordinals) if r else "")

        def _short_scale(n):
            # Pronounce the integer part of n, one group of thousands at a
            # time, least significant group first.
            if n >= max(_SHORT_SCALE_PL.keys()):
                return "nieskończoność"
            ordi = ordinals

            if int(n) != n:
                ordi = False
            n = int(n)
            assert 0 <= n
            res = []
            for i, z in enumerate(_split_by(n, 1000)):
                if not z:
                    continue
                number = _sub_thousand(z, ordi, iteration=i)

                if i:
                    if i >= len(hundreds):
                        return ""
                    number += " "
                    if ordi:
                        if i * 1000 in _SHORT_ORDINAL_PL:
                            if z == 1:
                                number = _SHORT_ORDINAL_PL[i * 1000]
                            else:
                                number += _SHORT_ORDINAL_PL[i * 1000]
                        else:
                            if n not in _SHORT_SCALE_PL:
                                num = int("1" + "0" * (len(str(n)) - 2))

                                number += _SHORT_SCALE_PL[num] + "owa"
                            else:
                                number = _SHORT_SCALE_PL[n] + "ty"
                    else:
                        hundreds_text = _SHORT_SCALE_PL[float(pow(1000, i))]
                        if z != 1:
                            # Polish plural agreement: counts ending in 2-4
                            # (but not 12-14) take the nominative plural
                            # ("tysiące", "miliony"); every other count,
                            # including those ending in 1 such as 21 or 101,
                            # takes the genitive plural ("tysięcy",
                            # "milionów").
                            few = 2 <= z % 10 <= 4 and \
                                not 12 <= z % 100 <= 14
                            if i == 1:
                                hundreds_text = \
                                    "tysiące" if few else "tysięcy"
                            else:
                                hundreds_text += "y" if few else "ów"

                        number += hundreds_text
                res.append(number)
                # only the least significant non-zero group is an ordinal
                ordi = False

            return ", ".join(reversed(res))

        def _split_by(n, split=1000):
            # Split n into base-`split` groups, least significant first.
            assert 0 <= n
            res = []
            while n:
                n, r = divmod(n, split)
                res.append(r)
            return res

        result += _short_scale(num)

    # deal with scientific notation unpronounceable as number
    if not result and "e" in str(num):
        return pronounce_number_pl(num, places, short_scale, scientific=True)
    # Deal with fractional part
    elif not num == int(num) and places > 0:
        if abs(num) < 1.0 and (result == "minus " or not result):
            result += "zero"
        result += " przecinek"
        _num_str = str(num)
        _num_str = _num_str.split(".")[1][0:places]
        for char in _num_str:
            result += " " + number_names[int(char)]
    return result
def _plural_form_pl(count, one, few, many):
    """Pick the Polish noun form that agrees with the integer `count`.

    Args:
        count (int): the quantity being described
        one (str): form used for exactly 1 ("godzina")
        few (str): form used for counts ending in 2-4, except 12-14
            ("godziny")
        many (str): form used for every other count ("godzin")
    Returns:
        str: one of the three forms passed in
    """
    if count == 1:
        return one
    if 2 <= count % 10 <= 4 and not 12 <= count % 100 <= 14:
        return few
    return many


def nice_duration_pl(duration, speech=True):
    """ Convert duration to a nice spoken timespan

    Args:
        duration: time span in seconds
        speech (bool): must be True; the display form is not localized here
    Returns:
        str: timespan as a string, e.g. "jedna godzina dwadzieścia minut"
    Raises:
        FunctionNotLocalizedError: if speech is False
    """

    # TODO this is a kludge around the fact that only Polish has a
    # localized nice_duration()
    if not speech:
        raise FunctionNotLocalizedError

    days = int(duration // 86400)
    hours = int(duration // 3600 % 24)
    minutes = int(duration // 60 % 60)
    seconds = int(duration % 60)

    parts = []
    if days > 0:
        # "dzień" is masculine, so the plain cardinal ("jeden") is used
        parts.append(pronounce_number_pl(days) + " " +
                     _plural_form_pl(days, 'dzień', 'dni', 'dni'))
    if hours > 0:
        parts.append(get_pronounce_number_for_duration(hours) + " " +
                     _plural_form_pl(hours, 'godzina', 'godziny', 'godzin'))
    if minutes > 0:
        parts.append(get_pronounce_number_for_duration(minutes) + " " +
                     _plural_form_pl(minutes, 'minuta', 'minuty', 'minut'))
    if seconds > 0:
        parts.append(get_pronounce_number_for_duration(seconds) + " " +
                     _plural_form_pl(seconds, 'sekunda', 'sekundy', 'sekund'))

    return " ".join(parts)


def get_pronounce_number_for_duration(num):
    """Pronounce `num` for use before a feminine unit (hour/minute/second).

    Identical to pronounce_number_pl() except that a bare "jeden" becomes
    the feminine "jedna".
    """
    pronounced = pronounce_number_pl(num)

    return 'jedna' if pronounced == 'jeden' else pronounced
+# +from datetime import datetime, timedelta + +from dateutil.relativedelta import relativedelta + +from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \ + invert_dict, ReplaceableNumber, partition_list, tokenize, Token +from lingua_franca.lang.common_data_pl import _NUM_STRING_PL, \ + _SHORT_SCALE_PL, _SHORT_ORDINAL_PL, _FRACTION_STRING_PL, _TIME_UNITS_CONVERSION, \ + _TIME_UNITS_NORMALIZATION, _MONTHS_TO_EN, _DAYS_TO_EN, _ORDINAL_BASE_PL, \ + _ALT_ORDINALS_PL + +import re + + +def generate_plurals_pl(originals): + """ + Return a new set or dict containing the plural form of the original values, + + In English this means all with 's' appended to them. + + Args: + originals set(str) or dict(str, any): values to pluralize + + Returns: + set(str) or dict(str, any) + + """ + if isinstance(originals, dict): + result = {key + 'y': value for key, value in originals.items()} + result = {**result, **{key + 'ów': value for key, value in originals.items()}} + result = {**result, **{'tysiące': 1000, 'tysięcy': 1000}} + + return result + + result = {value + "y" for value in originals} + result = result.union({value + "ów" for value in originals}) + result = result.union({'tysiące', 'tysięcy'}) + + return result + + +def generate_fractions_pl(fractions): + '''Returns a list of all fraction combinations. 
def _convert_words_to_numbers_pl(text, short_scale=True, ordinals=False):
    """
    Replace the number words found in a string with their numeric values.

    Args:
        text str: the text to process (it is lower-cased first)
        short_scale boolean: True if short scale numbers should be used.
        ordinals boolean: True if ordinals (e.g. pierwszy, drugi, trzeci)
                          should be parsed to their number values (1, 2, 3...)

    Returns:
        str
        The lower-cased text, with numbers subbed in where appropriate.

    """
    tokens = tokenize(text.lower())
    pending = _extract_numbers_with_text_pl(tokens, short_scale, ordinals)
    pending.sort(key=lambda number: number.start_index)

    words = []
    for token in tokens:
        # Tokens before the next number (or after the last one) are copied.
        if not pending or token.index < pending[0].start_index:
            words.append(token.word)
            continue

        current = pending[0]
        # The first token of a number is replaced by the value; the rest of
        # the number's tokens are dropped.
        if token.index == current.start_index:
            words.append(str(current.value))
        if token.index == current.end_index:
            pending.pop(0)

    return ' '.join(words)
def _extract_number_with_text_pl_helper(tokens,
                                        short_scale=True, ordinals=False,
                                        fractional_numbers=True):
    """
    Helper for _extract_number_with_text_pl.

    Tries the fraction extractor, then the decimal extractor (both only
    when fractional_numbers is set), and falls back to the whole-number
    extractor.

    Args:
        tokens [Token]:
        short_scale boolean:
        ordinals boolean:
        fractional_numbers boolean:

    Returns:
        int or float, [Tokens]

    """
    if fractional_numbers:
        extractors = (_extract_fraction_with_text_pl,
                      _extract_decimal_with_text_pl)
        for extract in extractors:
            value, value_tokens = extract(tokens, short_scale, ordinals)
            # a falsy value (None or 0) means "not found", try the next one
            if value:
                return value, value_tokens

    return _extract_whole_number_with_text_pl(tokens, short_scale, ordinals)
def _extract_decimal_with_text_pl(tokens, short_scale, ordinals):
    """
    Extract decimal numbers from a string.

    This function handles text such as '2 przecinek 5'.

    Notes:
        The parts on either side of the decimal marker are parsed with
        _extract_numbers_with_text_pl.

        This does not currently handle things like:
            number dot number number number

    Args:
        tokens [Token]: The text to parse.
        short_scale boolean:
        ordinals boolean:

    Returns:
        (float, [Token])
        The value found and relevant tokens.
        (None, None) if no decimal value is found.

    """
    for marker in _DECIMAL_MARKER:
        partitions = partition_list(tokens, lambda t: t.word == marker)
        if len(partitions) != 3:
            continue

        before, separator, after = partitions
        whole_candidates = \
            _extract_numbers_with_text_pl(before, short_scale,
                                          ordinals, fractional_numbers=False)
        decimal_candidates = \
            _extract_numbers_with_text_pl(after, short_scale,
                                          ordinals, fractional_numbers=False)

        if not (whole_candidates and decimal_candidates):
            return None, None

        # the number right before the marker and the one right after it
        number = whole_candidates[-1]
        decimal = decimal_candidates[0]

        # TODO handle number dot number number number
        if "." in str(decimal.text):
            continue

        value = number.value + float('0.' + str(decimal.value))
        return value, number.tokens + separator + decimal.tokens

    return None, None
+ word = word[:-1] + + word = normalize_word_pl(word) + + if word not in string_num_scale and \ + word not in _STRING_NUM_PL and \ + word not in _SUMS and \ + word not in multiplies and \ + not (ordinals and word in string_num_ordinal) and \ + not is_numeric(word) and \ + not isFractional_pl(word) and \ + not look_for_fractions(word.split('/')): + words_only = [token.word for token in number_words] + if number_words and not all([w in _NEGATIVES for w in words_only]): + break + else: + number_words = [] + continue + elif word not in multiplies \ + and prev_word not in multiplies \ + and prev_word not in _SHORT_SCALE_PL.values() \ + and prev_word not in _SUMS \ + and not (ordinals and prev_word in string_num_ordinal) \ + and prev_word not in _NEGATIVES: + number_words = [token] + elif prev_word in _SUMS and word in _SUMS: + number_words = [token] + else: + number_words.append(token) + + # is this word already a number ? + if is_numeric(word): + if word.isdigit(): # doesn't work with decimals + val = int(word) + else: + val = float(word) + current_val = val + + # is this word the name of a number ? + if word in _STRING_NUM_PL: + val = _STRING_NUM_PL.get(word) + current_val = val + elif word in string_num_scale: + val = string_num_scale.get(word) + current_val = val + elif ordinals and word in string_num_ordinal: + val = string_num_ordinal[word] + current_val = val + + if word in multiplies: + if not prev_val: + prev_val = 1 + val = prev_val * val + prev_val = None + + # is the prev word a number and should we sum it? + # twenty two, fifty six + if prev_val: + if (prev_word in string_num_ordinal and val and val < prev_val) or \ + (prev_word in _STRING_NUM_PL and val and val < prev_val and val // 10 != prev_val // 10) or \ + all([prev_word in multiplies, val < prev_val if prev_val else False]): + val += prev_val + + if next_word in multiplies: + prev_val = val + continue + + # is this a spoken fraction? 
+ # half cup + if val is False: + val = isFractional_pl(word) + current_val = val + + # 2 fifths + if not ordinals: + next_val = isFractional_pl(next_word) + if next_val: + if not val: + val = 1 + val *= next_val + number_words.append(tokens[idx + 1]) + + # is this a negative number? + if val and prev_word and prev_word in _NEGATIVES: + val = 0 - val + + if next_word in _STRING_NUM_PL: + prev_val = val + + # let's make sure it isn't a fraction + if not val: + # look for fractions like "2/3" + aPieces = word.split('/') + if look_for_fractions(aPieces): + val = float(aPieces[0]) / float(aPieces[1]) + number_words.append(tokens[idx + 1]) + else: + if all([ + prev_word in _SUMS, + word not in _SUMS, + word not in multiplies, + current_val >= 10]): + # Backtrack - we've got numbers we can't sum. + number_words.pop() + val = prev_val + break + prev_val = val + + if word in multiplies and next_word not in multiplies: + # handle long numbers + # six hundred sixty six + # two million five hundred thousand + # + # This logic is somewhat complex, and warrants + # extensive documentation for the next coder's sake. + # + # The current word is a power of ten. `current_val` is + # its integer value. `val` is our working sum + # (above, when `current_val` is 1 million, `val` is + # 2 million.) + # + # We have a dict `string_num_scale` containing [value, word] + # pairs for "all" powers of ten: string_num_scale[10] == "ten. + # + # We need go over the rest of the tokens, looking for other + # powers of ten. If we find one, we compare it with the current + # value, to see if it's smaller than the current power of ten. + # + # Numbers which are not powers of ten will be passed over. + # + # If all the remaining powers of ten are smaller than our + # current value, we can set the current value aside for later, + # and begin extracting another portion of our final result. + # For example, suppose we have the following string. + # The current word is "million".`val` is 9000000. 
+ # `current_val` is 1000000. + # + # "nine **million** nine *hundred* seven **thousand** + # six *hundred* fifty seven" + # + # Iterating over the rest of the string, the current + # value is larger than all remaining powers of ten. + # + # The if statement passes, and nine million (9000000) + # is appended to `to_sum`. + # + # The main variables are reset, and the main loop begins + # assembling another number, which will also be appended + # under the same conditions. + # + # By the end of the main loop, to_sum will be a list of each + # "place" from 100 up: [9000000, 907000, 600] + # + # The final three digits will be added to the sum of that list + # at the end of the main loop, to produce the extracted number: + # + # sum([9000000, 907000, 600]) + 57 + # == 9,000,000 + 907,000 + 600 + 57 + # == 9,907,657 + # + # >>> foo = "nine million nine hundred seven thousand six + # hundred fifty seven" + # >>> extract_number(foo) + # 9907657 + + time_to_sum = True + for other_token in tokens[idx+1:]: + if other_token.word in multiplies: + if string_num_scale[other_token.word] >= current_val: + time_to_sum = False + else: + continue + if not time_to_sum: + break + if time_to_sum: + to_sum.append(val) + val = 0 + prev_val = 0 + + if val is not None and to_sum: + val += sum(to_sum) + + return val, number_words + + +def _initialize_number_data(short_scale): + """ + Generate dictionaries of words to numbers, based on scale. + + This is a helper function for _extract_whole_number. 
+ + Args: + short_scale boolean: + + Returns: + (set(str), dict(str, number), dict(str, number)) + multiplies, string_num_ordinal, string_num_scale + + """ + multiplies = _MULTIPLIES_SHORT_SCALE_PL + + string_num_scale = invert_dict(_SHORT_SCALE_PL) + string_num_scale.update(generate_plurals_pl(string_num_scale)) + return multiplies, _STRING_SHORT_ORDINAL_PL, string_num_scale + + +def extract_number_pl(text, short_scale=True, ordinals=False): + """ + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + Returns: + (int) or (float) or False: The extracted number or False if no number + was found + + """ + return _extract_number_with_text_pl(tokenize(text.lower()), + True, ordinals).value + + +def extract_duration_pl(text): + """ + Convert an english phrase into a number of seconds + + Convert things like: + "10 minute" + "2 and a half hours" + "3 days 8 hours 10 minutes and 49 seconds" + into an int, representing the total number of seconds. + + The words used in the duration will be consumed, and + the remainder returned. + + As an example, "set a timer for 5 minutes" would return + (300, "set a timer for"). + + Args: + text (str): string containing a duration + + Returns: + (timedelta, str): + A tuple containing the duration and the remaining text + not consumed in the parsing. The first value will + be None if no duration is found. The text returned + will have whitespace stripped from the ends. + """ + if not text: + return None + + time_units = { + 'microseconds': None, + 'milliseconds': None, + 'seconds': None, + 'minutes': None, + 'hours': None, + 'days': None, + 'weeks': None + } + + pattern = r"(?P\d+(?:\.?\d+)?)(?:\s+|\-){unit}[ayeę]?" 
+ text = _convert_words_to_numbers_pl(text) + + for unit in _TIME_UNITS_CONVERSION: + unit_pattern = pattern.format(unit=unit) + matches = re.findall(unit_pattern, text) + value = sum(map(float, matches)) + unit_en = _TIME_UNITS_CONVERSION.get(unit) + if time_units[unit_en] is None or time_units.get(unit_en) == 0: + time_units[unit_en] = value + text = re.sub(unit_pattern, '', text) + + text = text.strip() + duration = timedelta(**time_units) if any(time_units.values()) else None + + return (duration, text) + + +def extract_datetime_pl(string, dateNow=None, default_time=None): + """ Convert a human date reference into an exact datetime + + Convert things like + "today" + "tomorrow afternoon" + "next Tuesday at 4pm" + "August 3rd" + into a datetime. If a reference date is not provided, the current + local time is used. Also consumes the words used to define the date + returning the remaining string. For example, the string + "what is Tuesday's weather forecast" + returns the date for the forthcoming Tuesday relative to the reference + date and the remainder string + "what is weather forecast". + + The "next" instance of a day or weekend is considered to be no earlier than + 48 hours in the future. On Friday, "next Monday" would be in 3 days. + On Saturday, "next Monday" would be in 9 days. + + Args: + string (str): string containing date words + dateNow (datetime): A reference date/time for "tommorrow", etc + default_time (time): Time to set if no time was found in the string + + Returns: + [datetime, str]: An array containing the datetime and the remaining + text not consumed in the parsing, or None if no + date or time related text was found. + """ + + def clean_string(s): + # clean unneeded punctuation and capitalization among other things. 
+ s = s.lower().replace('?', '').replace('.', '').replace(',', '') \ + .replace("para", "2") + + wordList = s.split() + for idx, word in enumerate(wordList): + ordinals = ["ci", "szy", "gi"] + if word[0].isdigit(): + for ordinal in ordinals: + if ordinal in word: + word = word.replace(ordinal, "") + wordList[idx] = word + + return wordList + + def date_found(): + return found or \ + ( + datestr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs or minOffset != 0 or + minAbs or secOffset != 0 + ) + + if string == "" or not dateNow: + return None + + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + today = dateNow.strftime("%w") + currentYear = dateNow.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier = "" + + timeQualifiersAM = ['rano'] + timeQualifiersPM = ['wieczór', 'w nocy'] + timeQualifiersList = set(timeQualifiersAM + timeQualifiersPM) + markers = ['na', 'w', 'we', 'na', 'przez', 'ten', 'około', 'dla', 'o', "pomiędzy", 'za', 'do'] + days = list(_DAYS_TO_EN.keys()) + recur_markers = days + ['weekend', 'weekendy'] + monthsShort = ['sty', 'lut', 'mar', 'kwi', 'maj', 'cze', 'lip', 'sie', + 'wrz', 'paź', 'lis', 'gru'] + year_multiples = ['dekada', 'wiek', 'milenia'] + + words = clean_string(string) + + for idx, word in enumerate(words): + if word == "": + continue + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + + # this isn't in clean string because I don't want to save back to words + start = idx + used = 0 + # save timequalifier for later + if word == 'w' and wordNext == 'tę': + used += 2 + if word == "temu" and dayOffset: + dayOffset = - dayOffset + used += 1 + if word == "teraz" and not datestr: + resultStr = " ".join(words[idx + 1:]) + resultStr = ' '.join(resultStr.split()) + extractedDate = 
dateNow.replace(microsecond=0) + return [extractedDate, resultStr] + elif wordNext in year_multiples: + multiplier = None + if is_numeric(word): + multiplier = extract_number_pl(word) + multiplier = multiplier or 1 + multiplier = int(multiplier) + used += 2 + if _TIME_UNITS_NORMALIZATION.get(wordNext) == "dekada": + yearOffset = multiplier * 10 + elif _TIME_UNITS_NORMALIZATION.get(wordNext) == "wiek": + yearOffset = multiplier * 100 + elif _TIME_UNITS_NORMALIZATION.get(wordNext) == "milenia": + yearOffset = multiplier * 1000 + elif word in timeQualifiersList: + timeQualifier = word + # parse today, tomorrow, day after tomorrow + elif word == "dzisiaj" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "jutro" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "przedwczoraj" and not fromFlag: + dayOffset = -2 + used += 1 + elif word == "wczoraj" and not fromFlag: + dayOffset = -1 + used += 1 + elif word == "pojutrze" and not fromFlag: + dayOffset = 2 + used = 1 + elif word == "dzień" and wordNext != 'robocze': + if wordPrev and wordPrev[0].isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used = 2 + elif word == "tydzień" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + elif wordPrev == "następny": + dayOffset = 7 + start -= 1 + used = 2 + elif wordPrev == "poprzedni" or wordPrev == 'ostatni': + dayOffset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "miesiąc" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == "następny": + monthOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "poprzedni" or wordPrev == 'ostatni': + monthOffset = -1 + start -= 1 + used = 2 + # parse 5 years, next year, last year + elif word == "rok" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == 
"następny": + yearOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "poprzedni" or wordPrev == 'ostatni': + yearOffset = -1 + start -= 1 + used = 2 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. + elif word in days and not fromFlag: + d = _DAYS_TO_EN.get(word) + dayOffset = (d + 1) - int(today) + used = 1 + if dayOffset < 0: + dayOffset += 7 + if wordPrev == "następny": + if dayOffset <= 2: + dayOffset += 7 + used += 1 + start -= 1 + elif wordPrev == "poprzedni" or wordPrev == 'ostatni': + dayOffset -= 7 + used += 1 + start -= 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in _MONTHS_TO_EN or word in monthsShort and not fromFlag: + used += 1 + datestr = _MONTHS_TO_EN[word] + if wordPrev and wordPrev[0].isdigit(): + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " " + wordNextNext + used += 1 + hasYear = True + else: + hasYear = False + + # parse 5 days from tomorrow, 10 weeks from next thursday, + # 2 months from July + validFollowups = days + list(_MONTHS_TO_EN.keys()) + monthsShort + validFollowups.append("dzisiaj") + validFollowups.append("jutro") + validFollowups.append("wczoraj") + validFollowups.append("następny") + validFollowups.append("poprzedni") + validFollowups.append('ostatni') + validFollowups.append("teraz") + validFollowups.append("tego") + if (word == "od" or word == "po") and wordNext in validFollowups: + used = 2 + fromFlag = True + if wordNext == "jutro": + dayOffset += 1 + elif wordNext == "wczoraj": + dayOffset -= 1 + elif wordNext in days: + d = _DAYS_TO_EN.get(wordNext) + tmpOffset = (d + 1) - int(today) + used = 2 + if tmpOffset < 0: + tmpOffset += 7 + dayOffset += tmpOffset + elif wordNextNext and wordNextNext in 
days: + d = _DAYS_TO_EN.get(wordNextNext) + tmpOffset = (d + 1) - int(today) + used = 3 + if wordNext == "następny": + if dayOffset <= 2: + tmpOffset += 7 + used += 1 + start -= 1 + elif wordNext == "poprzedni" or wordNext == 'ostatni': + tmpOffset -= 7 + used += 1 + start -= 1 + dayOffset += tmpOffset + if used > 0: + if start - 1 > 0 and words[start - 1] == "ten": # this + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + military = False + + for idx, word in enumerate(words): + if word == "": + continue + + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "południe": + hrAbs = 12 + used += 1 + elif word == "północ" or word == 'północy': + hrAbs = 0 + used += 1 + elif word == "rano": + if hrAbs is None: + hrAbs = 8 + used += 1 + elif word == "po" and wordNext == "południu": + if hrAbs is None: + hrAbs = 15 + used += 2 + elif word == "wieczór" or word == 'wieczorem': + if hrAbs is None: + hrAbs = 19 + used += 1 + elif word == "nocy": + if hrAbs is None: + hrAbs = 22 + used += 1 + # parse half an hour, quarter hour + elif word == "godzina" and (wordPrev.isdigit() or wordPrev in markers or wordPrevPrev in markers): + if wordPrev == "pół": + minOffset = 30 + else: + hrOffset = 1 + if wordPrevPrev in markers: + words[idx - 2] = "" + if wordPrevPrev == "dzisiaj": + daySpecified = True + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + # parse in a minute + elif word == "minuta" and (wordPrev.isdigit() or wordPrev in markers): + minOffset = 1 + 
words[idx - 1] = "" + used += 1 + # parse in a second + elif word == "sekunda" and (wordPrev.isdigit() or wordPrev in markers): + secOffset = 1 + words[idx - 1] = "" + used += 1 + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + if wordNext == "wieczorem" or wordPrev == "wieczorem" or \ + wordNext == 'wieczór' or wordPrev == 'wieczór' or \ + (wordNext == 'po' and wordNextNext == 'południu'): + remainder = "pm" + used += 2 if wordNext == 'po' else 1 + if wordPrev == "wieczorem" or wordPrev == 'wieczór': + words[idx - 1] = "" + + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + if wordNext == "rano": + remainder = "am" + used += 1 + elif wordNext == "po" and wordNextNext == "południu": + remainder = "pm" + used += 2 + elif wordNext == "wieczorem": + remainder = "pm" + used += 1 + elif wordNext == "rano": + remainder = "am" + used += 1 + elif wordNext == "w" and wordNextNext == "nocy": + if strHH and int(strHH) > 5: + remainder = "pm" + else: + remainder = "am" + used += 2 + + else: + if timeQualifier != "": + military = True + if strHH and int(strHH) <= 12 and \ + (timeQualifier in timeQualifiersPM): + strHH += str(int(strHH) + 12) + + else: + # try to parse numbers without colons + # 5 hours, 10 minutes etc. 
+ length = len(word) + strNum = "" + remainder = "" + wordNextNextNext = words[idx + 3] \ + if idx + 3 < len(words) else "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = wordNext.replace(".", "").lstrip().rstrip() + if ( + remainder == "pm" or + (word[0].isdigit() and (wordNext == 'wieczorem' or wordNext == 'wieczór')) or + (word[0].isdigit() and wordNext == 'po' and wordNextNext == 'południu') or + (word[0].isdigit() and wordNext == 'w' and wordNextNext == 'nocy')): + strHH = strNum + remainder = "pm" + used = 2 if wordNext in ['po', 'w'] else 1 + elif ( + remainder == "am" or + (word[0].isdigit() and wordNext == 'rano')): + strHH = strNum + remainder = "am" + used = 1 + elif ( + remainder in recur_markers or + wordNext in recur_markers or + wordNextNext in recur_markers or ( + wordNext == 'w' and wordNextNext == 'dzień' and + wordNextNextNext == 'robocze' + )): + # Ex: "7 on mondays" or "3 this friday" + # Set strHH so that isTime == True + # when am or pm is not specified + strHH = strNum + used = 1 + else: + if _TIME_UNITS_NORMALIZATION.get(wordNext) == "godzina" or \ + _TIME_UNITS_NORMALIZATION.get(remainder) == "godzina": + # "in 10 hours" + hrOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif _TIME_UNITS_NORMALIZATION.get(wordNext) == "minuta" or \ + _TIME_UNITS_NORMALIZATION.get(remainder) == "minuta": + # "in 10 minutes" + minOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif _TIME_UNITS_NORMALIZATION.get(wordNext) == "sekunda" \ + or _TIME_UNITS_NORMALIZATION.get(remainder) == "sekunda": + # in 5 seconds + secOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif int(strNum) > 100: + # military time, eg. 
"3300 hours" + strHH = str(int(strNum) // 100) + strMM = str(int(strNum) % 100) + military = True + if _TIME_UNITS_NORMALIZATION.get(wordNext) == "godzina" or \ + _TIME_UNITS_NORMALIZATION.get(remainder) == "godzina": + used += 1 + elif wordNext and wordNext[0].isdigit(): + # military time, e.g. "04 38 hours" + strHH = strNum + strMM = wordNext + military = True + used += 1 + elif ( + wordNext == "" or wordNext == "w" or wordNext == 'nocy' or + wordNextNext == 'nocy'): + strHH = strNum + strMM = "00" + + if wordNext == "za" or wordNextNext == "za": + used += (1 if wordNext == "za" else 2) + wordNextNextNext = words[idx + 3] \ + if idx + 3 < len(words) else "" + + if (wordNextNext and + (wordNextNext in timeQualifier or + wordNextNextNext in timeQualifier)): + if (wordNextNext in timeQualifiersPM or + wordNextNextNext in timeQualifiersPM): + remainder = "pm" + used += 1 + if (wordNextNext in timeQualifiersAM or + wordNextNextNext in timeQualifiersAM): + remainder = "am" + used += 1 + + if timeQualifier != "": + if timeQualifier in timeQualifiersPM: + remainder = "pm" + used += 1 + + elif timeQualifier in timeQualifiersAM: + remainder = "am" + used += 1 + else: + # TODO: Unsure if this is 100% accurate + used += 1 + military = True + else: + isTime = False + HH = int(strHH) if strHH else 0 + MM = int(strMM) if strMM else 0 + HH = HH + 12 if remainder == "pm" and HH < 12 else HH + HH = HH - 12 if remainder == "am" and HH >= 12 else HH + + if (not military and + remainder not in ['am', 'pm'] and + remainder not in _TIME_UNITS_NORMALIZATION and + ((not daySpecified) or 0 <= dayOffset < 1)): + + # ambiguous time, detect whether they mean this evening or + # the next morning based on whether it has already passed + if dateNow.hour < HH or (dateNow.hour == HH and + dateNow.minute < MM): + pass # No modification needed + elif dateNow.hour < HH + 12: + HH += 12 + else: + # has passed, assume the next morning + dayOffset += 1 + + if timeQualifier in timeQualifiersPM and HH < 
12: + HH += 12 + + if HH > 24 or MM > 59: + isTime = False + used = 0 + if isTime: + hrAbs = HH + minAbs = MM + used += 1 + + if used > 0: + # removed parsed words from the sentence + for i in range(used): + if idx + i >= len(words): + break + words[idx + i] = "" + + if wordPrev == "rano": + hrOffset = -1 + words[idx - 1] = "" + idx -= 1 + elif wordPrev == "wieczorem": + hrOffset = 1 + words[idx - 1] = "" + idx -= 1 + if idx > 0 and wordPrev in markers: + words[idx - 1] = "" + if wordPrev == "najbliższą": + daySpecified = True + if idx > 1 and wordPrevPrev in markers: + words[idx - 2] = "" + if wordPrevPrev == "najbliższą": + daySpecified = True + + idx += used - 1 + found = True + # check that we found a date + if not date_found(): + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = dateNow.replace(microsecond=0) + + if datestr != "": + # date included an explicit date, e.g. "june 5" or "june 2, 2017" + try: + temp = datetime.strptime(datestr, "%B %d") + except ValueError: + # Try again, allowing the year + temp = datetime.strptime(datestr, "%B %d %Y") + extractedDate = extractedDate.replace(hour=0, minute=0, second=0) + if not hasYear: + temp = temp.replace(year=extractedDate.year, + tzinfo=extractedDate.tzinfo) + if extractedDate < temp: + extractedDate = extractedDate.replace( + year=int(currentYear), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + # ignore the current HH:MM:SS if relative using days or greater + if hrOffset == 0 and minOffset == 0 and secOffset == 0: + extractedDate = 
extractedDate.replace(hour=0, minute=0, second=0) + + if yearOffset != 0: + extractedDate = extractedDate + relativedelta(years=yearOffset) + if monthOffset != 0: + extractedDate = extractedDate + relativedelta(months=monthOffset) + if dayOffset != 0: + extractedDate = extractedDate + relativedelta(days=dayOffset) + if hrAbs != -1 and minAbs != -1: + # If no time was supplied in the string set the time to default + # time if it's available + if hrAbs is None and minAbs is None and default_time is not None: + hrAbs, minAbs = default_time.hour, default_time.minute + else: + hrAbs = hrAbs or 0 + minAbs = minAbs or 0 + + extractedDate = extractedDate + relativedelta(hours=hrAbs, + minutes=minAbs) + if (hrAbs != 0 or minAbs != 0) and datestr == "": + if not daySpecified and dateNow > extractedDate: + extractedDate = extractedDate + relativedelta(days=1) + if hrOffset != 0: + extractedDate = extractedDate + relativedelta(hours=hrOffset) + if minOffset != 0: + extractedDate = extractedDate + relativedelta(minutes=minOffset) + if secOffset != 0: + extractedDate = extractedDate + relativedelta(seconds=secOffset) + for idx, word in enumerate(words): + if words[idx] == "i" and \ + words[idx - 1] == "" and words[idx + 1] == "": + words[idx] = "" + + resultStr = " ".join(words) + resultStr = ' '.join(resultStr.split()) + return [extractedDate, resultStr] + + +def isFractional_pl(input_str, short_scale=True): + """ + This function takes the given text and checks if it is a fraction. + + Args: + input_str (str): the string to check if fractional + short_scale (bool): use short scale if True, long scale if False + Returns: + (bool) or (float): False if not a fraction, otherwise the fraction + + """ + lower_input = input_str.lower() + if lower_input in _REV_FRACTITONS: + return 1.0 / _REV_FRACTITONS[lower_input] + + return False + + +def extract_numbers_pl(text, short_scale=True, ordinals=False): + """ + Takes in a string and extracts a list of numbers. 
+ + Args: + text (str): the string to extract a number from + short_scale (bool): Use "short scale" or "long scale" for large + numbers -- over a million. The default is short scale, which + is now common in most English speaking countries. + See https://en.wikipedia.org/wiki/Names_of_large_numbers + ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Returns: + list: list of extracted numbers as floats + """ + results = _extract_numbers_with_text_pl(tokenize(text), + short_scale, ordinals) + return [float(result.value) for result in results] + + +def normalize_word_pl(word): + if word.startswith('jedn'): + suffix = 'ą', 'ej', 'ym' + if word.endswith(suffix): + return 'jedna' + if word == 'dwie': + return 'dwa' + + return word + + +def normalize_pl(text, remove_articles=True): + """ Polish string normalization """ + + words = text.split() # this also removed extra spaces + normalized = "" + for word in words: + if remove_articles and word in ["i"]: + continue + + if word in _TIME_UNITS_NORMALIZATION: + word = _TIME_UNITS_NORMALIZATION[word] + + if word in _REV_FRACTITONS: + word = str(_REV_FRACTITONS[word]) + + if word in _ORDINAL_BASE_PL.values(): + word = str(list(_ORDINAL_BASE_PL.keys())[list(_ORDINAL_BASE_PL.values()).index(word)]) + + if word in _NUM_STRING_PL.values(): + word = str(list(_NUM_STRING_PL.keys())[list(_NUM_STRING_PL.values()).index(word)]) + + if word in _ALT_ORDINALS_PL.values(): + word = str(list(_ALT_ORDINALS_PL.keys())[list(_ALT_ORDINALS_PL.values()).index(word)]) + + if word == 'następną' or word == 'następna' or word == 'następnym' or word == 'następnej': + word = 'następny' + elif word == 'ostatnią' or word == 'ostatnia' or word == 'ostatnim' or word == 'ostatniej' or \ + word == 'poprzednią' or word == 'poprzednia' or word == 'poprzednim' or word == 'poprzedniej': + word = 'poprzedni' + elif word == 'jutra' or word == 'jutrze': + word = 'jutro' + elif word == 'wieczorem': + word = 'wieczór' + elif word == 'poranne': + 
word = 'rano' + + normalized += " " + word + + return normalized[1:] # strip the initial space diff --git a/lingua_franca/parse.py b/lingua_franca/parse.py index 6af573d0..b357ada3 100644 --- a/lingua_franca/parse.py +++ b/lingua_franca/parse.py @@ -172,19 +172,19 @@ def extract_datetime(text, anchorDate=None, lang=None, default_time=None): >>> extract_datetime( ... "What is the weather like the day after tomorrow?", - ... datetime(2017, 06, 30, 00, 00) + ... datetime(2017, 6, 30, 00, 00) ... ) [datetime.datetime(2017, 7, 2, 0, 0), 'what is weather like'] >>> extract_datetime( ... "Set up an appointment 2 weeks from Sunday at 5 pm", - ... datetime(2016, 02, 19, 00, 00) + ... datetime(2016, 2, 19, 00, 00) ... ) [datetime.datetime(2016, 3, 6, 17, 0), 'set up appointment'] >>> extract_datetime( ... "Set up an appointment", - ... datetime(2016, 02, 19, 00, 00) + ... datetime(2016, 2, 19, 00, 00) ... ) None """ diff --git a/lingua_franca/res/text/pl-pl/and.word b/lingua_franca/res/text/pl-pl/and.word new file mode 100644 index 00000000..0ddf2bae --- /dev/null +++ b/lingua_franca/res/text/pl-pl/and.word @@ -0,0 +1 @@ +i diff --git a/lingua_franca/res/text/pl-pl/day.word b/lingua_franca/res/text/pl-pl/day.word new file mode 100644 index 00000000..fdd7af4d --- /dev/null +++ b/lingua_franca/res/text/pl-pl/day.word @@ -0,0 +1 @@ +dzień diff --git a/lingua_franca/res/text/pl-pl/days.word b/lingua_franca/res/text/pl-pl/days.word new file mode 100644 index 00000000..a7738f00 --- /dev/null +++ b/lingua_franca/res/text/pl-pl/days.word @@ -0,0 +1 @@ +dni diff --git a/lingua_franca/res/text/pl-pl/hour.word b/lingua_franca/res/text/pl-pl/hour.word new file mode 100644 index 00000000..8d8bde0b --- /dev/null +++ b/lingua_franca/res/text/pl-pl/hour.word @@ -0,0 +1 @@ +godzina diff --git a/lingua_franca/res/text/pl-pl/hours.word b/lingua_franca/res/text/pl-pl/hours.word new file mode 100644 index 00000000..0973a517 --- /dev/null +++ b/lingua_franca/res/text/pl-pl/hours.word @@ -0,0 +1 
@@ +godzin diff --git a/lingua_franca/res/text/pl-pl/minute.word b/lingua_franca/res/text/pl-pl/minute.word new file mode 100644 index 00000000..1b52e617 --- /dev/null +++ b/lingua_franca/res/text/pl-pl/minute.word @@ -0,0 +1 @@ +minuta diff --git a/lingua_franca/res/text/pl-pl/minutes.word b/lingua_franca/res/text/pl-pl/minutes.word new file mode 100644 index 00000000..7a715f81 --- /dev/null +++ b/lingua_franca/res/text/pl-pl/minutes.word @@ -0,0 +1 @@ +minut diff --git a/lingua_franca/res/text/pl-pl/or.word b/lingua_franca/res/text/pl-pl/or.word new file mode 100644 index 00000000..54b24883 --- /dev/null +++ b/lingua_franca/res/text/pl-pl/or.word @@ -0,0 +1 @@ +lub diff --git a/lingua_franca/res/text/pl-pl/second.word b/lingua_franca/res/text/pl-pl/second.word new file mode 100644 index 00000000..f07a357f --- /dev/null +++ b/lingua_franca/res/text/pl-pl/second.word @@ -0,0 +1 @@ +sekunda diff --git a/lingua_franca/res/text/pl-pl/seconds.word b/lingua_franca/res/text/pl-pl/seconds.word new file mode 100644 index 00000000..bbf810ee --- /dev/null +++ b/lingua_franca/res/text/pl-pl/seconds.word @@ -0,0 +1 @@ +sekund diff --git a/test/test_format_pl.py b/test/test_format_pl.py new file mode 100755 index 00000000..bf1d99d7 --- /dev/null +++ b/test/test_format_pl.py @@ -0,0 +1,348 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import unittest +import datetime +import sys + +from lingua_franca.format import nice_number +from lingua_franca.format import nice_time +from lingua_franca.format import nice_duration +from lingua_franca.format import pronounce_number + +from lingua_franca import get_default_lang, set_default_lang, \ + load_language, unload_language + + +def setUpModule(): + load_language("pl-pl") + set_default_lang("pl") + + +def tearDownModule(): + unload_language("pl") + + +NUMBERS_FIXTURE_PL = { + 1.435634: '1.436', + 2: '2', + 5.0: '5', + 0.027: '0.027', + 0.5: '1 druga', + 1.333: '1 i 1 trzecia', + 2.666: '2 i 2 trzecie', + 0.25: '1 czwarta', + 1.25: '1 i 1 czwarta', + 0.75: '3 czwarte', + 1.75: '1 i 3 czwarte', + 3.4: '3 i 2 piąte', + 16.8333: '16 i 5 szóste', + 12.5714: '12 i 4 siódme', + 9.625: '9 i 5 ósme', + 6.777: '6 i 7 dziewiąte', + 3.1: '3 i 1 dziesiąta', + 2.272: '2 i 3 jedenaste', + 5.583: '5 i 7 dwunaste', + 8.384: '8 i 5 trzynaste', + 0.071: '1 czternasta', + 6.466: '6 i 7 piętnaste', + 8.312: '8 i 5 szesnaste', + 2.176: '2 i 3 siedemnaste', + 200.722: '200 i 13 osiemnaste', + 7.421: '7 i 8 dziewiętnaste', + 0.05: '1 dwudziesta' +} + + +class TestNiceNumberFormat(unittest.TestCase): + def test_convert_float_to_nice_number(self): + for number, number_str in NUMBERS_FIXTURE_PL.items(): + self.assertEqual(nice_number(number, lang='pl'), number_str, + 'should format {} as {} and not {}'.format( + number, number_str, nice_number(number))) + + def test_specify_denominator(self): + self.assertEqual(nice_number(5.5, denominators=[1, 2, 3]), + '5 i 1 druga', + 'should format 5.5 as 5 and a half not {}'.format( + nice_number(5.5, denominators=[1, 2, 3]))) + self.assertEqual(nice_number(2.333, denominators=[1, 2]), + '2.333', + 'should format 2.333 as 2.333 not {}'.format( + nice_number(2.333, denominators=[1, 2]))) + + def test_no_speech(self): + self.assertEqual(nice_number(6.777, speech=False), + '6 7/9', + 'should format 6.777 as 6 7/9 not {}'.format( + 
nice_number(6.777, speech=False))) + self.assertEqual(nice_number(6.0, speech=False), + '6', + 'should format 6.0 as 6 not {}'.format( + nice_number(6.0, speech=False))) + + +class TestPronounceNumber(unittest.TestCase): + def test_convert_int(self): + self.assertEqual(pronounce_number(0), "zero") + self.assertEqual(pronounce_number(1), "jeden") + self.assertEqual(pronounce_number(10), "dziesięć") + self.assertEqual(pronounce_number(15), "piętnaście") + self.assertEqual(pronounce_number(20), "dwadzieścia") + self.assertEqual(pronounce_number(27), "dwadzieścia siedem") + self.assertEqual(pronounce_number(30), "trzydzieści") + self.assertEqual(pronounce_number(33), "trzydzieści trzy") + + def test_convert_negative_int(self): + self.assertEqual(pronounce_number(-1), "minus jeden") + self.assertEqual(pronounce_number(-10), "minus dziesięć") + self.assertEqual(pronounce_number(-15), "minus piętnaście") + self.assertEqual(pronounce_number(-20), "minus dwadzieścia") + self.assertEqual(pronounce_number(-27), "minus dwadzieścia siedem") + self.assertEqual(pronounce_number(-30), "minus trzydzieści") + self.assertEqual(pronounce_number(-33), "minus trzydzieści trzy") + + def test_convert_decimals(self): + self.assertEqual(pronounce_number(0.05), "zero przecinek zero pięć") + self.assertEqual(pronounce_number(-0.05), "minus zero przecinek zero pięć") + self.assertEqual(pronounce_number(1.234), + "jeden przecinek dwa trzy") + self.assertEqual(pronounce_number(21.234), + "dwadzieścia jeden przecinek dwa trzy") + self.assertEqual(pronounce_number(21.234, places=1), + "dwadzieścia jeden przecinek dwa") + self.assertEqual(pronounce_number(21.234, places=0), + "dwadzieścia jeden") + self.assertEqual(pronounce_number(21.234, places=3), + "dwadzieścia jeden przecinek dwa trzy cztery") + self.assertEqual(pronounce_number(21.234, places=4), + "dwadzieścia jeden przecinek dwa trzy cztery") + self.assertEqual(pronounce_number(21.234, places=5), + "dwadzieścia jeden przecinek dwa trzy 
cztery") + self.assertEqual(pronounce_number(-1.234), + "minus jeden przecinek dwa trzy") + self.assertEqual(pronounce_number(-21.234), + "minus dwadzieścia jeden przecinek dwa trzy") + self.assertEqual(pronounce_number(-21.234, places=1), + "minus dwadzieścia jeden przecinek dwa") + self.assertEqual(pronounce_number(-21.234, places=0), + "minus dwadzieścia jeden") + self.assertEqual(pronounce_number(-21.234, places=3), + "minus dwadzieścia jeden przecinek dwa trzy cztery") + self.assertEqual(pronounce_number(-21.234, places=4), + "minus dwadzieścia jeden przecinek dwa trzy cztery") + self.assertEqual(pronounce_number(-21.234, places=5), + "minus dwadzieścia jeden przecinek dwa trzy cztery") + + def test_convert_hundreds(self): + self.assertEqual(pronounce_number(100), "sto") + self.assertEqual(pronounce_number(666), "sześćset sześćdziesiąt sześć") + self.assertEqual(pronounce_number(1456), "jeden tysiąc, czterysta pięćdziesiąt sześć") + self.assertEqual(pronounce_number(103254654), "sto trzy miliony, dwieście " + "pięćdziesiąt cztery tysiące, sześćset " + "pięćdziesiąt cztery") + self.assertEqual(pronounce_number(1512457), "jeden milion, pięćset dwanaście tysięcy, czterysta " + "pięćdziesiąt siedem") + self.assertEqual(pronounce_number(209996), "dwieście dziewięć tysięcy, dziewięćset " + "dziewięćdziesiąt sześć") + + def test_convert_scientific_notation(self): + self.assertEqual(pronounce_number(0, scientific=True), "zero") + self.assertEqual(pronounce_number(33, scientific=True), + "trzy przecinek trzy razy dziesięć do potęgi jeden") + self.assertEqual(pronounce_number(299792458, scientific=True), + "dwa przecinek dziewięć dziewięć razy dziesięć do potęgi osiem") + self.assertEqual(pronounce_number(299792458, places=6, + scientific=True), + "dwa przecinek dziewięć dziewięć siedem dziewięć dwa pięć razy " + "dziesięć do potęgi osiem") + self.assertEqual(pronounce_number(1.672e-27, places=3, + scientific=True), + "jeden przecinek sześć siedem dwa razy dziesięć do 
potęgi " + "minus dwadzieścia siedem") + + def test_auto_scientific_notation(self): + self.assertEqual( + pronounce_number(1.1e-150), "jeden przecinek jeden razy dziesięć do " + "potęgi minus sto pięćdziesiąt") + + def test_large_numbers(self): + self.assertEqual( + pronounce_number(299792458), + "dwieście dziewięćdziesiąt dziewięć milionów, siedemset " + "dziewięćdziesiąt dwa tysiące, czterysta pięćdziesiąt osiem") + self.assertEqual( + pronounce_number(100034000000299792458), + "sto trylionów, trzydzieści cztery biliardy, " + "dwieście dziewięćdziesiąt dziewięć milionów, siedemset " + "dziewięćdziesiąt dwa tysiące, czterysta pięćdziesiąt osiem") + self.assertEqual( + pronounce_number(10000000000), + "dziesięć miliardów") + self.assertEqual( + pronounce_number(1000001), + "jeden milion, jeden") + self.assertEqual(pronounce_number(95505896639631893), + "dziewięćdziesiąt pięć biliardów, pięćset pięć " + "bilionów, osiemset dziewięćdziesiąt sześć miliardów, " + "sześćset trzydzieści dziewięć milionów, sześćset " + "trzydzieści jeden tysiące, osiemset dziewięćdziesiąt trzy") + self.assertEqual(pronounce_number(10e80, places=1), + "tredecyliard") + self.assertEqual(pronounce_number(1.9874522571e80, places=9), + "sto dziewięćdziesiąt osiem tredecylionów, " + "siedemset czterdzieści pięć duodecyliardów, " + "dwieście dwadzieścia pięć duodecylionów, " + "siedemset dziewięć undecyliardów, " + "dziewięćset dziewięćdziesiąt dziewięć undecylionów, " + "dziewięćset osiemdziesiąt dziewięć decyliardów, " + "siedemset trzydzieści decylionów, dziewięćset " + "dziewiętnaście noniliardów, dziewięćset " + "dziewięćdziesiąt dziewięć nonilionów, dziewięćset " + "pięćdziesiąt pięć oktyliardów, czterysta " + "dziewięćdziesiąt osiem oktylionów, dwieście " + "czternaście septyliardy, osiemset " + "czterdzieści pięć septylionów, czterysta " + "dwadzieścia dziewięć sekstyliardów, czterysta " + "czterdzieści cztery sekstyliony, trzysta " + "trzydzieści sześć kwintyliardów, siedemset 
dwadzieścia " + "cztery kwintyliony, pięćset sześćdziesiąt dziewięć " + "kwadryliardów, trzysta siedemdziesiąt pięć " + "kwadrylionów, dwieście trzydzieści dziewięć sekstilionów," + " sześćset siedemdziesiąt trylionów, pięćset " + "siedemdziesiąt cztery biliardy, siedemset " + "trzydzieści dziewięć bilionów, siedemset czterdzieści " + "osiem miliardów, czterysta siedemdziesiąt milionów, " + "dziewięćset piętnaście tysięcy, siedemdziesiąt dwa") + + # infinity + self.assertEqual( + pronounce_number(sys.float_info.max * 2), "nieskończoność") + self.assertEqual( + pronounce_number(float("inf")), + "nieskończoność") + self.assertEqual( + pronounce_number(float("-inf")), + "minus nieskończoność") + + def test_ordinals(self): + self.assertEqual(pronounce_number(1, ordinals=True), "pierwszy") + self.assertEqual(pronounce_number(10, ordinals=True), "dziesiąty") + self.assertEqual(pronounce_number(15, ordinals=True), "piętnasty") + self.assertEqual(pronounce_number(20, ordinals=True), "dwudziesty") + self.assertEqual(pronounce_number(27, ordinals=True), "dwudziesty siódmy") + self.assertEqual(pronounce_number(30, ordinals=True), "trzydziesty") + self.assertEqual(pronounce_number(33, ordinals=True), "trzydziesty trzeci") + self.assertEqual(pronounce_number(100, ordinals=True), "setny") + self.assertEqual(pronounce_number(1000, ordinals=True), "tysięczny") + self.assertEqual(pronounce_number(10000, ordinals=True), + "dziesięcio tysięczny") + self.assertEqual(pronounce_number(18691, ordinals=True), + "osiemnaście tysięcy, sześćset dziewięćdziesiąty pierwszy") + self.assertEqual(pronounce_number(1567, ordinals=True), + "jeden tysiąc, pięćset sześćdziesiąty siódmy") + self.assertEqual(pronounce_number(1.672e-27, places=3, + scientific=True, ordinals=True), + "jeden przecinek sześć siedem dwa razy dziesięć do " + "minus dwudziestej siódmej potęgi") + self.assertEqual(pronounce_number(18e6, ordinals=True), + "osiemnasto milionowa") + self.assertEqual(pronounce_number(18e12, 
ordinals=True), + "osiemnasto bilionowa") + self.assertEqual(pronounce_number(18e18, ordinals=True, + short_scale=False), "osiemnasto " + "trylionowa") + + +class TestNiceDateFormat(unittest.TestCase): + def test_convert_times(self): + dt = datetime.datetime(2017, 1, 31, + 13, 22, 3) + + self.assertEqual(nice_time(dt), + "trzynasta dwadzieścia dwa") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "13:22") + self.assertEqual(nice_time(dt, use_24hour=True), + "trzynasta dwadzieścia dwa") + + dt = datetime.datetime(2017, 1, 31, + 13, 0, 3) + self.assertEqual(nice_time(dt), + "trzynasta zero zero") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "13:00") + self.assertEqual(nice_time(dt, use_24hour=True), + "trzynasta zero zero") + + dt = datetime.datetime(2017, 1, 31, + 13, 2, 3) + self.assertEqual(nice_time(dt), + "trzynasta dwa") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "13:02") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "trzynasta dwa") + + dt = datetime.datetime(2017, 1, 31, + 0, 2, 3) + self.assertEqual(nice_time(dt), + "dwa po północy") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "00:02") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "dwa po północy") + + dt = datetime.datetime(2018, 2, 8, + 1, 2, 33) + self.assertEqual(nice_time(dt), + "pierwsza dwa") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "01:02") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "pierwsza dwa") + + dt = datetime.datetime(2017, 1, 31, + 12, 15, 9) + self.assertEqual(nice_time(dt), + "dwunasta piętnaście") + + dt = datetime.datetime(2017, 1, 31, + 1, 45, 00) + self.assertEqual(nice_time(dt), + "pierwsza czterdzieści pięć") + + def test_nice_duration(self): + self.assertEqual(nice_duration(1), "jedna sekunda") + self.assertEqual(nice_duration(3), "trzy sekundy") + self.assertEqual(nice_duration(1, speech=False), 
"0:01") + self.assertEqual(nice_duration(61), "jedna minuta jedna sekunda") + self.assertEqual(nice_duration(61, speech=False), "1:01") + self.assertEqual(nice_duration(5000), + "jedna godzina dwadzieścia trzy minuty dwadzieścia sekund") + self.assertEqual(nice_duration(5000, speech=False), "1:23:20") + self.assertEqual(nice_duration(50000), + "trzynaście godzin pięćdziesiąt trzy minuty dwadzieścia sekund") + self.assertEqual(nice_duration(50000, speech=False), "13:53:20") + self.assertEqual(nice_duration(500000), + "pięć dni osiemnaście godzin pięćdziesiąt trzy minuty dwadzieścia sekund") # nopep8 + self.assertEqual(nice_duration(500000, speech=False), "5d 18:53:20") + self.assertEqual(nice_duration(datetime.timedelta(seconds=500000), + speech=False), + "5d 18:53:20") + + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_parse_pl.py b/test/test_parse_pl.py new file mode 100644 index 00000000..7c1136aa --- /dev/null +++ b/test/test_parse_pl.py @@ -0,0 +1,537 @@ +# +# Copyright 2017 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +import unittest +from datetime import datetime, timedelta + +from lingua_franca import get_default_lang, set_default_lang, \ + load_language, unload_language +from lingua_franca.parse import extract_datetime +from lingua_franca.parse import extract_duration +from lingua_franca.parse import extract_number, extract_numbers +from lingua_franca.parse import normalize + + +def setUpModule(): + load_language("pl-pl") + set_default_lang("pl") + + +def tearDownModule(): + unload_language("cs") + + +class TestNormalize(unittest.TestCase): + def test_extract_number(self): + self.assertEqual(extract_number('to jest pół testu'), 0.5) + self.assertEqual(extract_number("to jest pierwszy test", + ordinals=True), 1) + self.assertEqual(extract_number("to jest 2 test"), 2) + self.assertEqual(extract_number("to jest drugi test", + ordinals=True), 2) + self.assertEqual(extract_number("to jest trzeci test", + ordinals=True), 3.0) + self.assertEqual(extract_number("czwarty test", + ordinals=True), 4.0) + self.assertEqual(extract_number("trzydziesty szósty test", + ordinals=True), 36.0) + self.assertEqual(extract_number("to jest test numer 4"), 4) + self.assertEqual(extract_number("jedna trzecia szklanki"), 1.0 / 3.0) + self.assertEqual(extract_number("trzy szklanki"), 3) + self.assertEqual(extract_number("1/3 szklanki"), 1.0 / 3.0) + self.assertEqual(extract_number("jedna czwarta szklanki"), 0.25) + self.assertEqual(extract_number("1/4 szklanki"), 0.25) + self.assertEqual(extract_number("jedna czwarta szklanki"), 0.25) + self.assertEqual(extract_number("2/3 szklanki"), 2.0 / 3.0) + self.assertEqual(extract_number("3/4 szklanki"), 3.0 / 4.0) + self.assertEqual(extract_number("1 i 3/4 szklanki"), 1.75) + self.assertEqual(extract_number("1 szklanka i jedna druga"), 1.5) + self.assertEqual(extract_number("jedna szklanka i jedna druga"), 1.5) + self.assertEqual(extract_number("jeden i jedna druga szklanki"), 1.5) + self.assertEqual(extract_number("trzy czwarte szklanki"), 3.0 / 4.0) + 
self.assertEqual(extract_number("dwadzieścia dwa"), 22) + self.assertEqual(extract_number("Dwadzieścia dwa i trzy piąte"), 22.6) + self.assertEqual(extract_number("dwieście"), 200) + self.assertEqual(extract_number("dziewięć tysięcy"), 9000) + self.assertEqual(extract_number("sześćset sześćdziesiąt sześć"), 666) + self.assertEqual(extract_number("dwa miliony"), 2000000) + self.assertEqual(extract_number("dwa miliony pięćset tysięcy " + "ton metalu"), 2500000) + self.assertEqual(extract_number("sześć bilionów"), 6000000000000.0) + self.assertEqual(extract_number("jeden przecinek pięć"), 1.5) + self.assertEqual(extract_number("trzy kropka czternaście"), 3.14) + self.assertEqual(extract_number("zero przecinek dwa"), 0.2) + self.assertEqual(extract_number("miliardy lat starsze"), + 1000000000.0) + self.assertEqual(extract_number("sto tysięcy"), 100000) + self.assertEqual(extract_number("minus 2"), -2) + self.assertEqual(extract_number("ujemne siedemdziesiąt"), -70) + self.assertEqual(extract_number("tysiąc milionów"), 1000000000) + self.assertEqual(extract_number("sześć trzecich"), + 6 / 3) + self.assertEqual(extract_number("trzydzieści sekund"), 30) + self.assertEqual(extract_number("to jest miliardowy test", + ordinals=True), 1e09) + self.assertEqual(extract_number("to jest miliardowa część"), 1e-9) + + # Verify non-power multiples of ten no longer discard + # adjacent multipliers + self.assertEqual(extract_number("dwadzieścia tysięcy"), 20000) + self.assertEqual(extract_number("pięćdziesiąt milionów"), 50000000) + + # Verify smaller powers of ten no longer cause miscalculation of larger + # powers of ten (see MycroftAI#86) + self.assertEqual(extract_number("trzysta dwadzieścia miliardów trzysta milionów \ + dziewięćset pięćdziesiąt tysięcy sześćset \ + siedemdziesiąt pięć kropka osiem"), + 320300950675.8) + self.assertEqual(extract_number("dziewięćset dziewięćdziesiąt dziewięć milionów \ + dziewięćset dziewięćdziesiąt dziewięć tysięcy \ + dziewięćset 
dziewięćdziesiąt dziewięć przecinek dziewięć"), + 999999999.9) + + # TODO why does "trillion" result in xxxx.0? + self.assertEqual(extract_number("osiemset bilionów dwieście \ + pięćdziesiąt siedem"), 800000000000257.0) + + self.assertTrue(extract_number("Szybki gracz") is False) + self.assertTrue(extract_number("krejzi") is False) + + self.assertTrue(extract_number("krejzi zero") is not False) + self.assertEqual(extract_number("krejzi zero"), 0) + + self.assertTrue(extract_number("super 0") is not False) + self.assertEqual(extract_number("super 0"), 0) + + self.assertEqual(extract_number( + "jesteś drugi", ordinals=True), 2) + self.assertEqual(extract_number("całkowicie 100%"), 100) + + def test_extract_duration_pl(self): + self.assertEqual(extract_duration("10 sekund"), + (timedelta(seconds=10.0), "")) + self.assertEqual(extract_duration("5 minut"), + (timedelta(minutes=5), "")) + self.assertEqual(extract_duration("2 godziny"), + (timedelta(hours=2), "")) + self.assertEqual(extract_duration("3 dni"), + (timedelta(days=3), "")) + self.assertEqual(extract_duration("25 tygodni"), + (timedelta(weeks=25), "")) + self.assertEqual(extract_duration("siedem godzin"), + (timedelta(hours=7), "")) + self.assertEqual(extract_duration("7.5 sekundy"), + (timedelta(seconds=7.5), "")) + self.assertEqual(extract_duration("osiem i pół dnia trzydzieści dziewięć sekund", + lang='pl-pl'), + (timedelta(days=8.5, seconds=39), "")) + self.assertEqual(extract_duration("Ustaw stoper na 30 minut"), + (timedelta(minutes=30), "ustaw stoper na")) + self.assertEqual(extract_duration("Cztery i pół minuty do zachodu"), + (timedelta(minutes=4.5), "do zachodu")) + self.assertEqual(extract_duration("dziewiętnaście minut po pełnej godzinie"), + (timedelta(minutes=19), "po pełnej godzinie")) + self.assertEqual(extract_duration("obudź mnie za 3 tygodnie, czterysta dziewięćdziesiąt siedem dni i" + " trzysta 91.6 sekund"), + (timedelta(weeks=3, days=497, seconds=391.6), + "obudź mnie za , i")) + 
self.assertEqual(extract_duration("ten film trwa jedną godzinę, pięćdziesiąt siedem i pół minuty", + lang='pl-pl'), + (timedelta(hours=1, minutes=57.5), + "ten film trwa ,")) + self.assertEqual(extract_duration("10-sekund"), + (timedelta(seconds=10.0), "")) + self.assertEqual(extract_duration("5-minut"), + (timedelta(minutes=5), "")) + + def test_extractdatetime_pl(self): + def extractWithFormat(text): + date = datetime(2017, 6, 27, 13, 4) # Tue June 27, 2017 @ 1:04pm + print(text) # TODO Remove me + [extractedDate, leftover] = extract_datetime(text, date) + extractedDate = extractedDate.strftime("%Y-%m-%d %H:%M:%S") + return [extractedDate, leftover] + + def testExtract(text, expected_date, expected_leftover): + res = extractWithFormat(normalize(text)) + self.assertEqual(res[0], expected_date, "for=" + text) + self.assertEqual(res[1], expected_leftover, "for=" + text) + + testExtract("teraz jest czas", + "2017-06-27 13:04:00", "jest czas") + testExtract("za sekundę", + "2017-06-27 13:04:01", "") + testExtract("za minutę", + "2017-06-27 13:05:00", "") + testExtract("następna dekada", + "2027-06-27 00:00:00", "") + testExtract("za jeden wiek", + "2117-06-27 00:00:00", "") + testExtract("za jedno milenium", + "3017-06-27 00:00:00", "") + testExtract("za 5 dekad", + "2067-06-27 00:00:00", "") + testExtract("za 2 wieki", + "2217-06-27 00:00:00", "") + testExtract("za godzinę", + "2017-06-27 14:04:00", "") + testExtract("chcę to do godziny", + "2017-06-27 14:04:00", "chcę to") + testExtract("za 1 sekundę", + "2017-06-27 13:04:01", "") + testExtract("za 2 sekundy", + "2017-06-27 13:04:02", "") + testExtract("Nastaw zasadzkę na za minutę", + "2017-06-27 13:05:00", "nastaw zasadzkę") + testExtract("Nastaw zasadzkę na pół godziny", + "2017-06-27 13:34:00", "nastaw zasadzkę") + testExtract("Nastaw zasadzkę za 5 dni od dzisiaj", + "2017-07-02 00:00:00", "nastaw zasadzkę") + testExtract("pojutrze", + "2017-06-29 00:00:00", "") + testExtract("Jaka będzie pogoda pojutrze?", + 
"2017-06-29 00:00:00", "jaka będzie pogoda") + testExtract("Przypomnij mi o 10:45 po południu", + "2017-06-27 22:45:00", "przypomnij mi") + testExtract("Jaka będzie pogoda w piątek rano", + "2017-06-30 08:00:00", "jaka będzie pogoda") + testExtract("Jaka będzie pogoda jutro", + "2017-06-28 00:00:00", "jaka będzie pogoda") + testExtract("Jaka będzie pogoda dzisiaj po południu", + "2017-06-27 15:00:00", "jaka będzie pogoda") + testExtract("Jaka będzie pogoda dzisiaj wieczorem?", + "2017-06-27 19:00:00", "jaka będzie pogoda") + testExtract("jaka była pogoda dzisiaj rano", + "2017-06-27 08:00:00", "jaka była pogoda") + testExtract("przypomnij mi bym zadzwonił do mamy za 8 tygodni i 2 dni", + "2017-08-24 00:00:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy 3 Sierpnia", + "2017-08-03 00:00:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy jutro o 7 rano", + "2017-06-28 07:00:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi jutro bym zadzwonił do mamy o 9 w nocy", + "2017-06-28 21:00:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi jutro bym zadzwonił do mamy o 7 rano", + "2017-06-28 07:00:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy za godzinę", + "2017-06-27 14:04:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy o 1730", + "2017-06-27 17:30:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy o 0630", + "2017-06-28 06:30:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy o 7", + "2017-06-27 19:00:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy w czwartek o 7 wieczorem", + "2017-06-29 19:00:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy w Czwartek o 7 rano", + 
"2017-06-29 07:00:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy o 7 rano w Czwartek", + "2017-06-29 07:00:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy za 2 godziny", + "2017-06-27 15:04:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy za 15 minut", + "2017-06-27 13:19:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy za piętnaście minut", + "2017-06-27 13:19:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy za pół godziny", + "2017-06-27 13:34:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy o 10 rano 2 dni po Sobocie", + "2017-07-03 10:00:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Zagraj Rick Astley za 2 dni po Piątku", + "2017-07-02 00:00:00", "zagraj rick astley") + testExtract("Zacznij inwazję o 3:45 po południu", + "2017-06-27 15:45:00", "zacznij inwazję") + testExtract("W poniedziałek, zamów ciasto z piekarni", + "2017-07-03 00:00:00", "zamów ciasto z piekarni") + testExtract("Zagraj Wszystkiego Najlepszego za 5 lat od dzisiaj", + "2022-06-27 00:00:00", "zagraj wszystkiego najlepszego") + testExtract("Skype z Mamą o 12:45 w następny Czwartek", + "2017-07-06 12:45:00", "skype z mamą") + testExtract("Jaka będzie pogoda w następny Piątek", + "2017-06-30 00:00:00", "jaka będzie pogoda") + testExtract("Jaka będzie pogoda w następną Środę", + "2017-07-05 00:00:00", "jaka będzie pogoda") + testExtract("Jaka będzie pogoda w następny Czwartek", + "2017-07-06 00:00:00", "jaka będzie pogoda") + testExtract("Jaka będzie pogoda w następny piątek rano", + "2017-06-30 08:00:00", "jaka będzie pogoda") + testExtract("Jaka będzie pogoda w następny Piątek wieczorem", + "2017-06-30 19:00:00", "jaka będzie pogoda") + testExtract("Jaka będzie pogoda w następny Piątek po południu", + 
"2017-06-30 15:00:00", "jaka będzie pogoda") + testExtract("Przypomnij mi bym zadzwonił do mamy 3 Sierpnia", + "2017-08-03 00:00:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Kup fajerwerki 4 Lipca", + "2017-07-04 00:00:00", "kup fajerwerki") + testExtract("Jaka będzie pogoda za 2 tygodnie po następnym Piątku", + "2017-07-14 00:00:00", "jaka będzie pogoda") + testExtract("Jaka będzie pogoda w Środę o 7 rano", + "2017-06-28 07:00:00", "jaka będzie pogoda") + testExtract("Ustaw spotkanie na 12:45 w następny Czwartek", + "2017-07-06 12:45:00", "ustaw spotkanie") + testExtract("Jaka będzie pogoda w ten Czwartek", + "2017-06-29 00:00:00", "jaka będzie pogoda") + testExtract("Ustaw wizytę na za 2 tygodnie i 6 dni od Soboty", + "2017-07-21 00:00:00", "ustaw wizytę na") + testExtract("Zacznij inwazję o 03 45 w Czwartek", + "2017-06-29 03:45:00", "zacznij inwazję") + testExtract("Zacznij inwazję o 8 wieczorem w Czwartek", + "2017-06-29 20:00:00", "zacznij inwazję") + testExtract("Zacznij inwazję w Czwartek południe", + "2017-06-29 12:00:00", "zacznij inwazję") + testExtract("Zacznij inwazję w Czwartek o północy", + "2017-06-29 00:00:00", "zacznij inwazję") + testExtract("Przypomnij mi bym się obudził za 4 lata", + "2021-06-27 00:00:00", "przypomnij mi bym się obudził") + testExtract("Przypomnij mi bym się obudził za 4 lata i 4 dni", + "2021-07-01 00:00:00", "przypomnij mi bym się obudził") + testExtract("Jaka będzie pogoda za 3 dni od jutra", + "2017-07-01 00:00:00", "jaka będzie pogoda") + testExtract("grudzień trzeci", + "2017-12-03 00:00:00", "") + testExtract("Spotkajmy się o 8 wieczorem", + "2017-06-27 20:00:00", "spotkajmy się") + testExtract("Spotkajmy się o 5 po południu", + "2017-06-27 17:00:00", "spotkajmy się") + testExtract("Spotkajmy się o 8 rano", + "2017-06-28 08:00:00", "spotkajmy się") + testExtract("Przypomnij mi bym się obudził o 8 rano", + "2017-06-28 08:00:00", "przypomnij mi bym się obudził") + testExtract("Jaka będzie pogoda we Wtorek", + 
"2017-06-27 00:00:00", "jaka będzie pogoda") + testExtract("Jaka będzie pogoda w Poniedziałek", + "2017-07-03 00:00:00", "jaka będzie pogoda") + testExtract("Jaka będzie pogoda w środę", + "2017-06-28 00:00:00", "jaka będzie pogoda") + testExtract("w Czwartek jaka będzie pogoda", + "2017-06-29 00:00:00", "jaka będzie pogoda") + testExtract("w ten Czwartek jaka będzie pogoda", + "2017-06-29 00:00:00", "jaka będzie pogoda") + testExtract("Jaka była pogoda w ostatni Poniedziałek", + "2017-06-26 00:00:00", "jaka była pogoda") + testExtract("Ustaw alarm na Środę 8 wieczór", + "2017-06-28 20:00:00", "ustaw alarm") + testExtract("Ustaw alarm na Środę o trzeciej po południu", + "2017-06-28 15:00:00", "ustaw alarm") + testExtract("Ustaw alarm na Środę o 3 rano", + "2017-06-28 03:00:00", "ustaw alarm") + testExtract("Ustaw alarm na 7:00 wieczorem", + "2017-06-27 19:00:00", "ustaw alarm") + testExtract("5 czerwca 2017 wieczorem przypomnij mi bym" + + " zadzwonił do mamy", + "2017-06-05 19:00:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("dodaj do mojego kalendarza poranne spotkanie z Juliuszem" + + " czwartego Marca", + "2018-03-04 08:00:00", + "dodaj do mojego kalendarza spotkanie z juliuszem") + testExtract("Przypomnij mi bym zadzwonił do mamy w następny Wtorek", + "2017-07-04 00:00:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy za 3 tygodnie", + "2017-07-18 00:00:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy za 8 tygodni", + "2017-08-22 00:00:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy za 8 tygodni i 2 dni", + "2017-08-24 00:00:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy za 4 dni", + "2017-07-01 00:00:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy za 3 miesiące", + "2017-09-27 00:00:00", "przypomnij mi bym zadzwonił do 
mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy za 2 lata i 2 dni", + "2019-06-29 00:00:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy w następnym tygodniu", + "2017-07-04 00:00:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy o 10 rano w Sobotę", + "2017-07-01 10:00:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy o 10 rano w tę Sobotę", + "2017-07-01 10:00:00", "przypomnij mi bym zadzwonił do mamy") + testExtract("Przypomnij mi bym zadzwonił do mamy o 10 w następną Sobotę", + "2017-07-01 10:00:00", "przypomnij mi bym zadzwonił do mamy") + # test yesterday + testExtract("Jaki dzień był wczoraj", + "2017-06-26 00:00:00", "jaki dzień był") + testExtract("Jaki dzień był przedwczoraj", + "2017-06-25 00:00:00", "jaki dzień był") + testExtract("Miałem kolację wczoraj o 6", + "2017-06-26 06:00:00", "miałem kolację") + testExtract("Miałem kolację wczoraj o 6 rano", + "2017-06-26 06:00:00", "miałem kolację") + testExtract("Miałem kolację wczoraj o 6 wieczorem", + "2017-06-26 18:00:00", "miałem kolację") + + # Below two tests, ensure that time is picked + # even if no am/pm is specified + # in case of weekdays/tonight + # TODO imperfect as leaves "dzień robocze", but calculates time correctly + testExtract("Nastaw alarm na 9 w dni robocze", + "2017-06-27 21:00:00", "nastaw alarm dzień robocze") + testExtract("na 8 wieczorem", + "2017-06-27 20:00:00", "") + testExtract("na 8:30 wieczorem", + "2017-06-27 20:30:00", "") + # Tests a time with ':' & without am/pm + testExtract("nastaw alarm na 9:30 wieczorem", + "2017-06-27 21:30:00", "nastaw alarm") + testExtract("nastaw alarm na 9:00 wieczorem", + "2017-06-27 21:00:00", "nastaw alarm") + # Check if it picks the intent irrespective of correctness + testExtract("przypomnij mi o grze dzisiaj o 11:30 wieczorem", + "2017-06-27 23:30:00", "przypomnij mi o grze") + testExtract("ustaw 
alarm na 7:30 w dni robocze", + "2017-06-27 19:30:00", "ustaw alarm w dzień robocze") + + # "# days " + testExtract("moje urodziny są za 2 dni", + "2017-06-29 00:00:00", "moje urodziny są") + testExtract("moje urodziny są za 2 dni od dzisiaj", + "2017-06-29 00:00:00", "moje urodziny są") + testExtract("moje urodziny są za 2 dni od jutra", + "2017-06-30 00:00:00", "moje urodziny są") + testExtract("moje urodziny są 2 dni po jutrze", + "2017-06-30 00:00:00", "moje urodziny są") + testExtract("przypomnij mi żebym zadzwonił do mamy o 10 rano 2 dni po następnej Sobocie", + "2017-07-10 10:00:00", "przypomnij mi żebym zadzwonił do mamy") + testExtract("moje urodziny są za 2 dni od wczoraj", + "2017-06-28 00:00:00", "moje urodziny są") + + # "# days ago>" + testExtract("moje urodziny były 1 dzień temu", + "2017-06-26 00:00:00", "moje urodziny były") + testExtract("moje urodziny były 2 dni temu", + "2017-06-25 00:00:00", "moje urodziny były") + testExtract("moje urodziny były 3 dni temu", + "2017-06-24 00:00:00", "moje urodziny były") + testExtract("moje urodziny były 4 dni temu", + "2017-06-23 00:00:00", "moje urodziny były") + testExtract("spotkajmy się w nocy", + "2017-06-27 22:00:00", "spotkajmy się") + testExtract("jaka będzie pogoda jutro w nocy", + "2017-06-28 22:00:00", "jaka będzie pogoda") + testExtract("jaka będzie pogoda w następny Wtorek nocy", + "2017-07-04 22:00:00", "jaka będzie pogoda") + + def test_extract_ambiguous_time_pl(self): + morning = datetime(2017, 6, 27, 8, 1, 2) + evening = datetime(2017, 6, 27, 20, 1, 2) + noonish = datetime(2017, 6, 27, 12, 1, 2) + self.assertEqual( + extract_datetime('nakarm rybę'), None) + self.assertEqual( + extract_datetime('dzień'), None) + self.assertEqual( + extract_datetime('tydzień'), None) + self.assertEqual( + extract_datetime('miesiąc'), None) + self.assertEqual( + extract_datetime('rok'), None) + self.assertEqual( + extract_datetime(' '), None) + + def test_extract_relativedatetime_pl(self): + def 
extractWithFormat(text): + date = datetime(2017, 6, 27, 10, 1, 2) + [extractedDate, leftover] = extract_datetime(text, date) + extractedDate = extractedDate.strftime("%Y-%m-%d %H:%M:%S") + return [extractedDate, leftover] + + def testExtract(text, expected_date, expected_leftover): + res = extractWithFormat(normalize(text)) + self.assertEqual(res[0], expected_date, "for=" + text) + self.assertEqual(res[1], expected_leftover, "for=" + text) + + testExtract("spotkajmy się za 5 minut", + "2017-06-27 10:06:02", "spotkajmy się") + testExtract("spotkajmy się za 5minut", + "2017-06-27 10:06:02", "spotkajmy się") + testExtract("spotkajmy się za 5 sekund", + "2017-06-27 10:01:07", "spotkajmy się") + testExtract("spotkajmy się za 1 godzinę", + "2017-06-27 11:01:02", "spotkajmy się") + testExtract("spotkajmy się za 2 godziny", + "2017-06-27 12:01:02", "spotkajmy się") + testExtract("spotkajmy się za 2godziny", + "2017-06-27 12:01:02", "spotkajmy się") + testExtract("spotkajmy się za 1 minutę", + "2017-06-27 10:02:02", "spotkajmy się") + testExtract("spotkajmy się za 1 sekundę", + "2017-06-27 10:01:03", "spotkajmy się") + testExtract("spotkajmy się za 5sekund", + "2017-06-27 10:01:07", "spotkajmy się") + + def test_spaces(self): + self.assertEqual(normalize(" to jest test"), + "to jest test") + self.assertEqual(normalize(" to jest test "), + "to jest test") + self.assertEqual(normalize(" to jest jeden test"), + "to jest 1 test") + + def test_numbers(self): + self.assertEqual(normalize("to jest jeden dwa trzy test"), + "to jest 1 2 3 test") + self.assertEqual(normalize(" to jest cztery pięć sześć test"), + "to jest 4 5 6 test") + self.assertEqual(normalize("to jest dziesięć jedenaście dwanaście test"), + "to jest 10 11 12 test") + self.assertEqual(normalize("to jest osiemnaście dziewiętnaście dwadzieścia"), + "to jest 18 19 20") + self.assertEqual(normalize("to jest jeden dziewiętnaście dwadzieścia dwa"), + "to jest 1 19 20 2") + self.assertEqual(normalize("to jest jeden dwa 
dwadzieścia dwa"), + "to jest 1 2 20 2") + self.assertEqual(normalize("to jest jeden i pół"), + "to jest 1 pół") + self.assertEqual(normalize("to jest jeden i pół i pięć sześć"), + "to jest 1 pół 5 6") + + def test_multiple_numbers(self): + self.assertEqual(extract_numbers("to jest jeden dwa trzy test"), + [1.0, 2.0, 3.0]) + self.assertEqual(extract_numbers("to jest cztery pięć sześć test"), + [4.0, 5.0, 6.0]) + self.assertEqual(extract_numbers("to jest dziesięć jedenaście dwanaście test"), + [10.0, 11.0, 12.0]) + self.assertEqual(extract_numbers("to jest jeden dwadzieścia jeden test"), + [1.0, 21.0]) + self.assertEqual(extract_numbers("1 pies, siedem świń, macdonald miał " + "farmę, 3 razy 5 macarena"), + [1, 7, 3, 5]) + self.assertEqual(extract_numbers("dwa piwa dwa wina"), + [2.0, 2.0]) + self.assertEqual(extract_numbers("dwadzieścia 20 dwadzieścia"), + [20, 20, 20]) + self.assertEqual(extract_numbers("dwadzieścia 20 22"), + [20.0, 20.0, 22.0]) + self.assertEqual(extract_numbers("dwadzieścia dwadzieścia dwa dwadzieścia"), + [20.0, 22.0, 20.0]) + self.assertEqual(extract_numbers("dwadzieścia 2"), + [22.0]) + self.assertEqual(extract_numbers("dwadzieścia 20 dwadzieścia 2"), + [20, 20, 22]) + self.assertEqual(extract_numbers("jedna trzecia jeden"), + [1 / 3, 1]) + self.assertEqual(extract_numbers("trzeci", ordinals=True), [3]) + self.assertEqual(extract_numbers("sześć trylionów"), + [6e18]) + self.assertEqual(extract_numbers("dwie świnie i sześć bilionów bakterii", + lang='pl-pl'), [2, 6e12]) + self.assertEqual(extract_numbers("trzydziesty drugi lub pierwszy", + ordinals=True), [32, 1]) + self.assertEqual(extract_numbers("to jest siedem osiem dziewięć i" + " pół test"), + [7.0, 8.0, 9.5]) + + +if __name__ == "__main__": + unittest.main()