From 078b7d4f3ace6c59b74a9d74ae27e48597fc0208 Mon Sep 17 00:00:00 2001 From: jarbasai Date: Wed, 19 Jan 2022 11:58:51 +0000 Subject: [PATCH] port/azerbeijan https://github.com/MycroftAI/lingua-franca/pull/212 --- lingua_franca/internal.py | 7 +- lingua_franca/lang/common_data_az.py | 296 ++++ lingua_franca/lang/format_az.py | 402 ++++++ lingua_franca/lang/parse_az.py | 1227 +++++++++++++++++ lingua_franca/res/text/az-az/and.word | 1 + lingua_franca/res/text/az-az/date_time.json | 130 ++ .../res/text/az-az/date_time_test.json | 43 + lingua_franca/res/text/az-az/day.word | 1 + lingua_franca/res/text/az-az/days.word | 1 + lingua_franca/res/text/az-az/hour.word | 1 + lingua_franca/res/text/az-az/hours.word | 1 + lingua_franca/res/text/az-az/minute.word | 1 + lingua_franca/res/text/az-az/minutes.word | 1 + lingua_franca/res/text/az-az/normalize.json | 45 + lingua_franca/res/text/az-az/or.word | 1 + lingua_franca/res/text/az-az/second.word | 1 + lingua_franca/res/text/az-az/seconds.word | 1 + test/test_format_az.py | 549 ++++++++ test/test_parse_az.py | 429 ++++++ 19 files changed, 3135 insertions(+), 3 deletions(-) create mode 100644 lingua_franca/lang/common_data_az.py create mode 100644 lingua_franca/lang/format_az.py create mode 100644 lingua_franca/lang/parse_az.py create mode 100644 lingua_franca/res/text/az-az/and.word create mode 100644 lingua_franca/res/text/az-az/date_time.json create mode 100644 lingua_franca/res/text/az-az/date_time_test.json create mode 100644 lingua_franca/res/text/az-az/day.word create mode 100644 lingua_franca/res/text/az-az/days.word create mode 100644 lingua_franca/res/text/az-az/hour.word create mode 100644 lingua_franca/res/text/az-az/hours.word create mode 100644 lingua_franca/res/text/az-az/minute.word create mode 100644 lingua_franca/res/text/az-az/minutes.word create mode 100644 lingua_franca/res/text/az-az/normalize.json create mode 100644 lingua_franca/res/text/az-az/or.word create mode 100644 
lingua_franca/res/text/az-az/second.word create mode 100644 lingua_franca/res/text/az-az/seconds.word create mode 100644 test/test_format_az.py create mode 100644 test/test_parse_az.py diff --git a/lingua_franca/internal.py b/lingua_franca/internal.py index bb2e04a2..b9e19609 100644 --- a/lingua_franca/internal.py +++ b/lingua_franca/internal.py @@ -9,16 +9,17 @@ from lingua_franca.time import to_local -_SUPPORTED_LANGUAGES = ("ca", "cs", "da", "de", "en", "es", "fr", "hu", +_SUPPORTED_LANGUAGES = ("az", "ca", "cs", "da", "de", "en", "es", "fr", "hu", "it", "nl", "pl", "pt", "ru", "sl", "sv", "fa") -_SUPPORTED_FULL_LOCALIZATIONS = ("ca-es", "cs-cz", "da-dk", "de-de", +_SUPPORTED_FULL_LOCALIZATIONS = ("az-az", "ca-es", "cs-cz", "da-dk", "de-de", "en-au", "en-us", "es-es", "fr-fr", "hu-hu", "it-it", "nl-nl", "pl-pl", "fa-ir", "pt-pt", "ru-ru", "sl-si", "sv-se", "tr-tr") -_DEFAULT_FULL_LANG_CODES = {'ca': 'ca-es', +_DEFAULT_FULL_LANG_CODES = {'az': 'az-az', + 'ca': 'ca-es', 'cs': 'cs-cz', 'da': 'da-dk', 'de': 'de-de', diff --git a/lingua_franca/lang/common_data_az.py b/lingua_franca/lang/common_data_az.py new file mode 100644 index 00000000..5a96e130 --- /dev/null +++ b/lingua_franca/lang/common_data_az.py @@ -0,0 +1,296 @@ +# +# Copyright 2021 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#
from collections import OrderedDict
from .parse_common import invert_dict

# Azerbaijani warning text: "the requested function is not carried out in
# Azerbaijani".
_FUNCTION_NOT_IMPLEMENTED_WARNING = "Tələb olunan funksiya Azərbaycan dilində yerinə yetirilmir."

# Cardinal numbers that have a dedicated spoken form: 0-19 and the tens.
_NUM_STRING_AZ = {
    0: 'sıfır',
    1: 'bir',
    2: 'iki',
    3: 'üç',
    4: 'dörd',
    5: 'beş',
    6: 'altı',
    7: 'yeddi',
    8: 'səkkiz',
    9: 'doqquz',
    10: 'on',
    11: 'on bir',
    12: 'on iki',
    13: 'on üç',
    14: 'on dörd',
    15: 'on beş',
    16: 'on altı',
    17: 'on yeddi',
    18: 'on səkkiz',
    19: 'on doqquz',
    20: 'iyirmi',
    30: 'otuz',
    40: 'qırx',
    50: 'əlli',
    60: 'altmış',
    70: 'yetmiş',
    80: 'səksən',
    90: 'doxsan'
}

# Denominator words used when speaking a fraction ("dörddə üç" = 3/4).
_FRACTION_STRING_AZ = {
    2: 'ikidə',
    3: 'üçdə',
    4: 'dörddə',
    5: 'beşdə',
    6: 'altıda',
    7: 'yeddidə',
    8: 'səkkizdə',
    9: 'doqquzda',
    10: 'onda',
    11: 'on birdə',
    12: 'on ikidə',
    13: 'on üçdə',
    14: 'on dörddə',
    15: 'on beşdə',
    16: 'on altıda',
    17: 'on yeddidə',
    18: 'on səkkizdə',
    19: 'on doqquzda',
    20: 'iyirmidə',
    30: 'otuzda',
    40: 'qırxda',
    50: 'əllidə',
    60: 'altmışda',
    70: 'yetmişdə',
    80: 'səksəndə',
    90: 'doxsanda',
    1e2: 'yüzdə',
    1e3: 'mində'
}


# Scale words, long scale (each new name is a factor of 10**6 above the last).
_LONG_SCALE_AZ = OrderedDict([
    (100, 'yüz'),
    (1000, 'min'),
    (1000000, 'milyon'),
    (1e12, "milyard"),
    (1e18, 'trilyon'),
    (1e24, "kvadrilyon"),
    (1e30, "kvintilyon"),
    (1e36, "sekstilyon"),
    (1e42, "septilyon"),
    (1e48, "oktilyon"),
    (1e54, "nonilyon"),
    (1e60, "dekilyon")
])


# Scale words, short scale (each new name is a factor of 10**3 above the last).
_SHORT_SCALE_AZ = OrderedDict([
    (100, 'yüz'),
    (1000, 'min'),
    (1000000, 'milyon'),
    (1e9, "milyard"),
    (1e12, 'trilyon'),
    (1e15, "kvadrilyon"),
    (1e18, "kvintilyon"),
    (1e21, "sekstilyon"),
    (1e24, "septilyon"),
    (1e27, "oktilyon"),
    (1e30, "nonilyon"),
    (1e33, "dekilyon")
])

# Ordinal forms shared by both scales.
_ORDINAL_BASE_AZ = {
    1: 'birinci',
    2: 'ikinci',
    3: 'üçüncü',
    4: 'dördüncü',
    5: 'beşinci',
    6: 'altıncı',
    7: 'yeddinci',
    8: 'səkkizinci',
    9: 'doqquzuncu',
    10: 'onuncu',
    11: 'on birinci',
    12: 'on ikinci',
    13: 'on üçüncü',
    14: 'on dördüncü',
    15: 'on beşinci',
    16: 'on altıncı',
    17: 'on yeddinci',
    18: 'on səkkizinci',
    19: 'on doqquzuncu',
    20: 'iyirminci',
    30: 'otuzuncu',
    40: "qırxıncı",
    50: "əllinci",
    60: "altmışıncı",
    # 'yetmiş' has front vowels, so the suffix is '-inci' (was "yetmışinci").
    70: "yetmişinci",
    80: "səksəninci",
    # ordinal suffix is '-ıncı' with 'c', not 'ç' (was "doxsanınçı").
    90: "doxsanıncı",
    1e2: "yüzüncü",
    1e3: "mininci"
}

_SHORT_ORDINAL_AZ = {
    1e6: "milyonuncu",
    1e9: "milyardıncı",
    1e12: "trilyonuncu",
    1e15: "kvadrilyonuncu",
    1e18: "kvintilyonuncu",
    1e21: "sekstilyonuncu",
    1e24: "septilyonuncu",
    1e27: "oktilyonuncu",
    1e30: "nonilyonuncu",
    1e33: "dekilyonuncu"
    # TODO > 1e33
}
_SHORT_ORDINAL_AZ.update(_ORDINAL_BASE_AZ)


_LONG_ORDINAL_AZ = {
    1e6: "milyonuncu",
    1e12: "milyardıncı",
    1e18: "trilyonuncu",
    1e24: "kvadrilyonuncu",
    1e30: "kvintilyonuncu",
    1e36: "sekstilyonuncu",
    1e42: "septilyonuncu",
    1e48: "oktilyonuncu",
    1e54: "nonilyonuncu",
    1e60: "dekilyonuncu"
    # TODO > 1e60
}
_LONG_ORDINAL_AZ.update(_ORDINAL_BASE_AZ)


# negate next number (-2 = 0 - 2)
_NEGATIVES_AZ = {"mənfi", "minus"}

# sum the next number (iyirmi iki = 20 + 2)
_SUMS_AZ = {'on', '10', 'iyirmi', '20', 'otuz', '30', 'qırx', '40', 'əlli', '50',
            'altmış', '60', 'yetmiş', '70', 'səksən', '80', 'doxsan', '90'}

# Vowel classes used for suffix vowel harmony.
_HARD_VOWELS = ['a', 'ı', 'o', 'u']
_SOFT_VOWELS = ['e', 'ə', 'i', 'ö', 'ü']
_VOWELS = _HARD_VOWELS + _SOFT_VOWELS


def _get_last_vowel(word):
    """Find the last vowel of ``word``.

    Args:
        word (str): word to inspect (lower-case vowels are recognised)

    Returns:
        (str, bool): the last vowel and whether that vowel is also the final
        character of the word; ``("", False)`` when a non-empty word has no
        vowel and ``("", True)`` for an empty word.
    """
    is_last = True
    for char in word[::-1]:
        if char in _VOWELS:
            return char, is_last
        # any non-vowel seen first means the vowel is not word-final
        is_last = False

    return "", is_last


def _last_vowel_type(word):
    """Return True when the last vowel of ``word`` is a hard (back) vowel."""
    return _get_last_vowel(word)[0] in _HARD_VOWELS


def _get_ordinal_ak(word):
    """Return the ordinal suffix that harmonises with ``word``.

    The suffix vowel follows the last vowel of the word (a/ı -> ı, e/ə/i -> i,
    o/u -> u, ö/ü -> ü).  When the word ends in that vowel the short form
    ("ncı", "nci", "ncu", "ncü") is used, otherwise the long form
    ("ıncı", "inci", "uncu", "üncü").

    Args:
        word (str): cardinal/scale word, e.g. "milyon" or "min"

    Returns:
        str: the suffix, or "" when the word contains no vowel.
    """
    last_vowel, is_last = _get_last_vowel(word)
    if not last_vowel:
        return ""

    # Membership tests: the previous `last_vowel == [...]` comparisons were
    # always False, so soft-vowel words fell through and returned None.
    for group, suffix_vowel in (("aı", "ı"), ("eəi", "i"),
                                ("ou", "u"), ("öü", "ü")):
        if last_vowel in group:
            short_form = "nc" + suffix_vowel
            return short_form if is_last else suffix_vowel + short_form

    # not reachable for the vowels listed in _VOWELS; kept as a safe default
    return ""


def _get_full_time_ak(hour):
    """Return the suffix attached to the spoken hour word for ``hour``.

    Used for phrases of the form "<hour><suffix> N dəqiqə işləyib/qalıb"
    (for example "altı" + "ya").  Hours not listed explicitly get "ya".
    """
    if hour in [1, 3, 4, 5, 8, 11]:
        return "ə"
    if hour in [2, 7, 12]:
        return "yə"
    if hour in [9, 10]:
        return "a"
    return "ya"


def _get_half_time_ak(hour):
    """Return the suffix attached to the spoken hour word in the half-hour
    phrase "<hour><suffix> yarısı" (for example "altı" + "nın").

    Hours not listed explicitly get "nın".
    """
    if hour in [1, 5, 8, 11]:
        return "in"
    if hour in [2, 7, 12]:
        return "nin"
    if hour in [3, 4]:
        return "ün"
    if hour in [9, 10]:
        return "un"
    return "nın"


def _get_daytime(hour):
    """Map a 24-hour clock hour to a part-of-day word.

    0-5 -> "gecə", 6-11 -> "səhər", 12-17 -> "gündüz", otherwise "axşam".
    """
    if hour < 6:
        return "gecə"
    if hour < 12:
        return "səhər"
    if hour < 18:
        return "gündüz"
    return "axşam"


def _generate_plurals_az(originals):
    """
    Return a new set or dict containing the plural form of the original values,

    In Azerbaijani this means appending 'lar' or 'lər' to them according to the last vowel in word.

    Args:
        originals set(str) or dict(str, any): values to pluralize

    Returns:
        set(str) or dict(str, any)

    """

    if isinstance(originals, dict):
        return {key + ('lar' if _last_vowel_type(key) else 'lər'): value
                for key, value in originals.items()}
    return {value + ('lar' if _last_vowel_type(value) else 'lər')
            for value in originals}


# Words that multiply the number before them.  Only the singular scale words
# are included (the set used to be unioned with itself, which added nothing).
_MULTIPLIES_LONG_SCALE_AZ = set(_LONG_SCALE_AZ.values())

_MULTIPLIES_SHORT_SCALE_AZ = set(_SHORT_SCALE_AZ.values())

# split sentence parse separately and sum ( 2 and a half = 2 + 0.5 )
_FRACTION_MARKER_AZ = {"və"}

# decimal marker ( 1 nöqtə 5 = 1 + 0.5)
_DECIMAL_MARKER_AZ = {"nöqtə"}

# Reverse lookup: spoken cardinal -> value.
_STRING_NUM_AZ = invert_dict(_NUM_STRING_AZ)

# Single-word spoken fractions.
_SPOKEN_EXTRA_NUM_AZ = {
    "yarım": 0.5,
    "üçdəbir": 1 / 3,
    "dörddəbir": 1 / 4
}

# Reverse lookups: spoken ordinal -> value.
_STRING_SHORT_ORDINAL_AZ = invert_dict(_SHORT_ORDINAL_AZ)
_STRING_LONG_ORDINAL_AZ = invert_dict(_LONG_ORDINAL_AZ)

# ---------------------------------------------------------------------------
# Patch metadata that shared this source line: start of the next file.
# diff --git a/lingua_franca/lang/format_az.py b/lingua_franca/lang/format_az.py
# new file mode 100644
# index 00000000..ea4a4f9f
# --- /dev/null
# +++ b/lingua_franca/lang/format_az.py
# @@ -0,0 +1,402 @@
# -*- coding: utf-8 -*-
#
# Copyright 2021 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import datetime

from lingua_franca.lang.format_common import convert_to_mixed_fraction
from lingua_franca.lang.common_data_az import _NUM_STRING_AZ, _get_ordinal_ak, _get_daytime, \
    _FRACTION_STRING_AZ, _LONG_SCALE_AZ, _SHORT_SCALE_AZ, _SHORT_ORDINAL_AZ, _LONG_ORDINAL_AZ, \
    _get_full_time_ak, _get_half_time_ak


def nice_number_az(number, speech=True, denominators=range(1, 21)):
    """ Azerbaijani helper for nice_number

    This function formats a float to human understandable functions. Like
    4.5 becomes "4 yarım" for speech and "4 1/2" for text

    Args:
        number (int or float): the float to format
        speech (bool): format for speech (True) or display (False)
        denominators (iter of ints): denominators to use, default [1 .. 20]
    Returns:
        (str): The formatted string.
    """

    result = convert_to_mixed_fraction(number, denominators)
    if not result:
        # Give up, just represent as a 3 decimal number
        return str(round(number, 3))

    whole, num, den = result

    if not speech:
        if num == 0:
            # TODO: Number grouping? E.g. "1,000,000"
            return str(whole)
        return '{} {}/{}'.format(whole, num, den)

    if num == 0:
        return str(whole)
    # spoken denominator comes first: "dörddə üç" = 3/4
    den_str = _FRACTION_STRING_AZ[den]
    if whole == 0:
        if den == 2:
            return 'yarım'
        return '{} {}'.format(den_str, num)
    if den == 2:
        return '{} yarım'.format(whole)
    return '{} və {} {}'.format(whole, den_str, num)


def pronounce_number_az(number, places=2, short_scale=True, scientific=False,
                        ordinals=False):
    """
    Convert a number to its spoken equivalent

    For example, '5.2' would return 'beş nöqtə iki'

    Args:
        number (float or int): the number to pronounce
        places (int): maximum decimal places to speak
        short_scale (bool): use short (True) or long scale (False)
            https://en.wikipedia.org/wiki/Names_of_large_numbers
        scientific (bool): pronounce in scientific notation
        ordinals (bool): pronounce in ordinal form "first" instead of "one"
    Returns:
        (str): The pronounced number
    """
    num = number
    # deal with infinity
    if num == float("inf"):
        return "sonsuzluq"
    elif num == float("-inf"):
        return "mənfi sonsuzluq"
    if scientific:
        number = '%E' % num
        n, power = number.replace("+", "").split("E")
        power = int(power)
        if power != 0:
            if ordinals:
                # This handles negatives of powers separately from the normal
                # handling since each call disables the scientific flag
                return '{}{} vurulsun on üstü {}{}'.format(
                    'mənfi ' if float(n) < 0 else '',
                    pronounce_number_az(
                        abs(float(n)), places, short_scale, False, ordinals=False),
                    'mənfi ' if power < 0 else '',
                    pronounce_number_az(abs(power), places, short_scale, False, ordinals=True))
            else:
                # This handles negatives of powers separately from the normal
                # handling since each call disables the scientific flag
                return '{}{} vurulsun on üstü {}{}'.format(
                    'mənfi ' if float(n) < 0 else '',
                    pronounce_number_az(
                        abs(float(n)), places, short_scale, False),
                    'mənfi ' if power < 0 else '',
                    pronounce_number_az(abs(power), places, short_scale, False))

    # value -> word table for the requested scale
    if short_scale:
        number_names = _NUM_STRING_AZ.copy()
        number_names.update(_SHORT_SCALE_AZ)
    else:
        number_names = _NUM_STRING_AZ.copy()
        number_names.update(_LONG_SCALE_AZ)

    digits = [number_names[n] for n in range(0, 20)]

    tens = [number_names[n] for n in range(10, 100, 10)]

    # scale words in ascending order: index 0 is "yüz", index 1 is "min", ...
    if short_scale:
        hundreds = [_SHORT_SCALE_AZ[n] for n in _SHORT_SCALE_AZ.keys()]
    else:
        hundreds = [_LONG_SCALE_AZ[n] for n in _LONG_SCALE_AZ.keys()]

    # deal with negatives
    result = ""
    if num < 0:
        # result = "mənfi " if scientific else "minus "
        result = "mənfi "
    num = abs(num)

    # check for a direct match
    if num in number_names and not ordinals:
        if num > 1000:
            # scale words above "min" are spoken with an explicit "bir"
            result += "bir "
        result += number_names[num]
    else:
        def _sub_thousand(n, ordinals=False):
            # speak 0..999, optionally ending in an ordinal form
            assert 0 <= n <= 999
            if n in _SHORT_ORDINAL_AZ and ordinals:
                return _SHORT_ORDINAL_AZ[n]
            if n <= 19:
                return digits[n]
            elif n <= 99:
                q, r = divmod(n, 10)
                return tens[q - 1] + (" " + _sub_thousand(r, ordinals) if r
                                      else "")
            else:
                q, r = divmod(n, 100)
                return (digits[q] + " " if q != 1 else "") + "yüz" + (
                    " " + _sub_thousand(r, ordinals) if r else "")

        def _short_scale(n):
            if n >= 999*max(_SHORT_SCALE_AZ.keys()):
                return "sonsuzluq"
            ordi = ordinals

            if int(n) != n:
                # ordinals only make sense for whole numbers
                ordi = False
            n = int(n)
            assert 0 <= n
            res = []
            # groups of three digits, least significant first
            for i, z in enumerate(_split_by(n, 1000)):
                if not z:
                    continue

                number = _sub_thousand(z, not i and ordi)

                if i:
                    if i >= len(hundreds):
                        return ""
                    number += " "
                    if ordi:

                        if i * 1000 in _SHORT_ORDINAL_AZ:
                            if z == 1:
                                number = _SHORT_ORDINAL_AZ[i * 1000]
                            else:
                                number += _SHORT_ORDINAL_AZ[i * 1000]
                        else:
                            if n not in _SHORT_SCALE_AZ:
                                num = int("1" + "0" * (len(str(n)) - 2))

                                number += _SHORT_SCALE_AZ[num] + _get_ordinal_ak(_SHORT_SCALE_AZ[num])
                            else:
                                number = _SHORT_SCALE_AZ[n] + _get_ordinal_ak(_SHORT_SCALE_AZ[n])
                    else:
                        number += hundreds[i]
                        # one thousand is spoken "min", not "bir min"
                        if number.startswith("bir min"):
                            number = number[4:]
                res.append(number)
                ordi = False

            return ", ".join(reversed(res))

        def _split_by(n, split=1000):
            # split n into base-`split` groups, least significant first
            assert 0 <= n
            res = []
            while n:
                n, r = divmod(n, split)
                res.append(r)
            return res

        def _long_scale(n):
            if n >= max(_LONG_SCALE_AZ.keys()):
                return "sonsuzluq"
            ordi = ordinals
            if int(n) != n:
                ordi = False
            n = int(n)
            assert 0 <= n
            res = []
            for i, z in enumerate(_split_by(n, 1000000)):
                if not z:
                    continue
                number = pronounce_number_az(z, places, True, scientific,
                                             ordinals=ordi and not i)
                # strip off the comma after the thousand
                if i:
                    if i >= len(hundreds):
                        return ""
                    # plus one as we skip 'thousand'
                    # (and 'hundred', but this is excluded by index value)
                    number = number.replace(',', '')

                    if ordi:
                        if i * 1000000 in _LONG_ORDINAL_AZ:
                            if z == 1:
                                number = _LONG_ORDINAL_AZ[
                                    (i + 1) * 1000000]
                            else:
                                number += _LONG_ORDINAL_AZ[
                                    (i + 1) * 1000000]
                        else:
                            if n not in _LONG_SCALE_AZ:
                                num = int("1" + "0" * (len(str(n)) - 2))

                                number += " " + _LONG_SCALE_AZ[
                                    num] + _get_ordinal_ak(_LONG_SCALE_AZ[num])
                            else:
                                number = " " + _LONG_SCALE_AZ[n] + _get_ordinal_ak(_LONG_SCALE_AZ[n])
                    else:

                        number += " " + hundreds[i + 1]
                res.append(number)
            return ", ".join(reversed(res))

        if short_scale:
            result += _short_scale(num)
        else:
            result += _long_scale(num)

    # deal with scientific notation unpronounceable as number
    if not result and "e" in str(num):
        return pronounce_number_az(num, places, short_scale, scientific=True)
    # Deal with fractional part
    elif not num == int(num) and places > 0:
        if abs(num) < 1.0 and (result == "mənfi " or not result):
            result += "sıfır"
        result += " nöqtə"
        _num_str = str(num)
        _num_str = _num_str.split(".")[1][0:places]
        for char in _num_str:
            result += " " + number_names[int(char)]
    return result


def nice_time_az(dt, speech=True, use_24hour=False, use_ampm=False):
    """
    Format a time to a comfortable human format
    For example, generate 'altının yarısı' for speech or '5:30' for
    text display.
    Args:
        dt (datetime): date to format (assumes already in local timezone)
        speech (bool): format for speech (default/True) or display (False)
        use_24hour (bool): output in 24-hour/military or 12-hour format
        use_ampm (bool): prefix the part of day ("gecə", "səhər", "gündüz",
            "axşam") in 12-hour format
    Returns:
        (str): The formatted time string
    """
    if use_24hour:
        # e.g. "03:01" or "14:22"
        string = dt.strftime("%H:%M")
    else:
        # e.g. "3:01" or "2:22"
        string = dt.strftime("%I:%M")
        if string[0] == '0':
            string = string[1:]  # strip leading zeros
        if use_ampm:
            # e.g. "gecə 3:01" or "gündüz 2:22"
            string = _get_daytime(dt.hour) + " " + string

    if not speech:
        return string

    # Generate a speakable version of the time
    if use_24hour:
        speak = ""

        # Either "0 8" or "13"
        if string[0] == '0':
            speak += pronounce_number_az(int(string[0])) + " "
            speak += pronounce_number_az(int(string[1]))
        else:
            speak = pronounce_number_az(int(string[0:2]))

        speak += " "
        if string[3] == '0':
            speak += pronounce_number_az(0) + " "
            speak += pronounce_number_az(int(string[4]))
        else:
            speak += pronounce_number_az(int(string[3:5]))
        return speak

    hour = dt.hour % 12 or 12  # 12 hour clock and 0 is spoken as 12
    # every phrase except the exact hour is expressed relative to the NEXT hour
    next_hour = (dt.hour + 1) % 12 or 12
    speak = ""
    if use_ampm:
        speak += _get_daytime(dt.hour) + " "

    if dt.minute == 0:
        speak += "{} tamamdır".format(pronounce_number_az(hour))
    elif dt.minute < 30:
        # minutes already elapsed towards the next hour
        speak += "{}{} {} dəqiqə işləyib".format(
            pronounce_number_az(next_hour), _get_full_time_ak(next_hour),
            pronounce_number_az(dt.minute))
    elif dt.minute == 30:
        speak += "{}{} yarısı".format(
            pronounce_number_az(next_hour), _get_half_time_ak(next_hour))
    else:
        # minutes remaining until the next hour ("qalıb" = remaining);
        # this was `dt.minute - 30`, which is only right at :45
        speak += "{}{} {} dəqiqə qalıb".format(
            pronounce_number_az(next_hour), _get_full_time_ak(next_hour),
            pronounce_number_az(60 - dt.minute))

    return speak


def nice_duration_az(duration, speech=True):
    """ Convert duration in seconds to a nice spoken timespan

    Examples:
       duration = 60  ->  "1:00" or "bir dəqiqə"
       duration = 163  ->  "2:43" or "iki dəqiqə qırx üç saniyə"

    Args:
        duration: time, in seconds (a datetime.timedelta is also accepted)
        speech (bool): format for speech (True) or display (False)

    Returns:
        str: timespan as a string
    """

    if isinstance(duration, datetime.timedelta):
        duration = duration.total_seconds()

    # Do traditional rounding: 2.5->3, 3.5->4, plus this
    # helps in a few cases of where calculations generate
    # times like 2:59:59.9 instead of 3:00.
    duration += 0.5

    days = int(duration // 86400)
    hours = int(duration // 3600 % 24)
    minutes = int(duration // 60 % 60)
    seconds = int(duration % 60)

    if speech:
        out = ""
        if days > 0:
            out += pronounce_number_az(days) + " "
            out += "gün"
        if hours > 0:
            if out:
                out += " "
            out += pronounce_number_az(hours) + " "
            out += "saat"
        if minutes > 0:
            if out:
                out += " "
            out += pronounce_number_az(minutes) + " "
            out += "dəqiqə"
        if seconds > 0:
            if out:
                out += " "
            out += pronounce_number_az(seconds) + " "
            out += "saniyə"
    else:
        # M:SS, MM:SS, H:MM:SS, Dd H:MM:SS format
        out = ""
        if days > 0:
            out = str(days) + "g "
        if hours > 0 or days > 0:
            out += str(hours) + ":"
        if minutes < 10 and (hours > 0 or days > 0):
            out += "0"
        out += str(minutes) + ":"
        if seconds < 10:
            out += "0"
        out += str(seconds)

    return out

# ---------------------------------------------------------------------------
# Patch metadata that shared this source line: start of the next file.
# \ No newline at end of file
# diff --git a/lingua_franca/lang/parse_az.py b/lingua_franca/lang/parse_az.py
# new file mode 100644
# index 00000000..8c1c81aa
# --- /dev/null
# +++ b/lingua_franca/lang/parse_az.py
# @@ -0,0 +1,1227 @@
#
# Copyright 2017 Mycroft AI Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import List +from datetime import datetime, timedelta +from dateutil.relativedelta import relativedelta + +from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \ + invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer +from lingua_franca.lang.common_data_az import _LONG_SCALE_AZ, \ + _SHORT_SCALE_AZ, _NEGATIVES_AZ, _SUMS_AZ, _MULTIPLIES_LONG_SCALE_AZ, \ + _MULTIPLIES_SHORT_SCALE_AZ, _FRACTION_MARKER_AZ, _DECIMAL_MARKER_AZ, \ + _STRING_NUM_AZ, _STRING_SHORT_ORDINAL_AZ, _STRING_LONG_ORDINAL_AZ, \ + _FRACTION_STRING_AZ, _generate_plurals_az, _SPOKEN_EXTRA_NUM_AZ + +import re +import json +from lingua_franca.internal import resolve_resource_file + +def _convert_words_to_numbers_az(text, short_scale=True, ordinals=False): + """ + Convert words in a string into their equivalent numbers. + Args: + text str: + short_scale boolean: True if short scale numbers should be used. + ordinals boolean: True if ordinals (e.g. birinci, ikinci, üçüncü) should + be parsed to their number values (1, 2, 3...) + + Returns: + str + The original text, with numbers subbed in where appropriate. 
+ + """ + tokens = tokenize(text) + numbers_to_replace = \ + _extract_numbers_with_text_az(tokens, short_scale, ordinals) + + numbers_to_replace.sort(key=lambda number: number.start_index) + + results = [] + for token in tokens: + if not numbers_to_replace or \ + token.index < numbers_to_replace[0].start_index: + results.append(token.word) + else: + if numbers_to_replace and \ + token.index == numbers_to_replace[0].start_index: + results.append(str(numbers_to_replace[0].value)) + if numbers_to_replace and \ + token.index == numbers_to_replace[0].end_index: + numbers_to_replace.pop(0) + + return ' '.join(results) + + +def _extract_numbers_with_text_az(tokens, short_scale=True, + ordinals=False, fractional_numbers=True): + """ + Extract all numbers from a list of Tokens, with the words that + represent them. + + Args: + [Token]: The tokens to parse. + short_scale bool: True if short scale numbers should be used, False for + long scale. True by default. + ordinals bool: True if ordinal words (birinci, ikinci, üçüncü, etc) should + be parsed. + fractional_numbers bool: True if we should look for fractions and + decimals. + + Returns: + [ReplaceableNumber]: A list of tuples, each containing a number and a + string. + + """ + placeholder = "" # inserted to maintain correct indices + results = [] + while True: + to_replace = \ + _extract_number_with_text_az(tokens, short_scale, + ordinals, fractional_numbers) + if not to_replace: + break + + results.append(to_replace) + + tokens = [ + t if not + to_replace.start_index <= t.index <= to_replace.end_index + else + Token(placeholder, t.index) for t in tokens + ] + results.sort(key=lambda n: n.start_index) + return results + + +def _extract_number_with_text_az(tokens, short_scale=True, + ordinals=False, fractional_numbers=True): + """ + This function extracts a number from a list of Tokens. 
+ + Args: + tokens str: the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers + fractional_numbers (bool): True if we should look for fractions and + decimals. + Returns: + ReplaceableNumber + + """ + number, tokens = \ + _extract_number_with_text_az_helper(tokens, short_scale, + ordinals, fractional_numbers) + return ReplaceableNumber(number, tokens) + + +def _extract_number_with_text_az_helper(tokens, + short_scale=True, ordinals=False, + fractional_numbers=True): + """ + Helper for _extract_number_with_text_az. + + This contains the real logic for parsing, but produces + a result that needs a little cleaning (specific, it may + contain leading articles that can be trimmed off). + + Args: + tokens [Token]: + short_scale boolean: + ordinals boolean: + fractional_numbers boolean: + + Returns: + int or float, [Tokens] + + """ + if fractional_numbers: + fraction, fraction_text = \ + _extract_fraction_with_text_az(tokens, short_scale, ordinals) + if fraction: + # print("fraction") + return fraction, fraction_text + + decimal, decimal_text = \ + _extract_decimal_with_text_az(tokens, short_scale, ordinals) + if decimal: + # print("decimal") + return decimal, decimal_text + + return _extract_whole_number_with_text_az(tokens, short_scale, ordinals) + + +def _extract_fraction_with_text_az(tokens, short_scale, ordinals): + """ + Extract fraction numbers from a string. + + This function handles text such as '2 və dörddə üç'. Note that "yarım" or + similar will be parsed by the whole number function. + + Args: + tokens [Token]: words and their indexes in the original string. + short_scale boolean: + ordinals boolean: + + Returns: + (int or float, [Token]) + The value found, and the list of relevant tokens. + (None, None) if no fraction value is found. 
+ + """ + for c in _FRACTION_MARKER_AZ: + partitions = partition_list(tokens, lambda t: t.word == c) + + if len(partitions) == 3: + numbers1 = \ + _extract_numbers_with_text_az(partitions[0], short_scale, + ordinals, fractional_numbers=False) + numbers2 = \ + _extract_numbers_with_text_az(partitions[2], short_scale, + ordinals, fractional_numbers=True) + + if not numbers1 or not numbers2: + return None, None + + # ensure first is not a fraction and second is a fraction + num1 = numbers1[-1] + num2 = numbers2[0] + if num1.value >= 1 and 0 < num2.value < 1: + return num1.value + num2.value, \ + num1.tokens + partitions[1] + num2.tokens + + return None, None + + +def _extract_decimal_with_text_az(tokens, short_scale, ordinals): + """ + Extract decimal numbers from a string. + + This function handles text such as '2 nöqtə 5'. + + Notes: + While this is a helper for extractnumber_az, it also depends on + extractnumber_az, to parse out the components of the decimal. + + This does not currently handle things like: + number dot number number number + + Args: + tokens [Token]: The text to parse. + short_scale boolean: + ordinals boolean: + + Returns: + (float, [Token]) + The value found and relevant tokens. + (None, None) if no decimal value is found. + + """ + for c in _DECIMAL_MARKER_AZ: + partitions = partition_list(tokens, lambda t: t.word == c) + + if len(partitions) == 3: + numbers1 = \ + _extract_numbers_with_text_az(partitions[0], short_scale, + ordinals, fractional_numbers=False) + numbers2 = \ + _extract_numbers_with_text_az(partitions[2], short_scale, + ordinals, fractional_numbers=False) + if not numbers1 or not numbers2: + return None, None + + number = numbers1[-1] + decimal = numbers2[0] + + # TODO handle number dot number number number + if "." not in str(decimal.text): + return number.value + float('0.' 
+ str(decimal.value)), \ + number.tokens + partitions[1] + decimal.tokens + return None, None + + +def _extract_whole_number_with_text_az(tokens, short_scale, ordinals): + """ + Handle numbers not handled by the decimal or fraction functions. This is + generally whole numbers. Note that phrases such as "yarım" will be + handled by this function. + + Args: + tokens [Token]: + short_scale boolean: + ordinals boolean: + + Returns: + int or float, [Tokens] + The value parsed, and tokens that it corresponds to. + + """ + multiplies, string_num_ordinal, string_num_scale = \ + _initialize_number_data_az(short_scale, speech=ordinals is not None) + + number_words = [] # type: List[Token] + val = False + prev_val = None + next_val = None + to_sum = [] + # print(tokens, ordinals) + for idx, token in enumerate(tokens): + current_val = None + if next_val: + next_val = None + continue + + word = token.word.lower() + if word in _NEGATIVES_AZ: + number_words.append(token) + continue + + prev_word = tokens[idx - 1].word.lower() if idx > 0 else "" + next_word = tokens[idx + 1].word.lower() if idx + 1 < len(tokens) else "" + # print(prev_word, word, next_word, number_words) + if word not in string_num_scale and \ + word not in _STRING_NUM_AZ and \ + word not in _SUMS_AZ and \ + word not in multiplies and \ + not (ordinals and word in string_num_ordinal) and \ + not is_numeric(word) and \ + not is_fractional_az(word, short_scale=short_scale) and \ + not look_for_fractions(word.split('/')): + # print("a1") + words_only = [token.word for token in number_words] + + if number_words and not all([w.lower() in + _NEGATIVES_AZ for w in words_only]): + break + else: + number_words = [] + continue + elif word not in multiplies \ + and word not in _SPOKEN_EXTRA_NUM_AZ \ + and prev_word not in multiplies \ + and prev_word not in _SUMS_AZ \ + and not (ordinals and prev_word in string_num_ordinal) \ + and prev_word not in _NEGATIVES_AZ: + number_words = [token] + # print("a2") + elif prev_word in 
_SUMS_AZ and word in _SUMS_AZ: + number_words = [token] + # print("a3") + elif ordinals is None and \ + (word in string_num_ordinal or word in _SPOKEN_EXTRA_NUM_AZ): + # print("a4") + # flagged to ignore this token + continue + else: + # print("a5") + number_words.append(token) + + # is this word already a number ? + if is_numeric(word): + # print("b") + if word.isdigit(): # doesn't work with decimals + val = int(word) + else: + val = float(word) + current_val = val + + # is this word the name of a number ? + if word in _STRING_NUM_AZ: + val = _STRING_NUM_AZ.get(word) + current_val = val + # print("c1", current_val) + elif word in string_num_scale: + val = string_num_scale.get(word) + current_val = val + # print("c2") + elif ordinals and word in string_num_ordinal: + val = string_num_ordinal[word] + current_val = val + # print("c3") + # is the prev word a number and should we sum it? + # twenty two, fifty six + if (prev_word in _SUMS_AZ and val and val < 10) or all([prev_word in + multiplies, + val < prev_val if prev_val else False]): + val = prev_val + val + # print("d") + + # is the prev word a number and should we multiply it? + # twenty hundred, six hundred + if word in multiplies: + if not prev_val: + prev_val = 1 + val = prev_val * val + # print("e") + + # is this a spoken fraction? + # 1 yarım fincan - yarım fincan + if current_val is None and not (ordinals is None and word in _SPOKEN_EXTRA_NUM_AZ): + val = is_fractional_az(word, short_scale=short_scale, + spoken=ordinals is not None) + if val: + if prev_val: + val += prev_val + current_val = val + # print("f", current_val, prev_val) + if word in _SPOKEN_EXTRA_NUM_AZ: + break + + # dörddə bir + if ordinals is False: + temp = prev_val + prev_val = is_fractional_az(prev_word, short_scale=short_scale) + if prev_val: + if not val: + val = 1 + val = val * prev_val + if idx + 1 < len(tokens): + number_words.append(tokens[idx + 1]) + else: + prev_val = temp + # print("g", prev_val) + + # is this a negative number? 
+ if val and prev_word and prev_word in _NEGATIVES_AZ: + val = 0 - val + # print("h") + + # let's make sure it isn't a fraction + if not val: + # look for fractions like "2/3" + aPieces = word.split('/') + if look_for_fractions(aPieces): + val = float(aPieces[0]) / float(aPieces[1]) + current_val = val + # print("i") + + else: + if current_val and all([ + prev_word in _SUMS_AZ, + word not in _SUMS_AZ, + word not in multiplies, + current_val >= 10]): + # Backtrack - we've got numbers we can't sum. + # print("j", number_words, prev_val) + number_words.pop() + val = prev_val + break + prev_val = val + + if word in multiplies and next_word not in multiplies: + # handle long numbers + # six hundred sixty six + # two million five hundred thousand + # + # This logic is somewhat complex, and warrants + # extensive documentation for the next coder's sake. + # + # The current word is a power of ten. `current_val` is + # its integer value. `val` is our working sum + # (above, when `current_val` is 1 million, `val` is + # 2 million.) + # + # We have a dict `string_num_scale` containing [value, word] + # pairs for "all" powers of ten: string_num_scale[10] == "ten. + # + # We need go over the rest of the tokens, looking for other + # powers of ten. If we find one, we compare it with the current + # value, to see if it's smaller than the current power of ten. + # + # Numbers which are not powers of ten will be passed over. + # + # If all the remaining powers of ten are smaller than our + # current value, we can set the current value aside for later, + # and begin extracting another portion of our final result. + # For example, suppose we have the following string. + # The current word is "million".`val` is 9000000. + # `current_val` is 1000000. + # + # "nine **million** nine *hundred* seven **thousand** + # six *hundred* fifty seven" + # + # Iterating over the rest of the string, the current + # value is larger than all remaining powers of ten. 
+ # + # The if statement passes, and nine million (9000000) + # is appended to `to_sum`. + # + # The main variables are reset, and the main loop begins + # assembling another number, which will also be appended + # under the same conditions. + # + # By the end of the main loop, to_sum will be a list of each + # "place" from 100 up: [9000000, 907000, 600] + # + # The final three digits will be added to the sum of that list + # at the end of the main loop, to produce the extracted number: + # + # sum([9000000, 907000, 600]) + 57 + # == 9,000,000 + 907,000 + 600 + 57 + # == 9,907,657 + # + # >>> foo = "nine million nine hundred seven thousand six + # hundred fifty seven" + # >>> extract_number(foo) + # 9907657 + # print("k", tokens[idx+1:]) + time_to_sum = True + for other_token in tokens[idx+1:]: + if other_token.word.lower() in multiplies: + if string_num_scale[other_token.word.lower()] >= current_val: + time_to_sum = False + else: + continue + if not time_to_sum: + break + if time_to_sum: + # print("l") + to_sum.append(val) + val = 0 + prev_val = 0 + + if val is not None and to_sum: + # print("m", to_sum) + val += sum(to_sum) + # print(val, number_words, "end") + return val, number_words + + +def _initialize_number_data_az(short_scale, speech=True): + """ + Generate dictionaries of words to numbers, based on scale. + + This is a helper function for _extract_whole_number. 
+ + Args: + short_scale (bool): use short scale if True, long scale if False + speech (bool): consider extra words (_SPOKEN_EXTRA_NUM_AZ) to be numbers + + Returns: + (set(str), dict(str, number), dict(str, number)) + multiplies, string_num_ordinal, string_num_scale + + """ + multiplies = _MULTIPLIES_SHORT_SCALE_AZ if short_scale \ + else _MULTIPLIES_LONG_SCALE_AZ + + string_num_ordinal_az = _STRING_SHORT_ORDINAL_AZ if short_scale \ + else _STRING_LONG_ORDINAL_AZ + + string_num_scale_az = _SHORT_SCALE_AZ if short_scale else _LONG_SCALE_AZ + string_num_scale_az = invert_dict(string_num_scale_az) + + return multiplies, string_num_ordinal_az, string_num_scale_az + + +def extract_number_az(text, short_scale=True, ordinals=False): + """ + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + + Args: + text (str): the string to extract a number from + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers + Returns: + (int) or (float) or False: The extracted number or False if no number + was found + + """ + return _extract_number_with_text_az(tokenize(text.lower()), + short_scale, ordinals).value + + +def extract_duration_az(text): + """ + Convert an Azerbaijani phrase into a duration (timedelta) + + Convert things like: + "10 dəqiqə" + "2 yarım saat" + "3 gün 8 saat 10 dəqiqə 49 saniyə" + into a timedelta representing the total duration. + + The words used in the duration will be consumed, and + the remainder returned. + + As an example, "5 dəqiqəyə taymer qur" would return + (timedelta(seconds=300), "taymer qur"). + + Args: + text (str): string containing a duration + + Returns: + (timedelta, str): + A tuple containing the duration and the remaining text + not consumed in the parsing. The first value will + be None if no duration is found. The text returned + will have whitespace stripped from the ends. 
+ """ + if not text: + return None + + time_units = { + 'microseconds': 0, + 'milliseconds': 0, + 'seconds': 0, + 'minutes': 0, + 'hours': 0, + 'days': 0, + 'weeks': 0 + } + + time_units_az = { + 'mikrosaniyə': 'microseconds', + 'milisaniyə': 'milliseconds', + 'saniyə': 'seconds', + 'dəqiqə': 'minutes', + 'saat': 'hours', + 'gün': 'days', + 'həftə': 'weeks' + } + + pattern = r"(?P\d+(?:\.?\d+)?)(?:\s+|\-){unit}?(?:yə|a|ə)?(?:(?:\s|,)+)?(?Pyarım|0\.5)?(?:a)?" + text = _convert_words_to_numbers_az(text) + for unit_az in time_units_az: + unit_pattern = pattern.format(unit=unit_az) + def repl(match): + time_units[time_units_az[unit_az]] += float(match.group(1)) + (0.5 if match.group(2) else 0) + return '' + text = re.sub(unit_pattern, repl, text) + + text = text.strip() + duration = timedelta(**time_units) if any(time_units.values()) else None + + return (duration, text) + + +def extract_datetime_az(text, anchorDate=None, default_time=None): + """ Convert a human date reference into an exact datetime + + Convert things like + "bu gün" + "sabah günortadan sonra" + "gələn çərşənbə axşamı günorta 4 də" + "3 avqust" + into a datetime. If a reference date is not provided, the current + local time is used. Also consumes the words used to define the date + returning the remaining string. For example, the string + "çərşənbə axşamı hava necədir" + returns the date for the forthcoming çərşənbə axşamı relative to the reference + date and the remainder string + "hava necədir". + + The "gələn" instance of a day or weekend is considered to be no earlier than + 48 hours in the future. On Friday, "gələn Bazar ertəsi" would be in 3 days. + On Saturday, "gələn Bazar ertəsi" would be in 9 days. 
+ + Args: + text (str): string containing date words + anchorDate (datetime): A reference date/time for "sabah", etc + default_time (time): Time to set if no time was found in the string + + Returns: + [datetime, str]: An array containing the datetime and the remaining + text not consumed in the parsing, or None if no + date or time related text was found. + """ + + def clean_string(s, word_list): + # normalize and lowercase utt (replaces words with numbers) + s = _convert_words_to_numbers_az(s, ordinals=None) + # clean unneeded punctuation and capitalization among other things. + s = s.lower().replace('?', '').replace('.', '').replace(',', '') + + wordList = s.split() + skip_next_word = False + new_words = [] + for idx, word in enumerate(wordList): + if skip_next_word: + skip_next_word = False + continue + wordNext = wordList[idx + 1] if idx + 1 < len(wordList) else "" + ordinals = ["ci", "cü", "cı", "cu"] + if word[0].isdigit(): + for ordinal in ordinals: + if ordinal in wordNext: + skip_next_word = True + if ((word == "bu" and wordNext == "gün") or + (word in ['cümə', 'çərşənbə'] and 'axşamı'in wordNext) or + (word == 'bazar' and 'ertəsi' in wordNext) or + (word == 'günortadan' and wordNext == 'sonra') or + (word == 'gecə' and 'yarısı' in wordNext)): + word = word + ' ' + wordNext + skip_next_word = True + + for orig_word in word_list: + if word.startswith(orig_word): + word = orig_word + break + + new_words.append(word) + + return new_words + + def date_found(): + return found or \ + ( + datestr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs or minOffset != 0 or + minAbs or secOffset != 0 + ) + + if not anchorDate: + anchorDate = now_local() + + if text == "": + return None + + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + today = anchorDate.strftime("%w") + currentYear = anchorDate.strftime("%Y") + fromFlag = False + datestr = "" + hasYear = False + timeQualifier 
= "" + word_list = [] + timeQualifiersAM = ['səhər', 'gecə'] + timeQualifiersPM = ['günorta', 'axşam', 'nahar'] + word_list += timeQualifiersAM + timeQualifiersPM + timeQualifiersList = set(timeQualifiersAM + timeQualifiersPM) + markers = ['da', 'də', 'sonra', "ərzində", "günündən", "günü", "gündən", "gün"] + days = ['bazar ertəsi', 'çərşənbə axşamı', 'çərşənbə', + 'cümə axşamı', 'cümə', 'şənbə', 'bazar'] + months = ['yanvar', 'fevral', 'mart', 'aprel', 'may', 'iyun', + 'iyul', 'avqust', 'sentyabr', 'oktyabr', 'moyabr', + 'dekabr'] + eng_months = ['january', 'february', 'march', 'april', 'may', 'june', + 'july', 'august', 'september', 'october', 'november', + 'december'] + word_list += days + months + recur_markers = days + [_generate_plurals_az(d) for d in days] + ['həftə sonu', 'iş günü', + 'həftə sonları', 'iş günləri'] + monthsShort = ['yan', 'fev', 'mar', 'apr', 'may', 'ıyn', 'ıyl', 'avq', + 'sen', 'okt', 'noy', 'dek'] + year_multiples = ["onillik", "yüzillik", "minillik"] + day_multiples = ["həftə", "ay", "il"] + word_list += year_multiples + day_multiples + ['saat', 'dəqiqə', 'saniyə', 'sonra', 'gecə yarısı', 'günortadan sonra', 'gün'] + word_list.sort(key=lambda x: len(x), reverse=True) + words = clean_string(text, word_list) + + for idx, word in enumerate(words): + if word == "": + continue + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" + + start = idx + used = 0 + # save timequalifier for later + if word == "indi" and not datestr: + resultStr = " ".join(words[idx + 1:]) + resultStr = ' '.join(resultStr.split()) + extractedDate = anchorDate.replace(microsecond=0) + return [extractedDate, resultStr] + elif wordNext in year_multiples: + multiplier = None + if is_numeric(word): + multiplier = extract_number_az(word) + 
multiplier = multiplier or 1 + multiplier = int(multiplier) + used += 2 + if "onillik" in wordNext: + yearOffset = multiplier * 10 + elif "yüzillik" in wordNext: + yearOffset = multiplier * 100 + elif "minillik" in wordNext: + yearOffset = multiplier * 1000 + elif word in timeQualifiersList: + timeQualifier = word + # parse bu qün, sabah, srağagün, dünən, birigün + elif word == "bu gün" and not fromFlag: + dayOffset = 0 + used += 1 + elif word == "sabah" and not fromFlag: + dayOffset = 1 + used += 1 + elif word == "srağagün" and not fromFlag: + dayOffset = -2 + used += 1 + elif word == "dünən" and not fromFlag: + dayOffset = -1 + used += 1 + elif word == "birigün" and not fromFlag: + dayOffset = 2 + used = 1 + # parse 5 gün, 10 həftə, keçən həftə, gələn həftə + elif word == "gün": + if wordPrev and wordPrev[0].isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used = 2 + if wordNext == "sonra": + used += 1 + elif word == "həftə" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + if wordNext == "sonra": + used += 1 + elif wordPrev == "gələn": + dayOffset = 7 + start -= 1 + used = 2 + if wordNext == "sonra": + used += 1 + elif wordPrev == "keçən": + dayOffset = -7 + start -= 1 + used = 2 + # parse 10 months, next month, last month + elif word == "ay" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == "gələn": + monthOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "keçən": + monthOffset = -1 + start -= 1 + used = 2 + # parse 5 il, gələn il, keçən il + elif word == "il" and not fromFlag and wordPrev: + if wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordPrev == "gələn": + yearOffset = 1 + start -= 1 + used = 2 + elif wordPrev == "keçən": + yearOffset = -1 + start -= 1 + used = 2 + if wordNext in markers: + used += 1 + # parse Monday, Tuesday, etc., and next Monday, + # last 
Tuesday, etc. + elif word in days and not fromFlag: + if wordNext in markers: + used += 1 + d = days.index(word) + dayOffset = (d + 1) - int(today) + used += 1 + if dayOffset < 0: + dayOffset += 7 + if wordPrev == "gələn": + if dayOffset <= 2: + dayOffset += 7 + used += 1 + start -= 1 + elif wordPrev == "keçən": + dayOffset -= 7 + used += 1 + start -= 1 + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months or word in monthsShort and not fromFlag: + try: + m = months.index(word) + except ValueError: + m = monthsShort.index(word) + used += 1 + datestr = eng_months[m] + if wordPrev and wordPrev[0].isdigit(): + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + if (wordNextNext and wordNextNext in markers) or wordNextNext == 'il': + used += 1 + else: + if wordNext and wordNext in markers: + used += 1 + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " " + wordNextNext + used += 1 + hasYear = True + if wordNextNextNext and wordNextNextNext in markers: + used += 1 + else: + if wordNextNext and wordNextNext in markers: + used += 1 + hasYear = False + + elif word == "bu": + used += 1 + dayOffset = 0 + if wordNext in markers: + used += 1 + + if used > 0: + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + military = False + + for idx, word in enumerate(words): + if word == "": + continue + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + # parse günorta, gecə 
yarısı, səhər, günortadan sonra, axşam, gecə + used = 0 + if word == "günorta": + hrAbs = 12 + used += 1 + elif word == "gecə yarısı": + hrAbs = 0 + used += 1 + elif word == "səhər": + if hrAbs is None: + hrAbs = 8 + used += 1 + elif word == "günortadan sonra": + if hrAbs is None: + hrAbs = 15 + used += 1 + elif word == "axşam": + if hrAbs is None: + hrAbs = 19 + used += 1 + elif word == "gecə": + if hrAbs is None: + hrAbs = 21 + used += 1 + # parse yarım saat + elif word == "saat": + if wordPrev == "yarım": + minOffset = 30 + if wordNext in markers: + used +=1 + + words[idx - 1] = "" + used += 1 + hrAbs = -1 + minAbs = -1 + # parse 5:00 am, 12:00 p.m., etc + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + wordNextNextNext = words[idx + 3] \ + if idx + 3 < len(words) else "" + + if ':' in word: + # parse colons + # "gecə 3:00" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + + else: + # try to parse numbers without colons + # 5 hours, 10 minutes etc. + length = len(word) + strNum = "" + remainder = "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + + if remainder == "": + remainder = wordNext.replace(".", "").lstrip().rstrip() + if ( + remainder == "pm" or + wordNext == "pm" or + remainder == "p.m." or + wordNext == "p.m."): + strHH = strNum + remainder = "pm" + used = 1 + elif ( + remainder == "am" or + wordNext == "am" or + remainder == "a.m." 
or + wordNext == "a.m."): + strHH = strNum + remainder = "am" + used = 1 + elif ( + remainder in recur_markers or + wordNext in recur_markers or + wordNextNext in recur_markers): + # Ex: "7 on mondays" or "3 this friday" + # Set strHH so that isTime == True + # when am or pm is not specified + strHH = strNum + used = 1 + else: + if ( + ("saat" in wordNext or "saat" in remainder) and + word[0] != '0' and + ( + int(strNum) < 100 or + int(strNum) > 2400 + )): + # "3 saat" + hrOffset = int(strNum) + used = 1 + isTime = False + hrAbs = -1 + minAbs = -1 + elif "dəqiqə" in wordNext or "dəqiqə" in wordNext: + # "10 dəqiqə" + minOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + if wordNextNext in markers: + used += 1 + elif "saniyə" in wordNext or "saniyə" in remainder: + # 5 saniyə + secOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + elif wordNext and wordNext[0].isdigit(): + # military time, e.g. "04 38 hours" + strHH = strNum + strMM = wordNext + military = True + used += 1 + if (wordNextNext and wordNextNext == "da" or + wordNextNext == "də" or + remainder == "da" or remainder == "də"): + used += 1 + elif wordNext in markers: + strHH = strNum + + HH = int(strHH) if strHH else 0 + MM = int(strMM) if strMM else 0 + if timeQualifier in timeQualifiersPM and HH < 12: + HH += 12 + + if HH > 24 or MM > 59: + isTime = False + used = 0 + if isTime: + hrAbs = HH + minAbs = MM + used += 1 + + if wordNext in markers or word in markers: + used += 1 + if used > 0: + # removed parsed words from the sentence + for i in range(used): + if idx + i >= len(words): + break + words[idx + i] = "" + # check that we found a date + if not date_found(): + return None + + if dayOffset is False: + dayOffset = 0 + + # perform date manipulation + + extractedDate = anchorDate.replace(microsecond=0) + + if datestr != "": + # date included an explicit date, e.g. 
"iyun 5" or "iyun 2, 2017" + try: + temp = datetime.strptime(datestr, "%B %d") + except ValueError: + # Try again, allowing the year + temp = datetime.strptime(datestr, "%B %d %Y") + extractedDate = extractedDate.replace(hour=0, minute=0, second=0) + if not hasYear: + temp = temp.replace(year=extractedDate.year, + tzinfo=extractedDate.tzinfo) + if extractedDate < temp: + extractedDate = extractedDate.replace( + year=int(currentYear), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + # ignore the current HH:MM:SS if relative using days or greater + if hrOffset == 0 and minOffset == 0 and secOffset == 0: + extractedDate = extractedDate.replace(hour=0, minute=0, second=0) + + if yearOffset != 0: + extractedDate = extractedDate + relativedelta(years=yearOffset) + if monthOffset != 0: + extractedDate = extractedDate + relativedelta(months=monthOffset) + if dayOffset != 0: + extractedDate = extractedDate + relativedelta(days=dayOffset) + if hrAbs != -1 and minAbs != -1: + # If no time was supplied in the string set the time to default + # time if it's available + if hrAbs is None and minAbs is None and default_time is not None: + hrAbs, minAbs = default_time.hour, default_time.minute + else: + hrAbs = hrAbs or 0 + minAbs = minAbs or 0 + + extractedDate = extractedDate + relativedelta(hours=hrAbs, + minutes=minAbs) + if (hrAbs != 0 or minAbs != 0) and datestr == "": + if not daySpecified and anchorDate > extractedDate: + extractedDate = extractedDate + relativedelta(days=1) + if hrOffset != 0: + extractedDate = extractedDate + relativedelta(hours=hrOffset) + if 
minOffset != 0: + extractedDate = extractedDate + relativedelta(minutes=minOffset) + if secOffset != 0: + extractedDate = extractedDate + relativedelta(seconds=secOffset) + for idx, word in enumerate(words): + if words[idx] == "və" and \ + words[idx - 1] == "" and words[idx + 1] == "": + words[idx] = "" + + resultStr = " ".join(words) + resultStr = ' '.join(resultStr.split()) + return [extractedDate, resultStr] + + +def is_fractional_az(input_str, short_scale=True, spoken=True): + """ + This function takes the given text and checks if it is a fraction. + + Args: + input_str (str): the string to check if fractional + short_scale (bool): use short scale if True, long scale if False + spoken (bool): + Returns: + (bool) or (float): False if not a fraction, otherwise the fraction + + """ + + fracts = {"dörddəbir": 4, "yarım": 2, "üçdəbir": 3} + for num in _FRACTION_STRING_AZ: + if num > 2: + fracts[_FRACTION_STRING_AZ[num]] = num + + if input_str.lower() in fracts and spoken: + return 1.0 / fracts[input_str.lower()] + return False + + +def extract_numbers_az(text, short_scale=True, ordinals=False): + """ + Takes in a string and extracts a list of numbers. + + Args: + text (str): the string to extract a number from + short_scale (bool): Use "short scale" or "long scale" for large + numbers -- over a million. The default is short scale, which + is now common in most English speaking countries. + See https://en.wikipedia.org/wiki/Names_of_large_numbers + ordinals (bool): consider ordinal numbers, e.g. 
third=3 instead of 1/3 + Returns: + list: list of extracted numbers as floats + """ + results = _extract_numbers_with_text_az(tokenize(text), + short_scale, ordinals) + return [float(result.value) for result in results] + + +class AzerbaijaniNormalizer(Normalizer): + with open(resolve_resource_file("text/az-az/normalize.json")) as f: + _default_config = json.load(f) + + def numbers_to_digits(self, utterance): + return _convert_words_to_numbers_az(utterance, ordinals=None) + + +def normalize_az(text, remove_articles=True): + """ Azerbaijani string normalization """ + return AzerbaijaniNormalizer().normalize(text, remove_articles) diff --git a/lingua_franca/res/text/az-az/and.word b/lingua_franca/res/text/az-az/and.word new file mode 100644 index 00000000..66928ff8 --- /dev/null +++ b/lingua_franca/res/text/az-az/and.word @@ -0,0 +1 @@ +və \ No newline at end of file diff --git a/lingua_franca/res/text/az-az/date_time.json b/lingua_franca/res/text/az-az/date_time.json new file mode 100644 index 00000000..bf0c0e72 --- /dev/null +++ b/lingua_franca/res/text/az-az/date_time.json @@ -0,0 +1,130 @@ +{ + "decade_format": { + "1": {"match": "^\\d$", "format": "{x}"}, + "2": {"match": "^1\\d$", "format": "{xx}"}, + "3": {"match": "^\\d0$", "format": "{x0}"}, + "4": {"match": "^[2-9]\\d$", "format": "{x0} {x}"}, + "default": "{number}" + }, + "hundreds_format": { + "1": {"match": "^1\\d{2}$", "format": "yüz"}, + "2": {"match": "^[2-9]\\d{2}$", "format": "{x_in_x00} yüz"}, + "default": "{number}" + }, + "thousand_format": { + "1": {"match": "^10\\d{2}$", "format": "min"}, + "2": {"match": "^[2-9]0\\d{2}$", "format": "{x_in_x000} min"}, + "3": {"match": "^11\\d{2}$", "format": "min yüz"}, + "4": {"match": "^1\\d{3}$", "format": "min {x_in_x00} yüz"}, + "5": {"match": "^[2-9]1\\d{2}$", "format": "{x_in_x000} min yüz"}, + "6": {"match": "^[2-9]\\d{3}$", "format": "{x_in_x000} min {x_in_x00} yüz"}, + "default": "{number}" + }, + "year_format": { + "1": {"match": "^\\d{1}$", 
"format": "{bc} {formatted_decade}"}, + "2": {"match": "^\\d{2}$", "format": "{bc} {formatted_decade}"}, + "3": {"match": "^\\d00$", "format": "{bc} {formatted_hundreds}"}, + "4": {"match": "^\\d{3}$", "format": "{bc} {formatted_hundreds} {formatted_decade}"}, + "5": {"match": "^\\d\\d00$", "format": "{bc} {formatted_thousand}"}, + "6": {"match": "^\\d{4}$", "format": "{bc} {formatted_thousand} {formatted_decade}"}, + "default": "{bc} {year}", + "bc": "e.ə." + }, + "date_format": { + "date_full": "{weekday}, {month} {day}, {formatted_year}", + "date_full_no_year": "{weekday}, {month} {day}", + "date_full_no_year_month": "{weekday}, ayın {day}", + "today": "bu gün", + "tomorrow": "sabah", + "yesterday": "dünən" + }, + "date_time_format": { + "date_time": "{formatted_date} {formatted_time}" + }, + "weekday": { + "0": "bazar ertəsi", + "1": "çərşənbə axşamı", + "2": "çərşənbə", + "3": "cümə axşamı", + "4": "cümə", + "5": "şənbə", + "6": "bazar" + }, + "date": { + "1": "biri", + "2": "ikisi", + "3": "üçü", + "4": "dördü", + "5": "beşi", + "6": "altısı", + "7": "yeddisi", + "8": "səkkizi", + "9": "doqquzu", + "10": "onu", + "11": "on biri", + "12": "on ikisi", + "13": "on üçü", + "14": "on dördü", + "15": "on beşi", + "16": "on altısı", + "17": "on yeddisi", + "18": "on səkkizi", + "19": "on doqquzu", + "20": "iyirmisi", + "21": "iyirmi biri", + "22": "iyirmi ikisi", + "23": "iyirmi üçü", + "24": "iyirmi dördü", + "25": "iyirmi beşi", + "26": "iyirmi altısı", + "27": "iyirmi yeddisi", + "28": "iyirmi səkkizi", + "29": "iyirmi doqquzu", + "30": "otuzu", + "31": "otuz biri" + }, + "month": { + "1": "yanvarın", + "2": "fevralın", + "3": "martın", + "4": "aprelin", + "5": "mayın", + "6": "iyunun", + "7": "iyulun", + "8": "avqustun", + "9": "sentyabrın", + "10": "oktyabrın", + "11": "noyabrın", + "12": "dekabrın" + }, + "number": { + "0": "sıfır", + "1": "bir", + "2": "iki", + "3": "üç", + "4": "dörd", + "5": "beş", + "6": "altı", + "7": "yeddi", + "8": "səkkiz", + "9": 
"doqquz", + "10": "on", + "11": "on bir", + "12": "on iki", + "13": "on üç", + "14": "on dörd", + "15": "on beş", + "16": "on altı", + "17": "on yeddi", + "18": "on səkkiz", + "19": "on doqquz", + "20": "iyirmi", + "30": "otuz", + "40": "qırx", + "50": "əlli", + "60": "altmış", + "70": "yetmiş", + "80": "səksən", + "90": "doxsan" + } +} diff --git a/lingua_franca/res/text/az-az/date_time_test.json b/lingua_franca/res/text/az-az/date_time_test.json new file mode 100644 index 00000000..d43f740f --- /dev/null +++ b/lingua_franca/res/text/az-az/date_time_test.json @@ -0,0 +1,43 @@ +{ + "test_nice_year": { + "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "e.ə. bir" }, + "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "e.ə. on" }, + "3": {"datetime_param": "92, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "e.ə. doxsan iki" }, + "4": {"datetime_param": "803, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "səkkiz yüz üç" }, + "5": {"datetime_param": "811, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "səkkiz yüz on bir" }, + "6": {"datetime_param": "454, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "dörd yüz əlli dörd" }, + "7": {"datetime_param": "1005, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "min beş" }, + "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "min on iki" }, + "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "min qırx altı" }, + "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "min səkkiz yüz yeddi" }, + "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "min yeddi yüz on yeddi" }, + "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "min doqquz yüz səksən səkkiz"}, + "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "iki min doqquz"}, + "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", 
"assertEqual": "iki min on səkkiz"}, + "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "iki min iyirmi bir"}, + "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "iki min otuz"}, + "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "iki min yüz" }, + "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "min" }, + "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "iki min" }, + "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "e.ə. üç min yüz iyirmi" }, + "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "e.ə. üç min iki yüz qırx bir" }, + "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "beş min iki yüz" }, + "23": {"datetime_param": "1100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "min yüz" }, + "24": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "iki min yüz" } + }, + "test_nice_date": { + "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "çərşənbə axşamı, yanvarın otuz biri, iki min on yeddi"}, + "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "bazar, fevralın dördü, iki min on səkkiz"}, + "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "bazar, fevralın dördü"}, + "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "bazar, ayın dördü"}, + "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "sabah"}, + "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "bu gün"}, + "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "dünən"}, + "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": 
"bazar, fevralın dördü"}, + "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "bazar, fevralın dördü, iki min on səkkiz"} + }, + "test_nice_date_time": { + "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "çərşənbə axşamı, yanvarın otuz biri, iki min on yeddi gündüz ikiyə iyirmi iki dəqiqə işləyib"}, + "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "çərşənbə axşamı, yanvarın otuz biri, iki min on yeddi on üç iyirmi iki"} + } +} diff --git a/lingua_franca/res/text/az-az/day.word b/lingua_franca/res/text/az-az/day.word new file mode 100644 index 00000000..e202db16 --- /dev/null +++ b/lingua_franca/res/text/az-az/day.word @@ -0,0 +1 @@ +gün \ No newline at end of file diff --git a/lingua_franca/res/text/az-az/days.word b/lingua_franca/res/text/az-az/days.word new file mode 100644 index 00000000..e202db16 --- /dev/null +++ b/lingua_franca/res/text/az-az/days.word @@ -0,0 +1 @@ +gün \ No newline at end of file diff --git a/lingua_franca/res/text/az-az/hour.word b/lingua_franca/res/text/az-az/hour.word new file mode 100644 index 00000000..a1073d72 --- /dev/null +++ b/lingua_franca/res/text/az-az/hour.word @@ -0,0 +1 @@ +saat \ No newline at end of file diff --git a/lingua_franca/res/text/az-az/hours.word b/lingua_franca/res/text/az-az/hours.word new file mode 100644 index 00000000..a1073d72 --- /dev/null +++ b/lingua_franca/res/text/az-az/hours.word @@ -0,0 +1 @@ +saat \ No newline at end of file diff --git a/lingua_franca/res/text/az-az/minute.word b/lingua_franca/res/text/az-az/minute.word new file mode 100644 index 00000000..d6040db0 --- /dev/null +++ b/lingua_franca/res/text/az-az/minute.word @@ -0,0 +1 @@ +dəqiqə \ No newline at end of file diff --git a/lingua_franca/res/text/az-az/minutes.word b/lingua_franca/res/text/az-az/minutes.word new file mode 100644 index 
00000000..d6040db0 --- /dev/null +++ b/lingua_franca/res/text/az-az/minutes.word @@ -0,0 +1 @@ +dəqiqə \ No newline at end of file diff --git a/lingua_franca/res/text/az-az/normalize.json b/lingua_franca/res/text/az-az/normalize.json new file mode 100644 index 00000000..da2f0ca7 --- /dev/null +++ b/lingua_franca/res/text/az-az/normalize.json @@ -0,0 +1,45 @@ +{ + "lowercase": false, + "numbers_to_digits": true, + "expand_contractions": true, + "remove_symbols": false, + "remove_accents": false, + "remove_articles": false, + "remove_stopwords": false, + "contractions": {}, + "word_replacements": { + "sora": "sonra" + }, + "number_replacements": { + "sıfır": "0", + "bir": "1", + "iki": "2", + "üç": "3", + "dörd": "4", + "beş": "5", + "altı": "6", + "yeddi": "7", + "səkkiz": "8", + "doqquz": "9", + "on": "10", + "on bir": "11", + "on iki": "12", + "on üç": "13", + "on dörd": "14", + "on beş": "15", + "on altı": "16", + "on yeddi": "17", + "on səkkiz": "18", + "on doqquz": "19", + "iyirmi": "20", + "otuz": "30", + "qırx": "40", + "əlli": "50", + "altmış": "60", + "yetmiş": "70", + "səksən": "80", + "doxsan": "90" + }, + "stopwords": [], + "articles": [] +} \ No newline at end of file diff --git a/lingua_franca/res/text/az-az/or.word b/lingua_franca/res/text/az-az/or.word new file mode 100644 index 00000000..0bc20c09 --- /dev/null +++ b/lingua_franca/res/text/az-az/or.word @@ -0,0 +1 @@ +ya \ No newline at end of file diff --git a/lingua_franca/res/text/az-az/second.word b/lingua_franca/res/text/az-az/second.word new file mode 100644 index 00000000..8badec1e --- /dev/null +++ b/lingua_franca/res/text/az-az/second.word @@ -0,0 +1 @@ +saniyə \ No newline at end of file diff --git a/lingua_franca/res/text/az-az/seconds.word b/lingua_franca/res/text/az-az/seconds.word new file mode 100644 index 00000000..8badec1e --- /dev/null +++ b/lingua_franca/res/text/az-az/seconds.word @@ -0,0 +1 @@ +saniyə \ No newline at end of file diff --git a/test/test_format_az.py 
b/test/test_format_az.py new file mode 100644 index 00000000..40036ef8 --- /dev/null +++ b/test/test_format_az.py @@ -0,0 +1,549 @@ +# +# Copyright 2021 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import json +import unittest +import datetime +import ast +import warnings +import sys +from pathlib import Path + +from lingua_franca import load_languages, unload_languages, set_default_lang, \ + get_primary_lang_code, get_active_langs, get_supported_langs +from lingua_franca.internal import UnsupportedLanguageError +from lingua_franca.format import nice_number +from lingua_franca.format import nice_time +from lingua_franca.format import nice_date +from lingua_franca.format import nice_date_time +from lingua_franca.format import nice_year +from lingua_franca.format import nice_duration +from lingua_franca.format import pronounce_number +from lingua_franca.format import date_time_format +from lingua_franca.format import join_list +from lingua_franca.time import default_timezone + + +def setUpModule(): + load_languages(get_supported_langs()) + # TODO spin English tests off into another file, like other languages, so we + # don't have to do this confusing thing in the "master" test_format.py + set_default_lang('az-az') + + +def tearDownModule(): + unload_languages(get_active_langs()) + + +NUMBERS_FIXTURE_AZ = { + 1.435634: '1.436', + 2: '2', + 5.0: '5', + 0.027: '0.027', + 0.5: 'yarım', + 1.333: '1 və üçdə 1', + 2.666: '2 və üçdə 2', + 0.25: 'dörddə 1', 
+ 1.25: '1 və dörddə 1', + 0.75: 'dörddə 3', + 1.75: '1 və dörddə 3', + 3.4: '3 və beşdə 2', + 16.8333: '16 və altıda 5', + 12.5714: '12 və yeddidə 4', + 9.625: '9 və səkkizdə 5', + 6.777: '6 və doqquzda 7', + 3.1: '3 və onda 1', + 2.272: '2 və on birdə 3', + 5.583: '5 və on ikidə 7', + 8.384: '8 və on üçdə 5', + 0.071: 'on dörddə 1', + 6.466: '6 və on beşdə 7', + 8.312: '8 və on altıda 5', + 2.176: '2 və on yeddidə 3', + 200.722: '200 və on səkkizdə 13', + 7.421: '7 və on doqquzda 8', + 0.05: 'iyirmidə 1' +} + + +class TestNiceNumberFormat(unittest.TestCase): + + tmp_var = None + + def set_tmp_var(self, val): + self.tmp_var = val + + def test_convert_float_to_nice_number(self): + for number, number_str in NUMBERS_FIXTURE_AZ.items(): + self.assertEqual(nice_number(number), number_str, + 'should format {} as {} and not {}'.format( + number, number_str, nice_number(number))) + + def test_specify_denominator(self): + self.assertEqual(nice_number(5.5, denominators=[1, 2, 3]), + '5 yarım', + 'should format 5.5 as 5 yarım not {}'.format( + nice_number(5.5, denominators=[1, 2, 3]))) + self.assertEqual(nice_number(2.333, denominators=[1, 2]), + '2.333', + 'should format 2.333 as 2.333 not {}'.format( + nice_number(2.333, denominators=[1, 2]))) + + def test_no_speech(self): + self.assertEqual(nice_number(6.777, speech=False), + '6 7/9', + 'should format 6.777 as 6 7/9 not {}'.format( + nice_number(6.777, speech=False))) + self.assertEqual(nice_number(6.0, speech=False), + '6', + 'should format 6.0 as 6 not {}'.format( + nice_number(6.0, speech=False))) + + def test_unknown_language(self): + """ An unknown / unhandled language should return the string + representation of the input number. + """ + def bypass_warning(): + self.assertEqual( + nice_number(5.5, lang='as-df'), '5.5', + 'should format 5.5 ' + 'as 5.5 not {}'.format( + nice_number(5.5, lang='as-df'))) + + # Should throw a warning. 
Would raise the same text as a + # NotImplementedError, but nice_number() bypasses and returns + # its input as a string + self.assertWarns(UserWarning, bypass_warning) + + +class TestPronounceNumber(unittest.TestCase): + def test_convert_int(self): + self.assertEqual(pronounce_number(0), "sıfır") + self.assertEqual(pronounce_number(1), "bir") + self.assertEqual(pronounce_number(10), "on") + self.assertEqual(pronounce_number(15), "on beş") + self.assertEqual(pronounce_number(20), "iyirmi") + self.assertEqual(pronounce_number(27), "iyirmi yeddi") + self.assertEqual(pronounce_number(30), "otuz") + self.assertEqual(pronounce_number(33), "otuz üç") + + def test_convert_negative_int(self): + self.assertEqual(pronounce_number(-1), "mənfi bir") + self.assertEqual(pronounce_number(-10), "mənfi on") + self.assertEqual(pronounce_number(-15), "mənfi on beş") + self.assertEqual(pronounce_number(-20), "mənfi iyirmi") + self.assertEqual(pronounce_number(-27), "mənfi iyirmi yeddi") + self.assertEqual(pronounce_number(-30), "mənfi otuz") + self.assertEqual(pronounce_number(-33), "mənfi otuz üç") + + def test_convert_decimals(self): + self.assertEqual(pronounce_number(0.05), "sıfır nöqtə sıfır beş") + self.assertEqual(pronounce_number(-0.05), "mənfi sıfır nöqtə sıfır beş") + self.assertEqual(pronounce_number(1.234), + "bir nöqtə iki üç") + self.assertEqual(pronounce_number(21.234), + "iyirmi bir nöqtə iki üç") + self.assertEqual(pronounce_number(21.234, places=1), + "iyirmi bir nöqtə iki") + self.assertEqual(pronounce_number(21.234, places=0), + "iyirmi bir") + self.assertEqual(pronounce_number(21.234, places=3), + "iyirmi bir nöqtə iki üç dörd") + self.assertEqual(pronounce_number(21.234, places=4), + "iyirmi bir nöqtə iki üç dörd") + self.assertEqual(pronounce_number(21.234, places=5), + "iyirmi bir nöqtə iki üç dörd") + self.assertEqual(pronounce_number(-1.234), + "mənfi bir nöqtə iki üç") + self.assertEqual(pronounce_number(-21.234), + "mənfi iyirmi bir nöqtə iki üç") + 
self.assertEqual(pronounce_number(-21.234, places=1), + "mənfi iyirmi bir nöqtə iki") + self.assertEqual(pronounce_number(-21.234, places=0), + "mənfi iyirmi bir") + self.assertEqual(pronounce_number(-21.234, places=3), + "mənfi iyirmi bir nöqtə iki üç dörd") + self.assertEqual(pronounce_number(-21.234, places=4), + "mənfi iyirmi bir nöqtə iki üç dörd") + self.assertEqual(pronounce_number(-21.234, places=5), + "mənfi iyirmi bir nöqtə iki üç dörd") + + def test_convert_hundreds(self): + self.assertEqual(pronounce_number(100), "yüz") + self.assertEqual(pronounce_number(666), "altı yüz altmış altı") + self.assertEqual(pronounce_number(1456), "min, dörd yüz əlli altı") + self.assertEqual(pronounce_number(103254654), "yüz üç milyon, " + "iki yüz əlli dörd min, " + "altı yüz əlli dörd") + self.assertEqual(pronounce_number(1512457), "bir milyon, " + "beş yüz on iki min, " + "dörd yüz əlli yeddi") + self.assertEqual(pronounce_number(209996), "iki yüz doqquz min, " + "doqquz yüz doxsan altı") + + + + + + def test_convert_scientific_notation(self): + self.assertEqual(pronounce_number(0, scientific=True), "sıfır") + self.assertEqual(pronounce_number(33, scientific=True), + "üç nöqtə üç vurulsun on üstü bir") + self.assertEqual(pronounce_number(299792458, scientific=True), + "iki nöqtə doqquz doqquz vurulsun on üstü səkkiz") + self.assertEqual(pronounce_number(299792458, places=6, + scientific=True), + "iki nöqtə doqquz doqquz yeddi doqquz iki beş vurulsun " + "on üstü səkkiz") + self.assertEqual(pronounce_number(1.672e-27, places=3, + scientific=True), + "bir nöqtə altı yeddi iki vurulsun on üstü " + "mənfi iyirmi yeddi") + + def test_auto_scientific_notation(self): + self.assertEqual( + pronounce_number(1.1e-150), "bir nöqtə bir vurulsun " + "on üstü mənfi yüz əlli") + + def test_large_numbers(self): + self.assertEqual( + pronounce_number(299792458, short_scale=True), + "iki yüz doxsan doqquz milyon, yeddi yüz " + "doxsan iki min, dörd yüz əlli səkkiz") + self.assertEqual( + 
pronounce_number(299792458, short_scale=False), + "iki yüz doxsan doqquz milyon, yeddi yüz " + "doxsan iki min, dörd yüz əlli səkkiz") + self.assertEqual( + pronounce_number(100034000000299792458, short_scale=True), + "yüz kvintilyon, otuz dörd kvadrilyon, " + "iki yüz doxsan doqquz milyon, yeddi yüz " + "doxsan iki min, dörd yüz əlli səkkiz") + self.assertEqual( + pronounce_number(100034000000299792458, short_scale=False), + "yüz trilyon, otuz dörd min milyard, " + "iki yüz doxsan doqquz milyon, yeddi yüz " + "doxsan iki min, dörd yüz əlli səkkiz") + self.assertEqual( + pronounce_number(10000000000, short_scale=True), + "on milyard") + self.assertEqual( + pronounce_number(1000000000000, short_scale=True), + "bir trilyon") + self.assertEqual( + pronounce_number(1000001, short_scale=True), + "bir milyon, bir") + self.assertEqual(pronounce_number(95505896639631893), + "doxsan beş kvadrilyon, beş yüz beş trilyon, " + "səkkiz yüz doxsan altı milyard, " + "altı yüz otuz doqquz milyon, " + "altı yüz otuz bir min, səkkiz yüz doxsan üç") + self.assertEqual(pronounce_number(95505896639631893, + short_scale=False), + "doxsan beş min beş yüz beş milyard, " + "səkkiz yüz doxsan altı min altı yüz " + "otuz doqquz milyon, altı yüz otuz bir min, " + "səkkiz yüz doxsan üç") + self.assertEqual(pronounce_number(10e32, places=1), + "bir dekilyon") + + # infinity + self.assertEqual( + pronounce_number(sys.float_info.max * 2), "sonsuzluq") + self.assertEqual( + pronounce_number(float("inf")), + "sonsuzluq") + self.assertEqual( + pronounce_number(float("-inf")), + "mənfi sonsuzluq") + + def test_ordinals(self): + self.assertEqual(pronounce_number(1, ordinals=True), "birinci") + self.assertEqual(pronounce_number(10, ordinals=True), "onuncu") + self.assertEqual(pronounce_number(15, ordinals=True), "on beşinci") + self.assertEqual(pronounce_number(20, ordinals=True), "iyirminci") + self.assertEqual(pronounce_number(27, ordinals=True), "iyirmi yeddinci") + 
self.assertEqual(pronounce_number(30, ordinals=True), "otuzuncu") + self.assertEqual(pronounce_number(33, ordinals=True), "otuz üçüncü") + self.assertEqual(pronounce_number(100, ordinals=True), "yüzüncü") + self.assertEqual(pronounce_number(1000, ordinals=True), "mininci") + self.assertEqual(pronounce_number(10000, ordinals=True), + "on mininci") + self.assertEqual(pronounce_number(18691, ordinals=True), + "on səkkiz min, altı yüz doxsan birinci") + self.assertEqual(pronounce_number(1567, ordinals=True), + "min, beş yüz altmış yeddinci") + self.assertEqual(pronounce_number(1.672e-27, places=3, + scientific=True, ordinals=True), + "bir nöqtə altı yeddi iki vurulsun on üstü mənfi " + "iyirmi yeddinci") + self.assertEqual(pronounce_number(18e6, ordinals=True), + "on səkkiz milyonuncu") + self.assertEqual(pronounce_number(18e12, ordinals=True, + short_scale=False), + "on səkkiz milyardıncı") + self.assertEqual(pronounce_number(18e12, ordinals=True), + "on səkkiz trilyonuncu") + self.assertEqual(pronounce_number(18e18, ordinals=True, + short_scale=False), "on səkkiz " + "trilyonuncu") + + +class TestNiceDateFormat(unittest.TestCase): + @classmethod + def setUpClass(cls): + # Read date_time_test.json files for test data + cls.test_config = {} + p = Path(date_time_format.config_path) + for sub_dir in [x for x in p.iterdir() if x.is_dir()]: + if (sub_dir / 'date_time_test.json').exists(): + print("Getting test for " + + str(sub_dir / 'date_time_test.json')) + with (sub_dir / 'date_time_test.json').open() as f: + cls.test_config[sub_dir.parts[-1]] = json.loads(f.read()) + + def test_convert_times(self): + dt = datetime.datetime(2017, 1, 31, + 13, 22, 3, tzinfo=default_timezone()) + + # Verify defaults haven't changed + self.assertEqual(nice_time(dt), + nice_time(dt, "az-az", True, False, False)) + + self.assertEqual(nice_time(dt), + "ikiyə iyirmi iki dəqiqə işləyib") + + self.assertEqual(nice_time(dt, use_ampm=True), + "gündüz ikiyə iyirmi iki dəqiqə işləyib") + 
self.assertEqual(nice_time(dt, speech=False), + "1:22") + self.assertEqual(nice_time(dt, speech=False, use_ampm=True), + "gündüz 1:22") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "13:22") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True, + use_ampm=True), + "13:22") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), + "on üç iyirmi iki") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "on üç iyirmi iki") + + dt = datetime.datetime(2017, 1, 31, + 13, 0, 3, tzinfo=default_timezone()) + self.assertEqual(nice_time(dt), + "bir tamamdır") + self.assertEqual(nice_time(dt, use_ampm=True), + "gündüz bir tamamdır") + self.assertEqual(nice_time(dt, speech=False), + "1:00") + self.assertEqual(nice_time(dt, speech=False, use_ampm=True), + "gündüz 1:00") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "13:00") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True, + use_ampm=True), + "13:00") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), + "on üç sıfır sıfır") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "on üç sıfır sıfır") + + dt = datetime.datetime(2017, 1, 31, + 13, 2, 3, tzinfo=default_timezone()) + self.assertEqual(nice_time(dt), + "ikiyə iki dəqiqə işləyib") + self.assertEqual(nice_time(dt, use_ampm=True), + "gündüz ikiyə iki dəqiqə işləyib") + self.assertEqual(nice_time(dt, speech=False), + "1:02") + self.assertEqual(nice_time(dt, speech=False, use_ampm=True), + "gündüz 1:02") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "13:02") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True, + use_ampm=True), + "13:02") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), + "on üç sıfır iki") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "on üç sıfır iki") + + dt = datetime.datetime(2017, 1, 31, + 0, 2, 3, tzinfo=default_timezone()) + self.assertEqual(nice_time(dt), + 
"birə iki dəqiqə işləyib") + self.assertEqual(nice_time(dt, use_ampm=True), + "gecə birə iki dəqiqə işləyib") + self.assertEqual(nice_time(dt, speech=False), + "12:02") + self.assertEqual(nice_time(dt, speech=False, use_ampm=True), + "gecə 12:02") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "00:02") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True, + use_ampm=True), + "00:02") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), + "sıfır sıfır sıfır iki") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "sıfır sıfır sıfır iki") + + dt = datetime.datetime(2018, 2, 8, + 1, 2, 33, tzinfo=default_timezone()) + self.assertEqual(nice_time(dt), + "ikiyə iki dəqiqə işləyib") + self.assertEqual(nice_time(dt, use_ampm=True), + "gecə ikiyə iki dəqiqə işləyib") + self.assertEqual(nice_time(dt, speech=False), + "1:02") + self.assertEqual(nice_time(dt, speech=False, use_ampm=True), + "gecə 1:02") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True), + "01:02") + self.assertEqual(nice_time(dt, speech=False, use_24hour=True, + use_ampm=True), + "01:02") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), + "sıfır bir sıfır iki") + self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), + "sıfır bir sıfır iki") + + dt = datetime.datetime(2017, 1, 31, + 12, 15, 9, tzinfo=default_timezone()) + self.assertEqual(nice_time(dt), + "birə on beş dəqiqə işləyib") + self.assertEqual(nice_time(dt, use_ampm=True), + "gündüz birə on beş dəqiqə işləyib") + + dt = datetime.datetime(2017, 1, 31, + 5, 30, 00, tzinfo=default_timezone()) + self.assertEqual(nice_time(dt, use_ampm=True), + "gecə altının yarısı") + + dt = datetime.datetime(2017, 1, 31, + 1, 45, 00, tzinfo=default_timezone()) + self.assertEqual(nice_time(dt), + "ikiyə on beş dəqiqə qalıb") + + def test_nice_date(self): + lang = "az-az" + i = 1 + while (self.test_config[lang].get('test_nice_date') and + 
self.test_config[lang]['test_nice_date'].get(str(i))): + p = self.test_config[lang]['test_nice_date'][str(i)] + dp = ast.literal_eval(p['datetime_param']) + np = ast.literal_eval(p['now']) + dt = datetime.datetime( + dp[0], dp[1], dp[2], dp[3], dp[4], dp[5]) + now = None if not np else datetime.datetime( + np[0], np[1], np[2], np[3], np[4], np[5]) + print('Testing for ' + lang + ' that ' + str(dt) + + ' is date ' + p['assertEqual']) + self.assertEqual(p['assertEqual'], + nice_date(dt, lang=lang, now=now)) + i = i + 1 + + for dt in (datetime.datetime(2017, 12, 30, 0, 2, 3) + + datetime.timedelta(n) for n in range(368)): + self.assertTrue(len(nice_date(dt, lang=lang)) > 0) + + def test_nice_date_time(self): + # TODO: migrate these tests (in res files) to respect the new + # language loading features. Right now, some of them break if + # their languages are not default. + lang = "az-az" + set_default_lang(lang) + i = 1 + while (self.test_config[lang].get('test_nice_date_time') and + self.test_config[lang]['test_nice_date_time'].get(str(i))): + p = self.test_config[lang]['test_nice_date_time'][str(i)] + dp = ast.literal_eval(p['datetime_param']) + np = ast.literal_eval(p['now']) + dt = datetime.datetime( + dp[0], dp[1], dp[2], dp[3], dp[4], dp[5], + tzinfo=default_timezone()) + now = None if not np else datetime.datetime( + np[0], np[1], np[2], np[3], np[4], np[5], + tzinfo=default_timezone()) + print('Testing for ' + lang + ' that ' + str(dt) + + ' is date time ' + p['assertEqual']) + self.assertEqual( + p['assertEqual'], + nice_date_time( + dt, lang=lang, now=now, + use_24hour=ast.literal_eval(p['use_24hour']), + use_ampm=ast.literal_eval(p['use_ampm']))) + i = i + 1 + + def test_nice_year(self): + lang = "az-az" + i = 1 + while (self.test_config[lang].get('test_nice_year') and + self.test_config[lang]['test_nice_year'].get(str(i))): + p = self.test_config[lang]['test_nice_year'][str(i)] + dp = ast.literal_eval(p['datetime_param']) + dt = datetime.datetime( + dp[0], 
dp[1], dp[2], dp[3], dp[4], dp[5]) + print('Testing for ' + lang + ' that ' + str(dt) + + ' is year ' + p['assertEqual']) + self.assertEqual(p['assertEqual'], nice_year( + dt, lang=lang, bc=ast.literal_eval(p['bc']))) + i = i + 1 + + # Test all years from 1 to 9998 for az, + # that some output is produced + print("Test all years in " + lang) + for i in range(1, 9999): + dt = datetime.datetime(i, 1, 31, 13, 2, 3, tzinfo=default_timezone()) + self.assertTrue(len(nice_year(dt, lang=lang)) > 0) + + def test_nice_duration(self): + self.assertEqual(nice_duration(1), "bir saniyə") + self.assertEqual(nice_duration(3), "üç saniyə") + self.assertEqual(nice_duration(1, speech=False), "0:01") + self.assertEqual(nice_duration(61), "bir dəqiqə bir saniyə") + self.assertEqual(nice_duration(61, speech=False), "1:01") + self.assertEqual(nice_duration(5000), + "bir saat iyirmi üç dəqiqə iyirmi saniyə") + self.assertEqual(nice_duration(5000, speech=False), "1:23:20") + self.assertEqual(nice_duration(50000), + "on üç saat əlli üç dəqiqə iyirmi saniyə") + self.assertEqual(nice_duration(50000, speech=False), "13:53:20") + self.assertEqual(nice_duration(500000), + "beş gün on səkkiz saat əlli üç dəqiqə iyirmi saniyə") # nopep8 + self.assertEqual(nice_duration(500000, speech=False), "5g 18:53:20") + self.assertEqual(nice_duration(datetime.timedelta(seconds=500000), + speech=False), + "5g 18:53:20") + + def test_join(self): + self.assertEqual(join_list(None, "və"), "") + self.assertEqual(join_list([], "və"), "") + + self.assertEqual(join_list(["a"], "və"), "a") + self.assertEqual(join_list(["a", "b"], "və"), "a və b") + self.assertEqual(join_list(["a", "b"], "ya"), "a ya b") + + self.assertEqual(join_list(["a", "b", "c"], "və"), "a, b və c") + self.assertEqual(join_list(["a", "b", "c"], "ya"), "a, b ya c") + self.assertEqual(join_list(["a", "b", "c"], "ya", ";"), "a; b ya c") + self.assertEqual(join_list(["a", "b", "c", "d"], "ya"), "a, b, c ya d") + + self.assertEqual(join_list([1, "b", 
3, "d"], "ya"), "1, b, 3 ya d") + + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_parse_az.py b/test/test_parse_az.py new file mode 100644 index 00000000..32172ba9 --- /dev/null +++ b/test/test_parse_az.py @@ -0,0 +1,429 @@ +# +# Copyright 2021 Mycroft AI Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import unittest +from datetime import datetime, timedelta +from dateutil import tz + +from lingua_franca import load_language, unload_language, set_default_lang +from lingua_franca.internal import FunctionNotLocalizedError +from lingua_franca.time import default_timezone +from lingua_franca.parse import extract_datetime +from lingua_franca.parse import extract_duration +from lingua_franca.parse import extract_number, extract_numbers +from lingua_franca.parse import fuzzy_match +from lingua_franca.parse import get_gender +from lingua_franca.parse import match_one +from lingua_franca.parse import normalize + + +def setUpModule(): + # TODO spin off English tests + load_language('az') + set_default_lang('az') + + +def tearDownModule(): + unload_language('az') + + +class TestFuzzyMatch(unittest.TestCase): + def test_matches(self): + self.assertTrue(fuzzy_match("sən və mən", "sən və mən") >= 1.0) + self.assertTrue(fuzzy_match("sən və mən", "sən") < 0.5) + self.assertTrue(fuzzy_match("sən", "sən") > 0.5) + self.assertTrue(fuzzy_match("sən və mən", "sən") == + fuzzy_match("sən", "sən və mən")) + self.assertTrue(fuzzy_match("sən və mən", "he ya 
onlar") < 0.2) + + def test_match_one(self): + # test list of choices + choices = ['frank', 'kate', 'harry', 'henry'] + self.assertEqual(match_one('frank', choices)[0], 'frank') + self.assertEqual(match_one('fran', choices)[0], 'frank') + self.assertEqual(match_one('enry', choices)[0], 'henry') + self.assertEqual(match_one('katt', choices)[0], 'kate') + # test dictionary of choices + choices = {'frank': 1, 'kate': 2, 'harry': 3, 'henry': 4} + self.assertEqual(match_one('frank', choices)[0], 1) + self.assertEqual(match_one('enry', choices)[0], 4) + + +class TestNormalize(unittest.TestCase): + def test_extract_number(self): + + self.assertEqual(extract_number("bu 2 sınaqdır"), 2) + self.assertEqual(extract_number("bu 4 nömrəli sınaqdır"), 4) + self.assertEqual(extract_number("üç fıncan"), 3) + self.assertEqual(extract_number("1/3 fıncan"), 1.0 / 3.0) + self.assertEqual(extract_number("dörddəbir fıncan"), 0.25) + self.assertEqual(extract_number("1/4 fıncan"), 0.25) + self.assertEqual(extract_number("dörddə bir fincan"), 0.25) + self.assertEqual(extract_number("2/3 fincan"), 2.0 / 3.0) + self.assertEqual(extract_number("3/4 fincan"), 3.0 / 4.0) + self.assertEqual(extract_number("1 və 3/4 fincan"), 1.75) + self.assertEqual(extract_number("1 yarım fincan"), 1.5) + self.assertEqual(extract_number("bir yarım fincan"), 1.5) + self.assertEqual(extract_number("dörddə üç fincan"), 3.0 / 4.0) + self.assertEqual(extract_number("iyirmi iki"), 22) + + # TODO 'İ'.lower() returns 2 chars, gets fixed in python3.10(unicode v14) + # self.assertEqual(extract_number( + # "İyirmi iki böyük hərflə iyirmi"), 22) + # self.assertEqual(extract_number( + # "iyirmi İki böyük hərflə iki"), 22) + # self.assertEqual(extract_number( + # "İyirmi İki böyük hərflə ikisidə"), 22) + + self.assertEqual(extract_number("iki yüz"), 200) + self.assertEqual(extract_number("doqquz min"), 9000) + self.assertEqual(extract_number("altı yüz altmış altı"), 666) + self.assertEqual(extract_number("iki milyon"), 
2000000) + self.assertEqual(extract_number("iki milyon beş yüz min " + "tons dəmir"), 2500000) + self.assertEqual(extract_number("altı trilyon"), 6000000000000) + self.assertEqual(extract_number("altı trilyon", short_scale=False), + 6e+18) + self.assertEqual(extract_number("bir nöqtə beş"), 1.5) + self.assertEqual(extract_number("üç nöqtə on dörd"), 3.14) + self.assertEqual(extract_number("sıfır nöqtə iki"), 0.2) + self.assertEqual(extract_number("bir milyard yaş daha böyükdür"), + 1000000000.0) + self.assertEqual(extract_number("bir milyard yaş daha böyükdür", + short_scale=False), + 1000000000000.0) + self.assertEqual(extract_number("yüz min"), 100000) + self.assertEqual(extract_number("minus 2"), -2) + self.assertEqual(extract_number("mənfi yetmiş"), -70) + self.assertEqual(extract_number("min milyon"), 1000000000) + + # Verify non-power multiples of ten no longer discard + # adjacent multipliers + self.assertEqual(extract_number("iyirmi min"), 20000) + self.assertEqual(extract_number("əlli milyon"), 50000000) + + # Verify smaller powers of ten no longer cause miscalculation of larger + # powers of ten (see MycroftAI#86) + self.assertEqual(extract_number("iyirmi milyard üç yüz milyon \ + doqquz yüz əlli min altı yüz \ + yetmiş beş nöqtə səkkiz"), + 20300950675.8) + self.assertEqual(extract_number("doqquz yüz doxsan doqquz milyon doqquz \ + yüz doxsan doqquz min doqquz \ + yüz doxsan doqquz nöqtə doqquz"), + 999999999.9) + + # TODO why does "trilyon" result in xxxx.0? 
+ self.assertEqual(extract_number("səkkiz yüz trilyon iki yüz \ + əlli yeddi"), 800000000000257.0) + + # TODO handle this case + # self.assertEqual( + # extract_number("altı altı altı"), 666) + + self.assertTrue(extract_number("Tennisçi sürətlidir") is False) + self.assertTrue(extract_number("parçalamaq") is False) + + self.assertTrue(extract_number("parçalamaq sıfır") is not False) + self.assertEqual(extract_number("parçalamaq sıfır"), 0) + + self.assertTrue(extract_number("grobo 0") is not False) + self.assertEqual(extract_number("grobo 0"), 0) + + self.assertEqual(extract_number("tamamilə 100%"), 100) + + def test_extract_duration_az(self): + self.assertEqual(extract_duration("10 saniyə"), + (timedelta(seconds=10.0), "")) + self.assertEqual(extract_duration("5 dəqiqə"), + (timedelta(minutes=5), "")) + self.assertEqual(extract_duration("2 saat"), + (timedelta(hours=2), "")) + self.assertEqual(extract_duration("3 gün"), + (timedelta(days=3), "")) + self.assertEqual(extract_duration("25 həftə"), + (timedelta(weeks=25), "")) + self.assertEqual(extract_duration("yeddi saat"), + (timedelta(hours=7), "")) + self.assertEqual(extract_duration("7.5 saniyə"), + (timedelta(seconds=7.5), "")) + self.assertEqual(extract_duration("səkkiz yarım gün otuz" + " doqquz saniyə"), + (timedelta(days=8.5, seconds=39), "")) + self.assertEqual(extract_duration("üç həftə, dörd yüz doxsan yeddi gün, " + "üç yüz 91.6 saniyə sonra məni oyandır"), + (timedelta(weeks=3, days=497, seconds=391.6), + "sonra məni oyandır")) + self.assertEqual(extract_duration("10-saniyə"), + (timedelta(seconds=10.0), "")) + self.assertEqual(extract_duration("5-dəqiqə"), + (timedelta(minutes=5), "")) + + def test_extract_duration_case_az(self): + self.assertEqual(extract_duration("taymeri 30 dəqiqəyə qur"), + (timedelta(minutes=30), "taymeri qur")) + self.assertEqual(extract_duration("Film bir saat, əlli yeddi" + " yarım dəqiqə davam edir"), + (timedelta(hours=1, minutes=57.5), + "Film davam edir")) + 
self.assertEqual(extract_duration("Gün batana dörd dəqiqə yarım qaldı"), + (timedelta(minutes=4.5), "Gün batana qaldı")) + self.assertEqual(extract_duration("Saatı on doqquz dəqiqə keçir"), + (timedelta(minutes=19), "Saatı keçir")) + + def test_extractdatetime_fractions_az(self): + def extractWithFormat(text): + date = datetime(2017, 6, 27, 13, 4, tzinfo=default_timezone()) # Tue June 27, 2017 @ 1:04pm + [extractedDate, leftover] = extract_datetime(text, date) + extractedDate = extractedDate.strftime("%Y-%m-%d %H:%M:%S") + return [extractedDate, leftover] + + def testExtract(text, expected_date, expected_leftover): + res = extractWithFormat(normalize(text)) + self.assertEqual(res[0], expected_date, "for=" + text) + self.assertEqual(res[1], expected_leftover, "for=" + text) + + testExtract("yarım saat sonra pusu qur", + "2017-06-27 13:34:00", "pusu qur") + testExtract("yarım saat sora anama zəng etməyi xatırlat", + "2017-06-27 13:34:00", "anama zəng etməyi xatırlat") + + def test_extractdatetime_az(self): + def extractWithFormat(text): + date = datetime(2017, 6, 27, 13, 4, tzinfo=default_timezone()) # Tue June 27, 2017 @ 1:04pm + [extractedDate, leftover] = extract_datetime(text, date) + extractedDate = extractedDate.strftime("%Y-%m-%d %H:%M:%S") + return [extractedDate, leftover] + + def testExtract(text, expected_date, expected_leftover): + res = extractWithFormat(normalize(text)) + print(res) + self.assertEqual(res[0], expected_date, "for=" + text) + self.assertEqual(res[1], expected_leftover, "for=" + text) + + testExtract("indi vaxtıdır", + "2017-06-27 13:04:00", "vaxtıdır") + testExtract("bir saniyəyə", + "2017-06-27 13:04:01", "") + testExtract("bir dəqiqəyə", + "2017-06-27 13:05:00", "") + testExtract("gələn onillikə", + "2027-06-27 00:00:00", "") + testExtract("gələn yüzillikə", + "2117-06-27 00:00:00", "") + testExtract("gələn minillikə", + "3017-06-27 00:00:00", "") + testExtract("5 onillikə", + "2067-06-27 00:00:00", "") + testExtract("2 yüzillikə", + 
"2217-06-27 00:00:00", "") + testExtract("bir saata", + "2017-06-27 14:04:00", "") + testExtract("bir saat ərzində istəyirəm", + "2017-06-27 14:04:00", "istəyirəm") + testExtract("1 saniyəyə", + "2017-06-27 13:04:01", "") + testExtract("2 saniyəyə", + "2017-06-27 13:04:02", "") + testExtract("Pusunu 1 dəqiqə sonraya qur", + "2017-06-27 13:05:00", "pusunu qur") + testExtract("5 gün sonraya pusu qur", + "2017-07-02 00:00:00", "pusu qur") + testExtract("birigün", + "2017-06-29 00:00:00", "") + testExtract("birigün hava necə olacaq?", + "2017-06-29 00:00:00", "hava necə olacaq") + testExtract("Axşam 10:45 də yadıma sal", + "2017-06-27 22:45:00", "yadıma sal") + testExtract("cümə səhər hava necədir", + "2017-06-30 08:00:00", "hava necədir") + testExtract("sabah hava necedir", + "2017-06-28 00:00:00", "hava necedir") + testExtract("bu günortadan sonra hava necədir", + "2017-06-27 15:00:00", "hava necədir") + testExtract("bu axşam hava necədir", + "2017-06-27 19:00:00", "hava necədir") + testExtract("bu səhər hava neceydi", + "2017-06-27 08:00:00", "hava neceydi") + testExtract("8 həftə 2 gün sonra anama zəng etməyi xatırlat", + "2017-08-24 00:00:00", "anama zəng etməyi xatırlat") + testExtract("3 avqustda anama zəng etməyi xatırlat", + "2017-08-03 00:00:00", "anama zəng etməyi xatırlat") + testExtract("sabah 7 də anama zəng etməyi xatırlat", + "2017-06-28 07:00:00", "anama zəng etməyi xatırlat") + testExtract("sabah axşam saat 10 da anama zəng etməyi xatırlat", + "2017-06-28 22:00:00", "anama zəng etməyi xatırlat") + testExtract("səhər 7 də anama zəng etməyi xatırlat ", + "2017-06-28 07:00:00", "anama zəng etməyi xatırlat") + testExtract("bir saatdan sonra anama zəng etməyi xatırlat", + "2017-06-27 14:04:00", "anama zəng etməyi xatırlat") + testExtract("anama 17 30 da zəng etməyi xatırlat", + "2017-06-27 17:30:00", "anama zəng etməyi xatırlat") + testExtract("anama 06 30 da zəng etməyi xatırlat", + "2017-06-28 06:30:00", "anama zəng etməyi xatırlat") + testExtract("06 30 
da anama zəng etməyi xatırlat", + "2017-06-28 06:30:00", "anama zəng etməyi xatırlat") + testExtract("Cümə axşamı səhər 7:00 də anama zəng etməyi xatırlat", + "2017-06-29 07:00:00", "anama zəng etməyi xatırlat") + testExtract("çərşənbə axşam 8 də anama zəng etməyi xatırlat", + "2017-06-28 20:00:00", "anama zəng etməyi xatırlat") + testExtract("iki saatdan sonra anama zəng etməyi xatırlat", + "2017-06-27 15:04:00", "anama zəng etməyi xatırlat") + testExtract("2 saatdan sonra anama zəng etməyi xatırlat", + "2017-06-27 15:04:00", "anama zəng etməyi xatırlat") + testExtract("15 dəqiqə sonra anama zəng etməyi xatırlat", + "2017-06-27 13:19:00", "anama zəng etməyi xatırlat") + testExtract("on beş dəqiqədən sonra anama zəng etməyi xatırlat", + "2017-06-27 13:19:00", "anama zəng etməyi xatırlat") + testExtract("bu şənbə günündən 2 gün sonra səhər 10 da anama zəng etməyi xatırlat", + "2017-07-03 10:00:00", "anama zəng etməyi xatırlat") + testExtract("Cümə günündən 2 gün sonra Rick Astley musiqisini çal", + "2017-07-02 00:00:00", "rick astley musiqisini çal") + testExtract("Cümə axşamı günü saat 15:45 də hücuma başlayın", + "2017-06-29 15:45:00", "hücuma başlayın") + testExtract("Bazar ertəsi günü çörəkxanadan çörək sifariş vər", + "2017-07-03 00:00:00", "çörəkxanadan çörək sifariş vər") + testExtract("Bu gündən 5 il sonra Happy Birthday musiqisini çal", + "2022-06-27 00:00:00", "happy birthday musiqisini çal") + testExtract("gələn cümə səhər hava necədir", + "2017-06-30 08:00:00", "hava necədir") + testExtract("gələn cümə axşam hava necədir", + "2017-06-30 19:00:00", "hava necədir") + testExtract("gələn cümə günortadan sonra hava necədir ", + "2017-06-30 15:00:00", "hava necədir") + testExtract("iyulun 4 də atəşfəşanlıq al", + "2017-07-04 00:00:00", "atəşfəşanlıq al") + testExtract("gələn cümə günündən 2 həftə sonra hava necədir", + "2017-07-14 00:00:00", "hava necədir") + testExtract("çərşənbə günü saat 07 00 də hava necədir", + "2017-06-28 07:00:00", "hava necədir") + 
testExtract("Gələn cümə axşamı saat 12:45 də görüş təyin ed", + "2017-07-06 12:45:00", "görüş təyin ed") + testExtract("Bu cümə axşamı hava necədir?", + "2017-06-29 00:00:00", "hava necədir") + testExtract("Cümə axşamı 03 45 də hücuma başlayın", + "2017-06-29 03:45:00", "hücuma başlayın") + testExtract("Cümə axşamı axşam 8 də hücuma başlayın", + "2017-06-29 20:00:00", "hücuma başlayın") + testExtract("Cümə axşamı günortada hücuma başlayın", + "2017-06-29 12:00:00", "hücuma başlayın") + testExtract("Cümə axşamı gecə yarısında hücuma başlayın", + "2017-06-29 00:00:00", "hücuma başlayın") + testExtract("Cümə axşamı saat 05:00 da hücuma başlayın", + "2017-06-29 05:00:00", "hücuma başlayın") + testExtract("4 il sonra oyanmağı xatırlat", + "2021-06-27 00:00:00", "oyanmağı xatırlat") + testExtract("4 il 4 gündə oyanmağı xatırlat", + "2021-07-01 00:00:00", "oyanmağı xatırlat") + testExtract("dekabr 3", + "2017-12-03 00:00:00", "") + testExtract("bu axşam saat 8:00 da görüşək", + "2017-06-27 20:00:00", "görüşək") + testExtract("axşam 5 də görüşək ", + "2017-06-27 17:00:00", "görüşək") + testExtract("səhər 8 də görüşək", + "2017-06-28 08:00:00", "görüşək") + testExtract("mənə səhər 8 də oyanmağı xatırlat", + "2017-06-28 08:00:00", "mənə oyanmağı xatırlat") + testExtract("çərşənbə axşamı hava necədir", + "2017-06-27 00:00:00", "hava necədir") + testExtract("bazar ertəsi hava necədir", + "2017-07-03 00:00:00", "hava necədir") + testExtract("bu çərşənbə günü hava necədir", + "2017-06-28 00:00:00", "hava necədir") + testExtract("keçən bazar ertəsi hava necə idi", + "2017-06-26 00:00:00", "hava necə idi") + testExtract("5 iyun 2017 ci il axşamı anama zəng etməyi xatırlat", + "2017-06-05 19:00:00", "anama zəng etməyi xatırlat") + testExtract("dünən hansı gün idi", + "2017-06-26 00:00:00", "hansı gün idi") + testExtract("dünən 6 da şam yedim", + "2017-06-26 06:00:00", "şam yedim") + testExtract("dünən səhər 6 da şam yedim", + "2017-06-26 06:00:00", "şam yedim") + testExtract("dünən 
axşam 6 da şam yedim", + "2017-06-26 18:00:00", "şam yedim") + + def test_extract_relativedatetime_az(self): + def extractWithFormat(text): + date = datetime(2017, 6, 27, 10, 1, 2, tzinfo=default_timezone()) + [extractedDate, leftover] = extract_datetime(text, date) + extractedDate = extractedDate.strftime("%Y-%m-%d %H:%M:%S") + return [extractedDate, leftover] + + def testExtract(text, expected_date, expected_leftover): + res = extractWithFormat(normalize(text)) + self.assertEqual(res[0], expected_date, "for=" + text) + self.assertEqual(res[1], expected_leftover, "for=" + text) + + testExtract("5 dəqiqəyə görüşək", + "2017-06-27 10:06:02", "görüşək") + testExtract("5 saniyədə görüşək", + "2017-06-27 10:01:07", "görüşək") + testExtract("1 saatda görüşək", + "2017-06-27 11:01:02", "görüşək") + testExtract("2 saata görüşək", + "2017-06-27 12:01:02", "görüşək") + + def test_spaces(self): + self.assertEqual(normalize(" bu sınaqdır"), + "bu sınaqdır") + self.assertEqual(normalize(" bu bir sınaqdır"), + "bu 1 sınaqdır") + + def test_numbers(self): + self.assertEqual(normalize("bu bir iki üç sınaqdır"), + "bu 1 2 3 sınaqdır") + + def test_multiple_numbers(self): + self.assertEqual(extract_numbers("bu bir iki üç sınaqdır"), + [1.0, 2.0, 3.0]) + self.assertEqual(extract_numbers("bu on bir on iki on üç sınaqdır"), + [11.0, 12.0, 13.0]) + self.assertEqual(extract_numbers("bu bir iyirmi bir sınaqdır"), + [1.0, 21.0]) + self.assertEqual(extract_numbers("1 it, yeddi donuz, mənim dostum var " + "3 dəfə 5 macarena"), + [1, 7, 3, 5]) + self.assertEqual(extract_numbers("iki pivə iki ayıa"), + [2.0, 2.0]) + self.assertEqual(extract_numbers("iyirmi 20 iyirmi"), + [20, 20, 20]) + self.assertEqual(extract_numbers("iyirmi 20 22"), + [20.0, 20.0, 22.0]) + self.assertEqual(extract_numbers("iyirmi iyirmi iki iyirmi"), + [20, 22, 20]) + self.assertEqual(extract_numbers("iyirmi 2"), + [22.0]) + self.assertEqual(extract_numbers("iyirmi 20 iyirmi 2"), + [20, 20, 22]) + 
self.assertEqual(extract_numbers("üçdəbir bir"), + [1 / 3, 1]) + self.assertEqual(extract_numbers("altı trilyon", short_scale=True), + [6e12]) + self.assertEqual(extract_numbers("altı trilyon", short_scale=False), + [6e18]) + self.assertEqual(extract_numbers("iki donuz və altı trilyon bakteriya", + short_scale=True), [2, 6e12]) + self.assertEqual(extract_numbers("iki donuz altı trilyon bakteriya", + short_scale=False), [2, 6e18]) + self.assertEqual(extract_numbers("otuz ikinci ya birinci", + ordinals=True), [32, 1]) + self.assertEqual(extract_numbers("bu yeddi səkkiz doqquz yarım sınaqdır"), + [7.0, 8.0, 9.5]) + + +if __name__ == "__main__": + unittest.main()