From 879e79cebc1241a39c8b5d7b5b0bf8be6b61485d Mon Sep 17 00:00:00 2001 From: Siavash Mollayi Date: Mon, 14 Aug 2023 14:38:11 +0330 Subject: [PATCH] improve farsi --- lingua_franca/lang/common_data_fa.py | 194 ++- lingua_franca/lang/format_fa.py | 404 +++--- lingua_franca/lang/parse_fa.py | 1263 +++++++++++++---- lingua_franca/res/text/fa-ir/date_time.json | 108 +- .../res/text/fa-ir/date_time_test.json | 36 +- lingua_franca/res/text/fa-ir/normalize.json | 164 +-- test/test_format_fa.py | 286 ++-- test/test_parse_fa.py | 548 +++++-- 8 files changed, 2077 insertions(+), 926 deletions(-) diff --git a/lingua_franca/lang/common_data_fa.py b/lingua_franca/lang/common_data_fa.py index f44a2198..7c3db4e9 100644 --- a/lingua_franca/lang/common_data_fa.py +++ b/lingua_franca/lang/common_data_fa.py @@ -18,8 +18,50 @@ _FUNCTION_NOT_IMPLEMENTED_WARNING = "تابع خواسته شده در زبان فارسی پیاده سازی نشده است." - +_NUM_STRING_FA = { + 0: 'صفر', + 1: 'یک', + 2: 'دو', + 3: 'سه', + 4: 'چهار', + 5: 'پنج', + 6: 'شش', + 7: 'هفت', + 8: 'هشت', + 9: 'نه', + 10: 'ده', + 11: 'یازده', + 12: 'دوازده', + 13: 'سیزده', + 14: 'چهارده', + 15: 'پانزده', + 16: 'شانزده', + 17: 'هفده', + 18: 'هجده', + 19: 'نوزده' +} +_SUMS_FARSI = { + 20: 'بیست', + 30: 'سی', + 40: 'چهل', + 50: 'پنجاه', + 60: 'شصت', + 70: 'هفتاد', + 80: 'هشتاد', + 90: 'نود', + 100: 'صد', + 200: 'دویست', + 300: 'سیصد', + 400: 'چهارصد', + 500: 'پانصد', + 600: 'ششصد', + 700: 'هفصد', + 800: 'هشصد', + 900: 'نهصد', + 1000: 'هزار' +} _FRACTION_STRING_FA = { + 1: 'یکم', 2: 'دوم', 3: 'سوم', 4: 'چهارم', @@ -33,83 +75,91 @@ 12: 'دوازدهم', 13: 'سیزدهم', 14: 'چهاردهم', - 15: 'پونزدهم', + 15: 'پانزدهم', 16: 'شونزدهم', - 17: 'هیفدهم', - 18: 'هیجدهم', + 17: 'هفدهم', + 18: 'هجدهم', 19: 'نوزدهم', 20: 'بیستم' } +_SCALE_FA = OrderedDict([ + (1000, 'هزار'), + (1000000, 'میلیون'), + (1e9, "میلیارد"), + (1e12, 'تریلیون'), + (1e15, "کوادریلیون"), + (1e18, "کوئینتیلیون"), + (1e21, "سکتلیون"), + (1e24, "سپتیلیون"), + (1e27, "اکتیلیون"), + (1e30, "ننیلیون"), + (1e33, "دسیلیون") +]) +_ORDINAL_BASE_FA = { + 1: 'یکم', + 2: 'دوم', + 3: 'سوم', + 4: 'چهارم', + 5: 'پنجم', + 6: 'ششم', + 7: 'هفتم', + 8: 'هشتم', + 9: 'نهم', + 10: 'دهم', + 11: 'یازدهم', + 12: 'دوازدهم', + 13: 'سیزدهم', + 14: 'چهاردهم', + 15: 'پانزدهم', + 16: 'شانزدهم', + 17: 'هفدهم', + 18: 'هجدهم', + 19: 'نوزدهم', + 20: 'بیستم', + 30: 'سیم', + 40: "چهلم", + 50: "پنجاهم", + 60: "شصتم", + 70: "هفتادم", + 80: "هشتادم", + 90: "نودم", + 1e2: "صدم", + 1e3: "هزارم" +} +_ORDINAL_FA = { + 1e6: "میلیونم", + 1e9: "میلیاردم", + 1e12: "تریلیونم", + 1e15: "کوادریلیونم", + 1e18: "کوئینتیلیونم", + 1e21: "سکتلیونم", + 1e24: "سپتیلیونم", + 1e27: "اکتیلیونم", + 1e30: "ننیلیونم", + 1e33: "دسیلیونم" +} +_ORDINAL_FA.update(_ORDINAL_BASE_FA) +_DECIMAL_MARKER_FA = {"و"} -_FARSI_ONES = [ - "", - "یک", - "دو", - "سه", - "چهار", - "پنج", - "شش", - "هفت", - "هشت", - "نه", - "ده", - "یازده", - "دوازده", - "سیزده", - "چهارده", - "پونزده", - "شونزده", - "هیفده", - "هیجده", - "نوزده", -] - -_FARSI_TENS = [ - "", - "ده", - "بیست", - "سی", - "چهل", - "پنجاه", - "شصت", - "هفتاد", - "هشتاد", - "نود", -] - -_FARSI_HUNDREDS = [ - "", - "صد", - "دویست", - "سیصد", - "چهارصد", - "پانصد", - "ششصد", - "هفتصد", - "هشتصد", - "نهصد", -] - -_FARSI_BIG = [ - '', - 'هزار', - 'میلیون', - "میلیارد", - 'تریلیون', - "تریلیارد", -] - - -_FORMAL_VARIANT = { - 'هفده': 'هیفده', - 'هجده': 'هیجده', - 'شانزده': 'شونزده', - 'پانزده': 'پونزده', +_DECIMAL_STRING_FA = { + 1: "دهم", + 2: "صدم", + 3: "هزارم", + 6: "میلیونم", + 9: "میلیاردم", } +_FARSI_SUMS = invert_dict(_SUMS_FARSI) +_NUM_STRING_FA.update(_SUMS_FARSI) +_STRING_NUM_FA = invert_dict(_NUM_STRING_FA) +_NEGATIVES_FA = {"منفی"} -_FARSI_FRAC = ["", "ده", "صد"] -_FARSI_FRAC_BIG = ["", "هزار", "میلیونی", "میلیاردی"] - -_FARSI_SEPERATOR = ' و ' \ No newline at end of file +_EXTRA_SPOKEN_NUM_FA = { + "نیم": 2 + } +_STRING_ORDINAL_FA = invert_dict(_ORDINAL_FA) +_STRING_ORDINAL_FA.update(invert_dict(_ORDINAL_BASE_FA)) +_STRING_SCALE_FA = invert_dict(_SCALE_FA) +_STRING_FRACTION_FA = invert_dict(_FRACTION_STRING_FA) +_STRING_DECIMAL_FA = invert_dict(_DECIMAL_STRING_FA) diff --git a/lingua_franca/lang/format_fa.py b/lingua_franca/lang/format_fa.py index 04a42757..13d48c47 100644 --- a/lingua_franca/lang/format_fa.py +++ b/lingua_franca/lang/format_fa.py @@ -14,54 +14,18 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import datetime from lingua_franca.lang.format_common import convert_to_mixed_fraction -from lingua_franca.lang.common_data_fa import \ - _FARSI_ONES, _FARSI_TENS, _FARSI_HUNDREDS, _FARSI_BIG, _FARSI_SEPERATOR, \ - _FARSI_FRAC, _FARSI_FRAC_BIG, _FRACTION_STRING_FA, _FORMAL_VARIANT -import math -from lingua_franca.internal import lookup_variant -from enum import IntEnum -from functools import wraps - -class NumberVariantFA(IntEnum): - CONVERSATIONAL = 0 - FORMAL = 1 - -lookup_number = lookup_variant({ - "default": NumberVariantFA.CONVERSATIONAL, - "conversational": NumberVariantFA.CONVERSATIONAL, - "formal": NumberVariantFA.FORMAL, -}) - -def _apply_number_variant(text, variant): - if variant == NumberVariantFA.FORMAL: - for key, value in _FORMAL_VARIANT.items(): - text = text.replace(value, key) - return text - -def _handle_number_variant(func): - - @wraps(func) - @lookup_variant({ - "default": NumberVariantFA.CONVERSATIONAL, - "conversational": NumberVariantFA.CONVERSATIONAL, - "formal": NumberVariantFA.FORMAL, - }) - def wrapper(*args, **kwargs): - result = func(*args, **kwargs) - if 'variant' in kwargs: - return _apply_number_variant(result, kwargs['variant']) - else: - return result - return wrapper +from lingua_franca.lang.common_data_fa import _NUM_STRING_FA, \ + _FRACTION_STRING_FA, _SCALE_FA, _ORDINAL_FA, _DECIMAL_STRING_FA + -@_handle_number_variant -def nice_number_fa(number, speech=True, denominators=range(1, 21), variant=None): +def nice_number_fa(number, speech=True, denominators=range(1, 21)): """ Farsi helper for nice_number This function formats a float to human understandable functions. Like - 4.5 becomes "4 and a half" for speech and "4 1/2" for text + 4.5 becomes "چهار و نیم" for speech and "4 1/2" for text Args: number (int or float): the float to format @@ -89,111 +53,28 @@ def nice_number_fa(number, speech=True, denominators=range(1, 21), variant=None) return str(whole) den_str = _FRACTION_STRING_FA[den] if whole == 0: - if num == 1: - return_string = 'یک {}'.format(den_str) - else: - return_string = '{} {}'.format(num, den_str) - elif num == 1: - return_string = '{} و یک {}'.format(whole, den_str) + if den == 2: + return 'نیم' + return_string = '{} {}'.format(num, den_str) else: + if den == 2: + return '{} و نیم'.format(whole) return_string = '{} و {} {}'.format(whole, num, den_str) return return_string -def _float2tuple(value, _precision): - pre = int(value) - - post = abs(value - pre) * 10**_precision - if abs(round(post) - post) < 0.01: - # We generally floor all values beyond our precision (rather than - # rounding), but in cases where we have something like 1.239999999, - # which is probably due to python's handling of floats, we actually - # want to consider it as 1.24 instead of 1.23 - post = int(round(post)) - else: - post = int(math.floor(post)) - - while post != 0: - x, y = divmod(post, 10) - if y != 0: - break - post = x - _precision -= 1 - - return pre, post, _precision - - -def _cardinal3(number): - if (number < 19): - return _FARSI_ONES[number] - if (number < 100): - x, y = divmod(number, 10) - if y == 0: - return _FARSI_TENS[x] - return _FARSI_TENS[x] + _FARSI_SEPERATOR + _FARSI_ONES[y] - x, y = divmod(number, 100) - if y == 0: - return _FARSI_HUNDREDS[x] - return _FARSI_HUNDREDS[x] + _FARSI_SEPERATOR + _cardinal3(y) - -def _cardinalPos(number): - x = number - res = '' - for b in _FARSI_BIG: - x, y = divmod(x, 1000) - if (y == 0): - continue - yx = _cardinal3(y) - if y == 1 and b == 'هزار': - yx = b - elif b != '': - yx += ' ' + b - if (res == ''): - res = yx - else: - res = yx + _FARSI_SEPERATOR + res - return res - -def _fractional(number, l): - if (number / 10**l == 0.5): - return "نیم" - x = _cardinalPos(number) - ld3, lm3 = divmod(l, 3) - ltext = (_FARSI_FRAC[lm3] + " " + _FARSI_FRAC_BIG[ld3]).strip() + 'م' - return x + " " + ltext - -def _to_ordinal(number): - r = _to_cardinal(number, 0) - if (r[-1] == 'ه' and r[-2] == 'س'): - return r[:-1] + 'وم' - return r + 'م' - -def _to_ordinal_num(value): - return str(value)+"م" - -def _to_cardinal(number, places): - if number < 0: - return "منفی " + _to_cardinal(-number, places) - if (number == 0): - return "صفر" - x, y, l = _float2tuple(number, places) - if y == 0: - return _cardinalPos(x) - if x == 0: - return _fractional(y, l) - return _cardinalPos(x) + _FARSI_SEPERATOR + _fractional(y, l) - -@_handle_number_variant -def pronounce_number_fa(number, places=2, scientific=False, - ordinals=False, variant=None): +def pronounce_number_fa(number, places=2, short_scale=True, scientific=False, + ordinals=False): """ Convert a number to it's spoken equivalent - For example, '5.2' would return 'five point two' + For example, '5.2' would return 'پنج و دو دهم' Args: num(float or int): the number to pronounce (under 100) places(int): maximum decimal places to speak + short_scale (bool) : use short (True) or long scale (False) + https://en.wikipedia.org/wiki/Names_of_large_numbers scientific (bool): pronounce in scientific notation ordinals (bool): pronounce in ordinal form "first" instead of "one" Returns: @@ -202,31 +83,142 @@ def pronounce_number_fa(number, places=2, scientific=False, num = number # deal with infinity if num == float("inf"): - return "بینهایت" + return "بی نهایت" elif num == float("-inf"): - return "منفی بینهایت" + return "منفی بی نهایت" if scientific: - if number == 0: - return "صفر" number = '%E' % num n, power = number.replace("+", "").split("E") power = int(power) if power != 0: - return '{}{} ضرب در ده به توان {}{}'.format( - 'منفی ' if float(n) < 0 else '', - pronounce_number_fa( - abs(float(n)), places, False, ordinals=False), - 'منفی ' if power < 0 else '', - pronounce_number_fa(abs(power), places, False, ordinals=False)) - if ordinals: - return _to_ordinal(number) - return _to_cardinal(number, places) + if ordinals: + # This handles negatives of powers separately from the normal + # handling since each call disables the scientific flag + return '{}{} ضرب در ده به توان {}{}'.format( + 'منفی ' if float(n) < 0 else '', + pronounce_number_fa( + abs(float(n)), places, short_scale, False, ordinals=False), + 'منفی ' if power < 0 else '', + pronounce_number_fa(abs(power), places, short_scale, False, ordinals=True)) + else: + # This handles negatives of powers separately from the normal + # handling since each call disables the scientific flag + return '{}{} ضرب در ده به توان {}{}'.format( + 'منفی ' if float(n) < 0 else '', + pronounce_number_fa( + abs(float(n)), places, short_scale, False), + 'منفی ' if power < 0 else '', + pronounce_number_fa(abs(power), places, short_scale, False)) + + number_names = _NUM_STRING_FA.copy() + number_names.update(_SCALE_FA) + + digits = [number_names[n] for n in range(0, 20)] + + tens = [number_names[n] for n in range(10, 100, 10)] -@_handle_number_variant -def nice_time_fa(dt, speech=True, use_24hour=False, use_ampm=False, variant=None): + hunds = [number_names[n] for n in range(100, 1000, 100)] + + hundreds = [_SCALE_FA[n] for n in _SCALE_FA.keys()] + hundreds = ['صد'] + hundreds + # deal with negatives + result = "" + if num < 0: + result = "منفی " + num = abs(num) + + # check for a direct match + if num in number_names and not ordinals: + if num > 1000: + result += "یک " + result += number_names[num] + else: + def _sub_thousand(n, ordinals=False): + assert 0 <= n <= 999 + if n in _ORDINAL_FA and ordinals: + return _ORDINAL_FA[n] + if n <= 19: + return digits[n] + elif n <= 99: + q, r = divmod(n, 10) + return tens[q - 1] + (" و " + _sub_thousand(r, ordinals) if r + else "") + else: + q, r = divmod(n, 100) + return hunds[q-1] + (" و " + _sub_thousand(r, ordinals) if r else "") + + def _short_scale(n): + if n >= max(_SCALE_FA.keys()): + return "بی نهایت" + ordi = ordinals + + if int(n) != n: + ordi = False + n = int(n) + assert 0 <= n + res = [] + for i, z in enumerate(_split_by(n, 1000)): + if not z: + continue + number = _sub_thousand(z, not i and ordi) + + if i: + if i >= len(hundreds): + return "" + number += " " + if ordi: + + if i * 1000 in _ORDINAL_FA: + if z == 1: + number = _ORDINAL_FA[i * 1000] + else: + number += _ORDINAL_FA[i * 1000] + else: + if n not in _SCALE_FA: + num = int("1" + "0" * (len(str(n)) - 2)) + + number += _SCALE_FA[num] + "م" + else: + number = _SCALE_FA[n] + "م" + else: + number += hundreds[i] + if number.startswith("یک هزار"): + number = number[3:] + res.append(number) + ordi = False + + return " و ".join(reversed(res)) + + def _split_by(n, split=1000): + assert 0 <= n + res = [] + while n: + n, r = divmod(n, split) + res.append(r) + return res + + result += _short_scale(num) + + # deal with scientific notation unpronounceable as number + if not result and "e" in str(num): + return pronounce_number_fa(num, places, short_scale, scientific=True) + # Deal with fractional part + elif not num == int(num) and places > 0: + if result and not result == "منفی ": + result += " و" + _num_str = str(num) + _num_str = _num_str.split(".")[1][0:places] + print(_num_str) + result += (" " if result and not result == "منفی " else "") + pronounce_number_fa(int(_num_str)) + if len(_num_str) < 9: + result += " " + _DECIMAL_STRING_FA[len(_num_str)] + return result + + +def nice_time_fa(dt, speech=True, use_24hour=False, use_ampm=False): """ Format a time to a comfortable human format - For example, generate 'five thirty' for speech or '5:30' for + For example, generate 'پنج و نیم' for speech or '5:30' for text display. Args: dt (datetime): date to format (assumes already in local timezone) @@ -242,7 +234,11 @@ def nice_time_fa(dt, speech=True, use_24hour=False, use_ampm=False, variant=None else: if use_ampm: # e.g. "3:01 AM" or "2:22 PM" - string = dt.strftime("%I:%M %p") + string = dt.strftime("%I:%M") + if dt.strftime("%p") == "AM": + string += " قبل از ظهر" + else: + string += " بعد از ظهر" else: # e.g. "3:01" or "2:22" string = dt.strftime("%I:%M") @@ -255,42 +251,36 @@ def nice_time_fa(dt, speech=True, use_24hour=False, use_ampm=False, variant=None # Generate a speakable version of the time if use_24hour: speak = "" - - # Either "0 8 hundred" or "13 hundred" - if string[0] == '0': - speak += pronounce_number_fa(int(string[1])) + if dt.hour == 0: + speak = pronounce_number_fa(dt.minute) + " دقیقه‌ی بامداد" else: - speak = pronounce_number_fa(int(string[0:2])) - if not string[3:5] == '00': - speak += " و " - if string[3] == '0': - speak += pronounce_number_fa(int(string[4])) - else: + speak = pronounce_number_fa(dt.hour) + if string[3:5] != "00": + speak += " و " speak += pronounce_number_fa(int(string[3:5])) - speak += ' دقیقه' return speak else: if dt.hour == 0 and dt.minute == 0: - return "نیمه شب" - elif dt.hour == 12 and dt.minute == 0: - return "ظهر" + return "دوازده شب" + + if dt.hour == 12 and dt.minute == 0: + return "دوازده ظهر" + hour = dt.hour % 12 or 12 # 12 hour clock and 0 is spoken as 12 - if dt.minute == 15: + if dt.minute == 0: + speak = pronounce_number_fa(hour) + elif dt.minute == 15: speak = pronounce_number_fa(hour) + " و ربع" elif dt.minute == 30: speak = pronounce_number_fa(hour) + " و نیم" elif dt.minute == 45: next_hour = (dt.hour + 1) % 12 or 12 - speak = "یه ربع به " + pronounce_number_fa(next_hour) + speak = "یک ربع به " + pronounce_number_fa(next_hour) + elif dt.minute < 30: + speak = pronounce_number_fa(hour) + " و " + pronounce_number_fa(dt.minute) + " دقیقه" else: - speak = pronounce_number_fa(hour) - - if dt.minute == 0: - if not use_ampm: - return speak - else: - speak += " و " + pronounce_number_fa(dt.minute) + ' دقیقه' + pronounce_number_fa(dt.minute) + "دقیقه به " + pronounce_number_fa(hour) if use_ampm: if dt.hour > 11: @@ -299,3 +289,67 @@ def nice_time_fa(dt, speech=True, use_24hour=False, use_ampm=False, variant=None speak += " قبل از ظهر" return speak + +def nice_duration_fa(duration, speech=True): + """ Convert duration in seconds to a nice spoken timespan + + Examples: + duration = 60 -> "1:00" or "یک دقیقه" + duration = 163 -> "2:43" or "یک دقیقه و چهل و سه ثانیه" + + Args: + duration: time, in seconds + speech (bool): format for speech (True) or display (False) + + Returns: + str: timespan as a string + """ + + if isinstance(duration, datetime.timedelta): + duration = duration.total_seconds() + + # Do traditional rounding: 2.5->3, 3.5->4, plus this + # helps in a few cases of where calculations generate + # times like 2:59:59.9 instead of 3:00. + duration += 0.5 + + days = int(duration // 86400) + hours = int(duration // 3600 % 24) + minutes = int(duration // 60 % 60) + seconds = int(duration % 60) + + if speech: + out = "" + if days > 0: + out += pronounce_number_fa(days) + " " + out += "روز" + if hours > 0: + if out: + out += " و " + out += pronounce_number_fa(hours) + " " + out += "ساعت" + if minutes > 0: + if out: + out += " و " + out += pronounce_number_fa(minutes) + " " + out += "دقیقه" + if seconds > 0: + if out: + out += " و " + out += pronounce_number_fa(seconds) + " " + out += "ثانیه" + else: + # M:SS, MM:SS, H:MM:SS, Dd H:MM:SS format + out = "" + if days > 0: + out = str(days) + " " + if hours > 0 or days > 0: + out += str(hours) + ":" + if minutes < 10 and (hours > 0 or days > 0): + out += "0" + out += str(minutes) + ":" + if seconds < 10: + out += "0" + out += str(seconds) + + return out \ No newline at end of file diff --git a/lingua_franca/lang/parse_fa.py b/lingua_franca/lang/parse_fa.py index bda9293f..caa51d4a 100644 --- a/lingua_franca/lang/parse_fa.py +++ b/lingua_franca/lang/parse_fa.py @@ -14,130 +14,354 @@ # limitations under the License. # from datetime import datetime, timedelta +import locale +from lingua_franca.parse import normalize +locale.setlocale(locale.LC_TIME, "fa_IR") from dateutil.relativedelta import relativedelta +from lingua_franca.time import now_local from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \ invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer -from lingua_franca.lang.common_data_fa import _FARSI_BIG, \ - _FARSI_ONES, _FARSI_TENS, _FARSI_HUNDREDS, _FORMAL_VARIANT +from lingua_franca.lang.common_data_fa import _FARSI_SUMS, _STRING_DECIMAL_FA, \ + _ORDINAL_FA, _NEGATIVES_FA, _DECIMAL_MARKER_FA, _STRING_NUM_FA, _STRING_SCALE_FA, \ + _STRING_ORDINAL_FA, _STRING_FRACTION_FA, _EXTRA_SPOKEN_NUM_FA import re import json from lingua_franca.internal import resolve_resource_file -def _is_number(s): - try: - float(s) - return True - except ValueError: - return False - -def _parse_sentence(text): - for key, value in _FORMAL_VARIANT.items(): - text = text.replace(key, value) - ar = text.split() - result = [] - current_number = 0 - current_words = [] - s = 0 - step = 10 - mode = 'init' - def finish_num(): - nonlocal current_number - nonlocal s - nonlocal result - nonlocal mode - nonlocal current_words - current_number += s - if current_number != 0: - result.append((current_number, current_words)) - s = 0 - current_number = 0 - current_words = [] - mode = 'init' - for x in ar: - if x == "و": - if mode == 'num_ten' or mode == 'num_hundred' or mode == 'num_one': - mode += '_va' - current_words.append(x) - elif mode == 'num': - current_words.append(x) - else: - finish_num() - result.append(x) - elif x == "نیم": - current_words.append(x) - current_number += 0.5 - finish_num() - elif x in _FARSI_ONES: - t = _FARSI_ONES.index(x) - if mode != 'init' and mode != 'num_hundred_va' and mode != 'num': - if not(t < 10 and mode == 'num_ten_va'): - finish_num() - current_words.append(x) - s += t - mode = 'num_one' - elif x in _FARSI_TENS: - if mode != 'init' and mode != 'num_hundred_va' and mode != 'num': - finish_num() - current_words.append(x) - s += _FARSI_TENS.index(x)*10 - mode = 'num_ten' - elif x in _FARSI_HUNDREDS: - if mode != 'init' and mode != 'num': - finish_num() - current_words.append(x) - s += _FARSI_HUNDREDS.index(x)*100 - mode = 'num_hundred' - elif x in _FARSI_BIG: - current_words.append(x) - d = _FARSI_BIG.index(x) - if mode == 'init' and d == 1: - s = 1 - s *= 10**(3*d) - current_number += s - s = 0 - mode = 'num' - elif _is_number(x): - current_words.append(x) - current_number = float(x) - finish_num() +def _replace_farsi_numbers(text): + for i in text: + if ord(i) in range(1776, 1786): + text = text.replace(i, chr(ord(i)-1728)) + return text + +def _convert_words_to_numbers_fa(text, short_scale=True, ordinals=False): + """ + Convert words in a string into their equivalent numbers. + Args: + text str: + short_scale boolean: True if short scale numbers should be used. + ordinals boolean: True if ordinals (e.g. یکم, دوم, سوم) should + be parsed to their number values (1, 2, 3...) + + Returns: + str + The original text, with numbers subbed in where appropriate. + + """ + tokens = tokenize(text) + numbers_to_replace = \ + _extract_numbers_with_text_fa(tokens, short_scale, ordinals) + numbers_to_replace.sort(key=lambda number: number.start_index) + + results = [] + for token in tokens: + if not numbers_to_replace or \ + token.index < numbers_to_replace[0].start_index: + results.append(token.word) else: - finish_num() - result.append(x) - if mode[:3] == 'num': - finish_num() - return result + if numbers_to_replace and \ + token.index == numbers_to_replace[0].start_index: + results.append(str(numbers_to_replace[0].value)) + if numbers_to_replace and \ + token.index == numbers_to_replace[0].end_index: + numbers_to_replace.pop(0) + + return ' '.join(results) + +def _extract_numbers_with_text_fa(tokens, short_scale=True, ordinals=False, fractional_numbers=True): + """ + Extract all numbers from a list of Tokens, with the words that + represent them. + + Args: + [Token]: The tokens to parse. + short_scale bool: True if short scale numbers should be used, False for + long scale. True by default. + ordinals bool: True if ordinal words (first, second, third, etc) should + be parsed. + fractional_numbers bool: True if we should look for fractions and + decimals. + + Returns: + [ReplaceableNumber]: A list of tuples, each containing a number and a + string. + + """ + placeholder = "" # inserted to maintain correct indices + results = [] + while True: + to_replace = \ + _extract_number_with_text_fa(tokens, short_scale, + ordinals, fractional_numbers) + + if not to_replace: + break + results.append(to_replace) -_time_units = { - 'ثانیه': timedelta(seconds=1), - 'دقیقه': timedelta(minutes=1), - 'ساعت': timedelta(hours=1), -} + tokens = [ + t if not + to_replace.start_index <= t.index <= to_replace.end_index + else + Token(placeholder, t.index) for t in tokens + ] + results.sort(key=lambda n: n.start_index) + return results -_date_units = { - 'روز': timedelta(days=1), - 'هفته': timedelta(weeks=1), -} +def _extract_number_with_text_fa(tokens, short_scale=True, ordinals=False, fractional_numbers=True): + """ + Handle numbers. + + Args: + tokens [Token]: + short_scale boolean: + ordinals boolean: + fractional_numbers boolean: + + Returns: + ReplaceableNumber + + """ + + number_words = [] + val = False + state = 0 + num_state = 0 + to_sum = [] + negative = 1 + explicit_ordinals = False + idx = 0 + while idx < len(tokens): + token = tokens[idx] + idx += 1 + current_val = None + + word = token.word.lower() + if ordinals and word in _STRING_ORDINAL_FA: + word = str(_STRING_ORDINAL_FA[word]) + explicit_ordinals = True + + prev_word = tokens[idx - 1].word.lower() if idx > 0 else "" + prevprev_word = tokens[idx - 2].word.lower() if idx > 1 else "" + next_word = tokens[idx + 1].word.lower() if idx + 1 < len(tokens) else "" + + # explicit ordinals + if is_numeric(word[:-2]) and \ + (word.endswith("ام")): + word = word[:-2] + explicit_ordinals = True + if word in _STRING_NUM_FA: + if ordinals and word not in _FARSI_SUMS: + val = False + continue + word = _STRING_NUM_FA[word] + if state == 0: + if is_numeric(word): + current_val = float(word) + if ordinals and not explicit_ordinals and current_val < 20: + val = False + continue + + if num_state == 0: + number_words.append(token) + state = 1 + val = current_val + if current_val < 20: + num_state = 1 + elif current_val < 100 and current_val % 10 == 0: + num_state = 2 + elif current_val < 1000 and current_val % 100 == 0: + num_state = 3 + elif current_val < 1000 and current_val % 10 == 0: + num_state = 2 + else: + num_state = 4 + elif num_state == 1: + break + elif num_state == 2: + if 0 < current_val < 10: + val += current_val + num_state = 0 + number_words.append(token) + state = 1 + else: + to_sum.append(val) + val = False + state = 0 + num_state = 0 + idx -= 1 + elif num_state == 3: + if current_val < 100: + number_words.append(token) + state = 1 + val += current_val + if current_val < 20: + num_state = 1 + elif current_val % 10 == 0: + num_state = 2 + else: + num_state = 1 + else: + to_sum.append(val) + state = 0 + num_state = 0 + val = False + idx -= 1 + elif num_state == 4: + to_sum.append(val) + state = 0 + num_state = 0 + val = False + idx -= 1 + elif '/' in word: + temp = word.split('/') + if len(temp) == 2: + current_val = int(temp[0]) / int(temp[1]) + val = current_val + number_words.append(token) + elif word in _EXTRA_SPOKEN_NUM_FA: + current_val = 1 / _EXTRA_SPOKEN_NUM_FA[word] + if not val: + val = 0 + val += current_val + number_words.append(token) + state = 1 + num_state = 1 + elif word in _NEGATIVES_FA: + negative = -1 + number_words.append(token) + val = False + state = 0 + num_state = 0 + elif word == 'و': + number_words.append(token) + continue + else: + if not val and not to_sum: + continue + else: + break + elif state == 1: + if word == 'و': + number_words.append(token) + current_val = False + state = 0 + if num_state == 0: + to_sum.append(val) + val = False + elif word == 'ممیز': + number_words.append(token) + current_val = False + to_sum.append(val) + temp = _extract_number_with_text_fa(tokens[idx:], short_scale, ordinals, fractional_numbers) + degree = 1 + while temp.value > 10 ** degree: + degree += 1 + to_sum.append(temp.value / 10 ** degree) + for t in temp.tokens: + number_words.append(t) + val = False + break + elif word in _STRING_SCALE_FA: + current_val = _STRING_SCALE_FA[word] + val *= current_val + to_sum.append(val) + val = False + number_words.append(token) + state = 0 + num_state = 0 + elif word == 1000: + current_val = word + val *= current_val + to_sum.append(val) + val = False + number_words.append(token) + state = 0 + num_state = 0 + elif word in _STRING_DECIMAL_FA: + current_val = 10 ** _STRING_DECIMAL_FA[word] + val /= current_val + to_sum.append(val) + val = False + number_words.append(token) + state = 0 + elif word in _STRING_FRACTION_FA: + current_val = _STRING_FRACTION_FA[word] + val /= current_val + to_sum.append(val) + val = False + number_words.append(token) + state = 0 + else: + break + + if to_sum: + val += sum(to_sum) + if val is not False: + val = negative * val + if val and val % 1 == 0: + val = int(val) + return ReplaceableNumber(val, number_words) + # return ReplaceableNumber(False, []) + +def extract_numbers_fa(text, short_scale=True, ordinals=False): + """ + Takes in a string and extracts a list of numbers. + + Args: + text (str): the string to extract a number from + short_scale (bool): Use "short scale" or "long scale" + ordinals (bool): consider ordinal numbers + Returns: + list: list of extracted numbers as floats + """ + text = _replace_farsi_numbers(text) + normalizer = FarsiNormalizer() + text = normalizer.replace_words(text) + text = normalizer.normalize_ordinals(text) + results = _extract_numbers_with_text_fa(tokenize(text), + short_scale, ordinals) + return [float(result.value) for result in results] + +def extract_number_fa(text, short_scale=True, ordinals=False): + """ + This function extracts a number from a text string + + https://en.wikipedia.org/wiki/Names_of_large_numbers + + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers + Returns: + (int) or (float) or False: The extracted number or False if no number + was found + + """ + text = _replace_farsi_numbers(text) + text = FarsiNormalizer().replace_words(text) + return _extract_number_with_text_fa(tokenize(text.lower()), + short_scale, ordinals).value def extract_duration_fa(text): """ Convert an english phrase into a number of seconds Convert things like: - "10 minute" - "2 and a half hours" - "3 days 8 hours 10 minutes and 49 seconds" + "۱۰ دقیقه" + "۲ و نیم ساعت" + "۳ روز و ۸ساعت و ۱۰ دقیقه و ۴۹ ثانیه" into an int, representing the total number of seconds. The words used in the duration will be consumed, and the remainder returned. - As an example, "set a timer for 5 minutes" would return - (300, "set a timer for"). + As an example, "برای ۵ دقیقه هشدار تنظیم کن" would return + (300, "برای هشدار تنظیم کن"). Args: text (str): string containing a duration @@ -149,44 +373,61 @@ def extract_duration_fa(text): be None if no duration is found. The text returned will have whitespace stripped from the ends. """ - remainder = [] - ar = _parse_sentence(text) - current_number = None - result = timedelta(0) - for x in ar: - if x == "و": - continue - elif type(x) == tuple: - current_number = x - elif x in _time_units: - result += _time_units[x] * current_number[0] - current_number = None - elif x in _date_units: - result += _date_units[x] * current_number[0] - current_number = None - else: - if current_number: - remainder.extend(current_number[1]) - remainder.append(x) - current_number = None - return (result, " ".join(remainder)) + if not text: + return None + time_units = { + 'microseconds': 0, + 'milliseconds': 0, + 'seconds': 0, + 'minutes': 0, + 'hours': 0, + 'days': 0, + 'weeks': 0 + } + + time_units_fa = { + 'میکرو ثانیه': 'microseconds', + 'میلی ثانیه': 'milliseconds', + 'ثانیه': 'seconds', + 'دقیقه': 'minutes', + 'ساعت': 'hours', + 'روز': 'days', + 'هفته': 'weeks' + } + + # pattern = r"(?P\d+(?:\.?\d+)?)(?:\s+|\-){unit}?" + pattern = r"(?P\d+(?:\.?\d+)?)(?:\s+|\-){unit}?(?:(?:\s|,|و)+)?(?Pنیم|0\.5)?" + text = _convert_words_to_numbers_fa(text) + print(text) + for unit_fa in time_units_fa: + unit_pattern = pattern.format(unit=unit_fa) + + def repl(match): + time_units[time_units_fa[unit_fa]] += float(match.group(1)) + return '' + text = re.sub(unit_pattern, repl, text) + + text = text.strip() + duration = timedelta(**time_units) if any(time_units.values()) else None + + return (duration, text) def extract_datetime_fa(text, anchorDate=None, default_time=None): """ Convert a human date reference into an exact datetime Convert things like - "today" - "tomorrow afternoon" - "next Tuesday at 4pm" - "August 3rd" + "امروز" + "فردا بعد از ظهر" + "سه شنبه بعد ساعت 4 بعد از ظهر" + "سوم آگوست" into a datetime. If a reference date is not provided, the current local time is used. Also consumes the words used to define the date returning the remaining string. For example, the string - "what is Tuesday's weather forecast" + "سه شنبه هوا چطوره؟" returns the date for the forthcoming Tuesday relative to the reference date and the remainder string - "what is weather forecast". + "هوا چطوره". The "next" instance of a day or weekend is considered to be no earlier than 48 hours in the future. On Friday, "next Monday" would be in 3 days. @@ -202,192 +443,644 @@ def extract_datetime_fa(text, anchorDate=None, default_time=None): text not consumed in the parsing, or None if no date or time related text was found. """ + def clean_string(s): + # normalize and lowercase utt (replaces words with numbers) + s = _convert_words_to_numbers_fa(s, ordinals=True) + # clean unneeded punctuation and capitalization among other things. + s = s.lower().replace('?', '').replace(',', '').replace('؟', '') + + wordList = s.split() + skip_next_word = False + skip_next_next_word = False + new_words = [] + for idx, word in enumerate(wordList): + if skip_next_word: + if not skip_next_next_word: + skip_next_word = False + skip_next_next_word = False + continue + wordNext = wordList[idx + 1] if idx + 1 < len(wordList) else "" + wordNextNext = wordList[idx + 2] if idx + 2 < len(wordList) else "" + + ordinals = ["ام"] + if word[0].isdigit(): + for ordinal in ordinals: + if wordNext.startswith(ordinal): + skip_next_word = True + if (word in ['بعد', 'قبل'] and wordNext == 'از' and wordNextNext == 'ظهر'): + word = word + ' ' + wordNext + ' ' + wordNextNext + skip_next_word = True + skip_next_next_word = True + if (word in ['یک', 'سه', 'دو', 'پنج'] and wordNext == 'شنبه')\ + or (word in ['نیمه', 'نصف'] and wordNext == 'شب'): + word = word + wordNext + skip_next_word = True + new_words.append(word) + return new_words + + def date_found(): + return found or \ + ( + datestr != "" or + yearOffset != 0 or monthOffset != 0 or + dayOffset is True or hrOffset != 0 or + hrAbs or minOffset != 0 or + minAbs or secOffset != 0 + ) + + if not anchorDate: + anchorDate = now_local() + if text == "": return None - text = text.lower().replace('‌', ' ').replace('.', '').replace('،', '') \ - .replace('?', '').replace("پس فردا", "پسفردا") \ - .replace('یک شنبه', 'یکشنبه') \ - .replace('دو شنبه', 'دوشنبه') \ - .replace('سه شنبه', 'سهشنبه') \ - .replace('چهار شنبه', 'چهارشنبه') \ - .replace('پنج شنبه', 'پنجشنبه') \ - .replace('بعد از ظهر', 'بعدازظهر') \ - - - if not anchorDate: - anchorDate = datetime.now() - today = anchorDate.replace(hour=0, minute=0, second=0, microsecond=0) - today_weekday = int(anchorDate.strftime("%w")) - weekday_names = [ - 'دوشنبه', - 'سهشنبه', - 'چهارشنبه', - 'پنجشنبه', - 'جمعه', - 'شنبه', - 'یکشنبه', - ] - daysDict = { - 'پریروز': today + timedelta(days= -2), - 'دیروز': today + timedelta(days= -1), - 'امروز': today, - 'فردا': today + timedelta(days= 1), - 'پسفردا': today + timedelta(days= 2), - } - timesDict = { - 'صبح': timedelta(hours=8), - 'بعدازظهر': timedelta(hours=15), - } - exactDict = { - 'الان': anchorDate, - } - nextWords = ["بعد", "دیگه"] - prevWords = ["پیش", "قبل"] - ar = _parse_sentence(text) - mode = 'none' - number_seen = None - delta_seen = timedelta(0) - remainder = [] - result = None - for x in ar: - handled = 1 - if mode == 'finished': - remainder.append(x) - elif x == 'و' and mode[:5] == 'delta': - pass - elif type(x) == tuple: - number_seen = x - elif x in weekday_names: - dayOffset = (weekday_names.index(x) + 1) - today_weekday + + found = False + daySpecified = False + dayOffset = False + monthOffset = 0 + yearOffset = 0 + today = anchorDate.strftime("%w") + currentYear = anchorDate.strftime("%Y") + datestr = "" + hasYear = False + timeQualifier = "" + + timeQualifiersAM = ['صبح', 'قبل از ظهر', 'بامداد'] + timeQualifiersPM = ['بعد از ظهر', 'عصر', 'شب', 'امشب'] + timeQualifiersList = set(timeQualifiersAM + timeQualifiersPM) + markers = ['نیم', 'در', 'عرض', 'برای', 'از', 'این'] + days = ['دوشنبه', 'سهشنبه', 'چهارشنبه', + 'پنجشنبه', 'جمعه', 'شنبه', 'یکشنبه'] + months = ['ژانویه', 'فوریه', 'مارس', 'آوریل', 'می', 'ژوئن', + 'ژوئیه', 'اوت', 'سپتامبر', 'اکتبر', 'نوامبر', + 'دسامبر'] + recur_markers = days + [d + ' ها' for d in days] + year_multiples = ["دهه", "قرن", "هزاره"] + day_multiples = ["هفته", "ماه", "سال"] + text = _convert_words_to_numbers_fa(text) + words = clean_string(text) + print(words) + for idx, word in enumerate(words): + if word == "": + continue + wordPrevPrevPrev = words[idx - 3] if idx > 2 else "" + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + + # this isn't in clean string because I don't want to save back to words + start = idx + used = 0 + # save timequalifier for later + if word == "الان" and not datestr: + resultStr = " ".join(words[idx + 1:]) + resultStr = ' '.join(resultStr.split()) + extractedDate = anchorDate.replace(microsecond=0) + return [extractedDate, resultStr] + elif wordNext in year_multiples: + multiplier = None + if is_numeric(word): + multiplier = extract_number_fa(word) + multiplier = multiplier or 1 + multiplier = int(multiplier) + used += 2 + if wordNext == "دهه": + yearOffset = multiplier * 10 + elif wordNext == "قرن": + yearOffset = multiplier * 100 + elif wordNext == "هزاره": + yearOffset = multiplier * 1000 + elif word in timeQualifiersList: + timeQualifier = word + # parse امروز, فردا, پسفردا + elif word == "امروز": + dayOffset = 0 + used += 1 + elif word == "فردا": + dayOffset = 1 + used += 1 + elif word == "پریروز": + dayOffset = -2 + used += 1 + elif word == "دیروز": + dayOffset = -1 + used += 1 + elif word == "پسفردا": + dayOffset = 2 + used = 1 + # parse 5 days, 10 weeks, last week, next week + elif word == "روز": + if wordPrev and wordPrev[0].isdigit(): + dayOffset += int(wordPrev) + start -= 1 + used = 2 + elif wordNext: + dayOffset = 1 + used = 1 + if wordNext == "بعد": + used += 1 + elif wordPrevPrev == 'بعد': + start -= 1 + used += 1 + elif wordNext in ["قبل", "پیش"]: + dayOffset *= -1 + used += 1 + elif word == "هفته": + if wordPrev and wordPrev[0].isdigit(): + dayOffset += int(wordPrev) * 7 + start -= 1 + used = 2 + elif wordNext: + dayOffset = 7 + used = 1 + if wordPrevPrev == 'از' and wordPrevPrevPrev == 'بعد': + start -= 2 + used += 2 + elif wordPrevPrev == 'بعد': + start -= 1 + used += 1 + elif wordNext == "بعد": + used += 1 + elif wordNext in ["قبل", "پیش"]: + dayOffset *= -1 + used += 1 + # parse 10 months, next month, last month + elif word == "ماه": + if wordPrev and wordPrev[0].isdigit(): + monthOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordNext: + monthOffset = 1 + used = 1 + if wordNext == "بعد": + used += 1 + elif wordPrevPrev == 'بعد': + start -= 1 + used += 1 + elif wordNext in ["قبل", "پیش"]: + monthOffset *= -1 + used += 1 + # parse 5 years, next year, last year + elif word == "سال": + if wordPrev and wordPrev[0].isdigit(): + yearOffset = int(wordPrev) + start -= 1 + used = 2 + elif wordNext: + used = 1 + yearOffset = 1 + if wordNext == "بعد": + used += 1 + elif wordPrevPrev == 'بعد': + start -= 1 + used += 1 + elif wordNext in ["قبل", "پیش"]: + yearOffset *= -1 + used += 1 + elif word == "دهه": + if wordPrev and wordPrev[0].isdigit(): + yearOffset = int(wordPrev) * 10 + start -= 1 + used = 2 + elif wordNext: + used = 1 + yearOffset = 10 + if wordNext == "بعد": + used += 1 + elif wordPrevPrev == 'بعد': + start -= 1 + used += 1 + elif wordNext in ["قبل", "پیش"]: + yearOffset *= -1 + used += 1 + # parse Monday, Tuesday, etc., and next Monday, + # last Tuesday, etc. + elif word in days: + d = days.index(word) + dayOffset = (d + 1) - int(today) + used = 1 if dayOffset < 0: dayOffset += 7 - result = today + timedelta(days=dayOffset) - mode = 'time' - elif x in exactDict: - result = exactDict[x] - mode = 'finished' - elif x in daysDict: - result = daysDict[x] - mode = 'time' - elif x in timesDict and mode == 'time': - result += timesDict[x] - mode = 'finish' - elif x in _date_units: - k = 1 - if (number_seen): - k = number_seen[0] - number_seen = None - delta_seen += _date_units[x] * k - if mode != 'delta_time': - mode = 'delta_date' - elif x in _time_units: - k = 1 - if (number_seen): - k = number_seen[0] - number_seen = None - delta_seen += _time_units[x] * k - mode = 'delta_time' - elif x in nextWords or x in prevWords: - # Give up instead of incorrect result - if mode == 'time': - return None - sign = 1 if x in nextWords else -1 - if mode == 'delta_date': - result = today + delta_seen - mode = 'time' - elif mode == 'delta_time': - result = anchorDate + delta_seen - mode = 'finished' + if wordNext == "بعد": + if dayOffset <= 2: + dayOffset += 7 + used += 1 + elif wordNext == "قبل": + dayOffset -= 7 + used += 1 + if wordPrev in markers: + words[idx - 1] = "" + # parse 15 of July, June 20th, Feb 18, 19 of February + elif word in months: + m = months.index(word) + used += 1 + datestr = months[m] + if wordPrev and wordPrev[0].isdigit(): + datestr += " " + wordPrev + start -= 1 + used += 1 + if wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + hasYear = True + else: + hasYear = False + + elif wordNext and wordNext[0].isdigit(): + datestr += " " + wordNext + used += 1 + if wordNextNext and wordNextNext[0].isdigit(): + datestr += " " + wordNextNext + used += 1 + hasYear = True + else: + hasYear = False else: - handled = 0 - else: - handled = 0 - if handled == 1: + datestr = "" + used = 0 + + if used > 0: + if start - 1 > 0 and words[start - 1] == "این": + start -= 1 + used += 1 + + for i in range(0, used): + words[i + start] = "" + + if start - 1 >= 0 and words[start - 1] in markers: + words[start - 1] = "" + found = True + daySpecified = True + + # parse time + hrOffset = 0 + minOffset = 0 + secOffset = 0 + hrAbs = None + minAbs = None + military = False + + for idx, word in enumerate(words): + if word == "": continue - if number_seen: - remainder.extend(number_seen[1]) - number_seen = None - remainder.append(x) - return (result, " ".join(remainder)) + wordPrevPrev = words[idx - 2] if idx > 1 else "" + wordPrev = words[idx - 1] if idx > 0 else "" + wordNext = words[idx + 1] if idx + 1 < len(words) else "" + wordNextNext = words[idx + 2] if idx + 2 < len(words) else "" + # parse noon, midnight, morning, afternoon, evening + used = 0 + if word == "ظهر": + hrAbs = 12 + used += 1 + elif word in ["نصفشب", "نیمهشب"]: + hrAbs = 0 + used += 1 + elif word == "صبح": + if hrAbs is None: + hrAbs = 8 + used += 1 + elif word == "بعد از ظهر": + if hrAbs is None: + hrAbs = 15 + used += 1 + elif word == "عصر": + if hrAbs is None: + hrAbs = 19 + used += 1 -def is_fractional_fa(input_str, short_scale=True): - """ - This function takes the given text and checks if it is a fraction. + # parse half an hour, quarter hour + elif word == "ساعت" and wordPrev in markers: + if wordPrev == "نیم": + minOffset = 30 + used += 1 + else: + hrOffset = 1 + used += 1 + words[idx - 1] = "" + hrAbs = -1 + minAbs = -1 + elif word[0].isdigit(): + isTime = True + strHH = "" + strMM = "" + remainder = "" + wordNextNextNext = words[idx + 3] if idx + 3 < len(words) else "" + if wordNext == "امشب" or wordNextNext == "امشب" or \ + wordPrev == "امشب" or wordPrevPrev == "امشب": + remainder = "شب" + if wordPrev == "امشب": + words[idx - 1] = "" + if wordPrevPrev == "امشب": + words[idx - 2] = "" + if wordNextNext == "امشب": + used += 1 + if wordNextNextNext == "امشب": + used += 1 - Args: - input_str (str): the string to check if fractional - short_scale (bool): use short scale if True, long scale if False - Returns: - (bool) or (float): False if not a fraction, otherwise the fraction + if ':' in word: + # parse colons + # "3:00 in the morning" + stage = 0 + length = len(word) + for i in range(length): + if stage == 0: + if word[i].isdigit(): + strHH += word[i] + elif word[i] == ":": + stage = 1 + else: + stage = 2 + i -= 1 + elif stage == 1: + if word[i].isdigit(): + strMM += word[i] + else: + stage = 2 + i -= 1 + elif stage == 2: + remainder = word[i:].replace(".", "") + break + if remainder == "": + nextWord = wordNext.replace(".", "") + if nextWord == "صبح" or nextWord == "شب": + remainder = nextWord + used += 1 - """ - if input_str.endswith('s', -1): - input_str = input_str[:len(input_str) - 1] # e.g. "fifths" + elif wordNext == "صبح": + remainder = "صبح" + used += 1 + elif wordNext == "بعد از ظهر": + remainder = "شب" + used += 1 + elif wordNext == "عصر": + remainder = "شب" + used += 1 + elif wordNext == "امروز" and wordNextNext == "صبح": + remainder = "صبح" + used = 2 + daySpecified = True + elif wordNext == "امروز" and wordNextNext == "بعد از ظهر": + remainder = "شب" + used = 2 + daySpecified = True + elif wordNext == "امروز" and wordNextNext == "عصر": + remainder = "شب" + used = 2 + daySpecified = True + else: + if timeQualifier != "": + military = True + if strHH and int(strHH) <= 12 and \ + (timeQualifier in timeQualifiersPM): + strHH += str(int(strHH) + 12) + if wordPrev == 'ساعت' or wordPrev in markers: + words[idx - 1] = '' + if wordPrevPrev in markers: + words[idx - 2] = '' + else: + length = len(word) + strNum = "" + for i in range(length): + if word[i].isdigit(): + strNum += word[i] + else: + remainder += word[i] + if remainder == "": + remainder = wordNext + if remainder in timeQualifiersPM: + strHH = strNum + remainder = "شب" + used += 1 + elif remainder in timeQualifiersAM: + strHH = strNum + remainder = "صبح" + used += 1 + if ( + remainder in recur_markers or + wordNext in recur_markers or + wordNextNext in recur_markers): + # Ex: "7 on mondays" or "3 this friday" + # Set strHH so that isTime == True + # when am or pm is not specified + strHH = strNum + used = 1 + else: + if wordNext == "ساعت" or remainder == "ساعت": + if is_numeric(word): + temp = float(word) + if temp < 1: + minOffset = int(temp * 60) + temp = 0 + else: + temp = int(temp) + else: + temp = int(strNum) + hrOffset = temp + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + print(wordPrev) + if wordPrev in markers: + words[idx - 1] = '' + if wordPrev in ['بعد', 'پس']: + words[idx - 1] = '' + if wordNextNext in ['دیگه', 'بعد']: + used += 1 + elif wordNext == "دقیقه" or remainder == "دقیقه": + # "in 10 minutes" + minOffset = int(strNum) + print(minOffset) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + if wordPrev in ['بعد', 'پس']: + words[idx - 1] = '' + if wordNextNext in ['دیگه', 'بعد']: + used += 1 + elif wordNext == "ثانیه" or remainder == "ثانیه": + # in 5 seconds + secOffset = int(strNum) + used = 2 + isTime = False + hrAbs = -1 + minAbs = -1 + if wordPrev in ['بعد', 'پس']: + words[idx - 1] = '' + if wordNextNext in ['دیگه', 'بعد']: + used += 1 + elif wordPrev == 'ساعت': + strHH = strNum + words[idx - 1] = '' + if wordNext and wordNext[0].isdigit(): + strMM = wordNext + used += 1 + if wordNextNext == 'دقیقه': + used += 1 + elif int(strNum) > 100: + strHH = str(int(strNum) // 100) + strMM = str(int(strNum) % 100) + military = True + if wordNext == "دقیقه": + used += 1 + if wordPrev == "ساعت": + words[idx - 1] = '' + elif strHH: + pass + else: + isTime = False + if wordPrev in markers: + words[idx - 1] = "" + if wordPrevPrev in markers: + words[idx - 2] = "" + HH = int(strHH) if strHH else 0 + MM = int(strMM) if strMM else 0 + HH = HH + 12 if remainder == "شب" and HH < 12 else HH + HH = HH - 12 if remainder == "صبح" and HH >= 12 else HH - fracts = {"whole": 1, "half": 2, "halve": 2, "quarter": 4} - if short_scale: - for num in _SHORT_ORDINAL_FA: - if num > 2: - fracts[_SHORT_ORDINAL_FA[num]] = num - else: - for num in _LONG_ORDINAL_FA: - if num > 2: - fracts[_LONG_ORDINAL_FA[num]] = num + if timeQualifier in timeQualifiersPM and HH < 12: + HH += 12 - if input_str.lower() in fracts: - return 1.0 / fracts[input_str.lower()] - return False + if HH > 24 or MM > 59: + isTime = False + used = 0 + if isTime: + hrAbs = HH + minAbs = MM + used += 1 + if used > 0: + # removed parsed words from the sentence + for i in range(used): + if idx + i >= len(words): + break + words[idx + i] = "" -def extract_numbers_fa(text, short_scale=True, ordinals=False): - """ - Takes in a string and extracts a list of numbers. + idx += used - 1 + found = True + # check that we found a date + if not date_found(): + return None - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 - Returns: - list: list of extracted numbers as floats - """ + if dayOffset is False: + dayOffset = 0 - ar = _parse_sentence(text) - result = [] - for x in ar: - if type(x) == tuple: - result.append(x[0]) - return result + # perform date manipulation + extractedDate = anchorDate.replace(microsecond=0) -def extract_number_fa(text, ordinals=False): - """ - This function extracts a number from a text string, - handles pronunciations in long scale and short scale + if datestr != "": + # date included an explicit date, e.g. "june 5" or "june 2, 2017" + try: + temp = datetime.strptime(datestr, "%B %d") + except ValueError: + # Try again, allowing the year + temp = datetime.strptime(datestr, "%B %d %Y") + extractedDate = extractedDate.replace(hour=0, minute=0, second=0) + if not hasYear: + temp = temp.replace(year=extractedDate.year, + tzinfo=extractedDate.tzinfo) + if extractedDate < temp: + extractedDate = extractedDate.replace( + year=int(currentYear), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(currentYear) + 1, + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + extractedDate = extractedDate.replace( + year=int(temp.strftime("%Y")), + month=int(temp.strftime("%m")), + day=int(temp.strftime("%d")), + tzinfo=extractedDate.tzinfo) + else: + # ignore the current HH:MM:SS if relative using days or greater + if hrOffset == 0 and minOffset == 0 and secOffset == 0: + extractedDate = extractedDate.replace(hour=0, minute=0, second=0) - https://en.wikipedia.org/wiki/Names_of_large_numbers + if yearOffset != 0: + extractedDate = extractedDate + relativedelta(years=yearOffset) + if monthOffset != 0: + extractedDate = extractedDate + relativedelta(months=monthOffset) + if dayOffset != 0: + extractedDate = extractedDate + relativedelta(days=dayOffset) + if hrAbs != -1 and minAbs != -1: + # If no time was supplied in the string set the time to default + # time if it's available + if hrAbs is None and minAbs is None and default_time is not None: + hrAbs, minAbs = default_time.hour, default_time.minute + else: + hrAbs = hrAbs or 0 + minAbs = minAbs or 0 + + extractedDate = extractedDate + relativedelta(hours=hrAbs, + minutes=minAbs) + if (hrAbs != 0 or minAbs != 0) and datestr == "": + if not daySpecified and anchorDate > extractedDate: + extractedDate = extractedDate + relativedelta(days=1) + if hrOffset != 0: + extractedDate = extractedDate + relativedelta(hours=hrOffset) + if minOffset != 0: + extractedDate = extractedDate + relativedelta(minutes=minOffset) + if secOffset != 0: + extractedDate = extractedDate + relativedelta(seconds=secOffset) + for idx, word in enumerate(words): + if words[idx] == "and" and \ + words[idx - 1] == "" and words[idx + 1] == "": + words[idx] = "" + + resultStr = " ".join(words) + resultStr = ' '.join(resultStr.split()) + return [extractedDate, resultStr] + +def is_fractional_fa(input_str, short_scale=True, spoken=True): + """ + This function takes the given text and checks if it is a fraction. Args: - text (str): the string to normalize + input_str (str): the string to check if fractional short_scale (bool): use short scale if True, long scale if False - ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + spoken (bool): consider "half", "quarter", "whole" a fraction Returns: - (int) or (float) or False: The extracted number or False if no number - was found + (bool) or (float): False if not a fraction, otherwise the fraction """ - x = extract_numbers_fa(text, ordinals=ordinals) - if (len(x) == 0): - return False - return x[0] -class EnglishNormalizer(Normalizer): - with open(resolve_resource_file("text/en-us/normalize.json")) as f: + fracts = {"whole": 1, "half": 2, "halve": 2, "quarter": 4} + for num in _ORDINAL_FA: + if num > 2: + fracts[_ORDINAL_FA[num]] = num + + if input_str.lower() in fracts and spoken: + return 1.0 / fracts[input_str.lower()] + return False + +class FarsiNormalizer(Normalizer): + with open(resolve_resource_file("text/fa-ir/normalize.json")) as f: _default_config = json.load(f) + def numbers_to_digits(self, utterance): + return _convert_words_to_numbers_fa(utterance, ordinals=None) + + def normalize_ordinals(self, text): + words = self.tokenize(text) + for idx, w in enumerate(words): + for ordinal in _STRING_ORDINAL_FA: + if w.startswith(ordinal): + words[idx] = ordinal + utterance = " ".join(words) + return utterance def normalize_fa(text, remove_articles=True): - """ English string normalization """ - return EnglishNormalizer().normalize(text, remove_articles) + """ Farsi string normalization """ + normalizer = FarsiNormalizer() + text = normalizer.normalize_ordinals(text) + return normalizer.normalize(text, remove_articles) diff --git a/lingua_franca/res/text/fa-ir/date_time.json b/lingua_franca/res/text/fa-ir/date_time.json index 1a43989f..a2f86bb4 100644 --- a/lingua_franca/res/text/fa-ir/date_time.json +++ b/lingua_franca/res/text/fa-ir/date_time.json @@ -1,84 +1,32 @@ { "decade_format": { - "1": { - "match": "^\\d$", - "format": "{x}" - }, - "2": { - "match": "^1\\d$", - "format": "{xx}" - }, - "3": { - "match": "^\\d0$", - "format": "{x0}" - }, - "4": { - "match": "^[2-9]\\d$", - "format": "{x0} {x}" - }, + "1": {"match": "^\\d$", "format": "{x}"}, + "2": {"match": "^1\\d$","format": "{xx}"}, + "3": {"match": "^\\d0$","format": "{x0}"}, + "4": {"match": "^[2-9]\\d$","format": "{x0} {x}"}, "default": "{number}" }, "hundreds_format": { - "1": { - "match": "^\\d{3}$", - "format": "{x_in_x00} hundred" - }, + "1": {"match": "^\\d{3}$", "format": "{x_in_x00} hundred"}, "default": "{number}" }, "thousand_format": { - "1": { - "match": "^\\d00\\d$", - "format": "{x_in_x000} thousand" - }, - "2": { - "match": "^1\\d00$", - "format": "{xx_in_xx00} hundred" - }, - "3": { - "match": "^\\d{2}00$", - "format": "{x0_in_x000} {x_in_x00} hundred" - }, - "4": { - "match": "^(1\\d{3})|(\\d0\\d{2})$", - "format": "{xx_in_xx00}" - }, - "5": { - "match": "^\\d{4}$", - "format": "{x0_in_x000} {x_in_x00}" - }, + "1": {"match": "^10\\d{2}$", "format": "هزار"}, + "2": {"match": "^[2-9]0\\d{2}$", "format": "{x_in_x000} هزار"}, + "3": {"match": "^11\\d{2}$", "format": "هزار و صد"}, + "4": {"match": "^1\\d{3}$", "format": "هزار و {x_in_x00}"}, + "5": {"match": "^[2-9]1\\d{2}", "format": "{x_in_x000} هزار و صد"}, + "6": {"match": "^[2-9]\\d{3}$", "format": "{x_in_x000} هزار و {x_in_x00}"}, "default": "{number}" }, "year_format": { - "1": { - "match": "^\\d\\d?$", - "format": "{formatted_decade} {bc}" - }, - "2": { - "match": "^\\d00$", - "format": "{formatted_hundreds} {bc}" - }, - "3": { - "match": "^\\d{3}$", - "format": "{formatted_hundreds} {formatted_decade} {bc}" - }, - "4": { - "match": "^\\d{2}00$", - "format": "{formatted_thousand} {bc}" - }, - "5": { - "match": "^\\d00\\d$", - "format": "{formatted_thousand} {formatted_decade} {bc}" - }, - "6": { - "match": "^\\d{2}0\\d$", - "format": "{formatted_thousand} {formatted_decade} {bc}" - }, - "7": { - "match": "^\\d{4}$", - "format": "{formatted_thousand} {formatted_decade} {bc}" - }, + "1": {"match": "^\\d{1,2}$", "format": "{formatted_decade} {bc}"}, + "2": {"match": "^\\d00$", "format": "{formatted_hundreds} {bc}"}, + "3": {"match": "^\\d{3}$", "format": "{formatted_hundreds} {formatted_decade} {bc}"}, + "4": {"match": "^\\d{2}00$", "format": "{formatted_thousand} {bc}"}, + "5": {"match": "^\\d{4}$", "format": "{formatted_thousand} و {formatted_decade} {bc}"}, "default": "{year} {bc}", - "bc": "بعد از میلاد" + "bc": "قبل از میلاد" }, "date_format": { "date_full": "{weekday}, {day} {month} {formatted_year}", @@ -115,10 +63,10 @@ "12": "دوازدهم", "13": "سیزدهم", "14": "چهاردهم", - "15": "پونزدهم", - "16": "شونزدهم", - "17": "هیفدهم", - "18": "هیجدهم", + "15": "پانزدهم", + "16": "شانزدهم", + "17": "هفدهم", + "18": "هجدهم", "19": "نوزدهم", "20": "بیستم", "21": "بیست و یکم", @@ -130,7 +78,7 @@ "27": "بیست و هفتم", "28": "بیست و هشتم", "29": "بیست و نهم", - "30": "سیم", + "30": "سی ام", "31": "سی و یکم" }, "month": { @@ -139,8 +87,8 @@ "3": "مارس", "4": "آوریل", "5": "مه", - "6": "جون", - "7": "جولای", + "6": "ژوئن", + "7": "ژوئیه", "8": "آگوست", "9": "سپتامبر", "10": "اکتبر", @@ -163,10 +111,10 @@ "12": "دوازده", "13": "سیزده", "14": "چهارده", - "15": "پونزده", - "16": "شونزده", - "17": "هیفده", - "18": "هیجده", + "15": "پانزده", + "16": "شانزده", + "17": "هفده", + "18": "هجده", "19": "نوزده", "20": "بیست", "30": "سی", diff --git a/lingua_franca/res/text/fa-ir/date_time_test.json b/lingua_franca/res/text/fa-ir/date_time_test.json index 72321e35..09f79711 100644 --- a/lingua_franca/res/text/fa-ir/date_time_test.json +++ b/lingua_franca/res/text/fa-ir/date_time_test.json @@ -1,36 +1,36 @@ { "test_nice_year": { - "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "یک بعد از میلاد" }, - "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ده بعد از میلاد" }, - "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ده دوازده" }, - "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "ده چهل و شش" }, - "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "هیجده صفر هفت" }, - "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "هیفده هیفده" }, - "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "نوزده هشتاد و هشت"}, + "1": {"datetime_param": "1, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "یک قبل از میلاد" }, + "2": {"datetime_param": "10, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "ده قبل از میلاد" }, + "8": {"datetime_param": "1012, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "هزار و دوازده" }, + "9": {"datetime_param": "1046, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "هزار و چهل و شش" }, + "10": {"datetime_param": "1807, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "هزار و هشصد و هفت" }, + "11": {"datetime_param": "1717, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "هزار و هفصد و هفده" }, + "12": {"datetime_param": "1988, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "هزار و نهصد و هشتاد و هشت"}, "13": {"datetime_param": "2009, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "دو هزار و نه"}, - "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "بیست هیجده"}, - "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "بیست بیست و یک"}, - "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "بیست سی"}, + "14": {"datetime_param": "2018, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "دو هزار و هجده"}, + "15": {"datetime_param": "2021, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "دو هزار و بیست و یک"}, + "16": {"datetime_param": "2030, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "دو هزار و سی"}, "17": {"datetime_param": "2100, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "دو هزار و صد" }, "18": {"datetime_param": "1000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "هزار" }, "19": {"datetime_param": "2000, 1, 31, 13, 22, 3", "bc": "None", "assertEqual": "دو هزار" }, - "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "سی و یک بیست بعد از میلاد" }, - "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "سی و دو چهل و یک بعد از میلاد" }, - "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "پنجاه و دو هزار" } + "20": {"datetime_param": "3120, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "سه هزار و صد و بیست قبل از میلاد" }, + "21": {"datetime_param": "3241, 1, 31, 13, 22, 3", "bc": "True", "assertEqual": "سه هزار و دویست و چهل و یک قبل از میلاد" }, + "22": {"datetime_param": "5200, 1, 31, 13, 22, 3", "bc": "False", "assertEqual": "پنج هزار و دویست" } }, "test_nice_date": { - "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "سه شنبه, سی و یکم ژانویه بیست هیفده"}, - "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "یکشنبه, چهارم فوریه بیست هیجده"}, + "1": {"datetime_param": "2017, 1, 31, 0, 2, 3", "now": "None", "assertEqual": "سه شنبه, سی و یکم ژانویه دو هزار و هفده"}, + "2": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2017, 1, 1, 0, 2, 3", "assertEqual": "یکشنبه, چهارم فوریه دو هزار و هجده"}, "3": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 1, 1, 0, 2, 3", "assertEqual": "یکشنبه, چهارم فوریه"}, "4": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 1, 0, 2, 3", "assertEqual": "یکشنبه, چهارم"}, "5": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 3, 0, 2, 3", "assertEqual": "فردا"}, "6": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 4, 0, 2, 3", "assertEqual": "امروز"}, "7": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 5, 0, 2, 3", "assertEqual": "دیروز"}, "8": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2018, 2, 6, 0, 2, 3", "assertEqual": "یکشنبه, چهارم فوریه"}, - "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "یکشنبه, چهارم فوریه بیست هیجده"} + "9": {"datetime_param": "2018, 2, 4, 0, 2, 3", "now": "2019, 2, 6, 0, 2, 3", "assertEqual": "یکشنبه, چهارم فوریه دو هزار و هجده"} }, "test_nice_date_time": { - "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "سه شنبه, سی و یکم ژانویه بیست هیفده ساعت یک و بیست و دو دقیقه بعد از ظهر"}, - "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "سه شنبه, سی و یکم ژانویه بیست هیفده ساعت سیزده و بیست و دو دقیقه"} + "1": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "False", "use_ampm": "True", "assertEqual": "سه شنبه, سی و یکم ژانویه دو هزار و هفده ساعت یک و بیست و دو دقیقه بعد از ظهر"}, + "2": {"datetime_param": "2017, 1, 31, 13, 22, 3", "now": "None", "use_24hour": "True", "use_ampm": "False", "assertEqual": "سه شنبه, سی و یکم ژانویه دو هزار و هفده ساعت سیزده و بیست و دو"} } } diff --git a/lingua_franca/res/text/fa-ir/normalize.json b/lingua_franca/res/text/fa-ir/normalize.json index 4126c02e..bb7ab03e 100644 --- a/lingua_franca/res/text/fa-ir/normalize.json +++ b/lingua_franca/res/text/fa-ir/normalize.json @@ -6,136 +6,44 @@ "remove_accents": false, "remove_articles": false, "remove_stopwords": false, - "contractions": { - "I'd": "I would", - "I'll": "I will", - "I'm": "I am", - "I've": "I have", - "ain't": "is not", - "aren't": "are not", - "can't": "can not", - "could've": "could have", - "couldn't": "could not", - "didn't": "did not", - "doesn't": "does not", - "don't": "do not", - "gonna": "going to", - "gotta": "got to", - "hadn't": "had not", - "hasn't": "has not", - "haven't": "have not", - "he'd": "he would", - "he'll": "he will", - "he's": "he is", - "how'd": "how did", - "how'll": "how will", - "how's": "how is", - "isn't": "is not", - "it'd": "it would", - "it'll": "it will", - "it's": "it is", - "might've": "might have", - "mightn't": "might not", - "must've": "must have", - "mustn't": "must not", - "needn't": "need not", - "oughtn't": "ought not", - "shan't": "shall not", - "she'd": "she would", - "she'll": "she will", - "she's": "she is", - "should've": "should have", - "shouldn't": "should not", - "somebody's": "somebody is", - "someone'd": "someone would", - "someone'll": "someone will", - "someone's": "someone is", - "that'd": "that would", - "that'll": "that will", - "that's": "that is", - "there'd": "there would", - "there're": "there are", - "there's": "there is", - "they'd": "they would", - "they'll": "they will", - "they're": "they are", - "they've": "they have", - "wasn't": "was not", - "we'd": "we would", - "we'll": "we will", - "we're": "we are", - "we've": "we have", - "weren't": "were not", - "what'd": "what did", - "what'll": "what will", - "what're": "what are", - "what's": "what is", - "what've": "what have", - "whats": "what is", - "when'd": "when did", - "when's": "when is", - "where'd": "where did", - "where's": "where is", - "where've": "where have", - "who'd": "who would", - "who'd've": "who would have", - "who'll": "who will", - "who're": "who are", - "who's": "who is", - "who've": "who have", - "why'd": "why did", - "why're": "why are", - "why's": "why is", - "won't": "will not", - "won't've": "will not have", - "would've": "would have", - "wouldn't": "would not", - "wouldn't've": "would not have", - "y'ain't": "you are not", - "y'aint": "you are not", - "y'all": "you all", - "ya'll": "you all", - "you'd": "you would", - "you'd've": "you would have", - "you'll": "you will", - "you're": "you are", - "you've": "you have" + "contractions": {}, + "word_replacements": { + "اول":"یکم", + "جولای": "ژوئیه", + "اولی":"اول", + "اولین":"اول", + "اولیه":"اول" }, - "word_replacements": {}, "number_replacements": { - "zero": "0", - "one": "1", - "two": "2", - "three": "3", - "four": "4", - "five": "5", - "six": "6", - "seven": "7", - "eight": "8", - "nine": "9", - "ten": "10", - "eleven": "11", - "twelve": "12", - "thirteen": "13", - "fourteen": "14", - "fifteen": "15", - "sixteen": "16", - "seventeen": "17", - "eighteen": "18", - "nineteen": "19", - "twenty": "20", - "thirty": "30", - "forty": "40", - "fifty": "50", - "sixty": "60", - "seventy": "70", - "eighty": "80", - "ninety": "90" + "صفر": "0", + "یک": "1", + "دو": "2", + "سه": "3", + "چهار": "4", + "پنج": "5", + "شش": "6", + "هفت": "7", + "هشت": "8", + "نه": "9", + "ده": "10", + "یازده": "11", + "دوازده": "12", + "سیزده": "13", + "چهارده": "14", + "پانزده": "15", + "شانزده": "16", + "هفده": "17", + "هجده": "18", + "نوزده": "19", + "بیست": "20", + "سی": "30", + "چهل": "40", + "پنجاه": "50", + "شصت": "60", + "هفتاد": "70", + "هشتاد": "80", + "نود": "90" }, "stopwords": [], - "articles": [ - "the", - "a", - "an" - ] + "articles": [] } \ No newline at end of file diff --git a/test/test_format_fa.py b/test/test_format_fa.py index 8bc01119..e552cb97 100644 --- a/test/test_format_fa.py +++ b/test/test_format_fa.py @@ -35,7 +35,7 @@ from lingua_franca.format import pronounce_number from lingua_franca.format import date_time_format from lingua_franca.format import join_list - +from lingua_franca.time import default_timezone def setUpModule(): load_languages(get_supported_langs()) @@ -43,21 +43,19 @@ def setUpModule(): # don't have to do this confusing thing in the "master" test_format.py set_default_lang('fa-ir') - def tearDownModule(): unload_languages(get_active_langs()) - -NUMBERS_FIXTURE_EN = { +NUMBERS_FIXTURE_FA = { 1.435634: '1.436', 2: '2', 5.0: '5', 0.027: '0.027', - 0.5: 'یک دوم', - 1.333: '1 و یک سوم', + 0.5: 'نیم', + 1.333: '1 و 1 سوم', 2.666: '2 و 2 سوم', - 0.25: 'یک چهارم', - 1.25: '1 و یک چهارم', + 0.25: '1 چهارم', + 1.25: '1 و 1 چهارم', 0.75: '3 چهارم', 1.75: '1 و 3 چهارم', 3.4: '3 و 2 پنجم', @@ -65,17 +63,17 @@ def tearDownModule(): 12.5714: '12 و 4 هفتم', 9.625: '9 و 5 هشتم', 6.777: '6 و 7 نهم', - 3.1: '3 و یک دهم', + 3.1: '3 و 1 دهم', 2.272: '2 و 3 یازدهم', 5.583: '5 و 7 دوازدهم', 8.384: '8 و 5 سیزدهم', - 0.071: 'یک چهاردهم', - 6.466: '6 و 7 پونزدهم', + 0.071: '1 چهاردهم', + 6.466: '6 و 7 پانزدهم', 8.312: '8 و 5 شونزدهم', - 2.176: '2 و 3 هیفدهم', - 200.722: '200 و 13 هیجدهم', + 2.176: '2 و 3 هفدهم', + 200.722: '200 و 13 هجدهم', 7.421: '7 و 8 نوزدهم', - 0.05: 'یک بیستم' + 0.05: '1 بیستم' } @@ -87,14 +85,14 @@ def set_tmp_var(self, val): self.tmp_var = val def test_convert_float_to_nice_number(self): - for number, number_str in NUMBERS_FIXTURE_EN.items(): + for number, number_str in NUMBERS_FIXTURE_FA.items(): self.assertEqual(nice_number(number), number_str, 'should format {} as {} and not {}'.format( number, number_str, nice_number(number))) def test_specify_denominator(self): self.assertEqual(nice_number(5.5, denominators=[1, 2, 3]), - '5 و یک دوم', + '5 و نیم', 'should format 5.5 as 5 and a half not {}'.format( nice_number(5.5, denominators=[1, 2, 3]))) self.assertEqual(nice_number(2.333, denominators=[1, 2]), @@ -112,13 +110,29 @@ def test_no_speech(self): 'should format 6.0 as 6 not {}'.format( nice_number(6.0, speech=False))) + def test_unknown_language(self): + """ An unknown / unhandled language should return the string + representation of the input number. + """ + def bypass_warning(): + self.assertEqual( + nice_number(5.5, lang='as-df'), '5.5', + 'should format 5.5 ' + 'as 5.5 not {}'.format( + nice_number(5.5, lang='as-df'))) + + # Should throw a warning. Would raise the same text as a + # NotImplementedError, but nice_number() bypasses and returns + # its input as a string + self.assertWarns(UserWarning, bypass_warning) + class TestPronounceNumber(unittest.TestCase): def test_convert_int(self): self.assertEqual(pronounce_number(0), "صفر") self.assertEqual(pronounce_number(1), "یک") self.assertEqual(pronounce_number(10), "ده") - self.assertEqual(pronounce_number(15), "پونزده") + self.assertEqual(pronounce_number(15), "پانزده") self.assertEqual(pronounce_number(20), "بیست") self.assertEqual(pronounce_number(27), "بیست و هفت") self.assertEqual(pronounce_number(30), "سی") @@ -127,10 +141,12 @@ def test_convert_int(self): def test_convert_negative_int(self): self.assertEqual(pronounce_number(-1), "منفی یک") self.assertEqual(pronounce_number(-10), "منفی ده") - self.assertEqual(pronounce_number(-15), "منفی پونزده") + self.assertEqual(pronounce_number(-15), "منفی پانزده") self.assertEqual(pronounce_number(-20), "منفی بیست") self.assertEqual(pronounce_number(-27), "منفی بیست و هفت") - + self.assertEqual(pronounce_number(-30), "منفی سی") + self.assertEqual(pronounce_number(-33), "منفی سی و سه") + def test_convert_decimals(self): self.assertEqual(pronounce_number(0.05), "پنج صدم") self.assertEqual(pronounce_number(-0.05), "منفی پنج صدم") @@ -154,6 +170,14 @@ def test_convert_decimals(self): "منفی بیست و یک و بیست و سه صدم") self.assertEqual(pronounce_number(-21.234, places=1), "منفی بیست و یک و دو دهم") + self.assertEqual(pronounce_number(-21.234, places=0), + "منفی بیست و یک") + self.assertEqual(pronounce_number(-21.234, places=3), + "منفی بیست و یک و دویست و سی و چهار هزارم") + self.assertEqual(pronounce_number(-21.234, places=4), + "منفی بیست و یک و دویست و سی و چهار هزارم") + self.assertEqual(pronounce_number(-21.234, places=5), + "منفی بیست و یک و دویست و سی و چهار هزارم") def test_convert_hundreds(self): self.assertEqual(pronounce_number(100), "صد") @@ -174,15 +198,55 @@ def test_convert_scientific_notation(self): "دو و نود و نه صدم ضرب در ده به توان هشت") self.assertEqual(pronounce_number(299792448, places=6, scientific=True), - "دو و نهصد و نود و هفت هزار و نهصد و بیست و چهار میلیونیم ضرب در ده به توان هشت") + "دو و نهصد و نود و هفت هزار و نهصد و بیست و چهار میلیونم ضرب در ده به توان هشت") self.assertEqual(pronounce_number(1.672e-27, places=3, scientific=True), "یک و ششصد و هفتاد و دو هزارم ضرب در ده به توان منفی بیست و هفت") + def test_auto_scientific_notation(self): + self.assertEqual( + pronounce_number(1.1e-150), "یک و یک دهم ضرب در ده به توان منفی صد و پنجاه") + + def test_large_numbers(self): + self.assertEqual( + pronounce_number(299792458), + "دویست و نود و نه میلیون و هفصد و نود و دو هزار و " + "چهارصد و پنجاه و هشت") + self.assertEqual( + pronounce_number(100034000000299792458), + "صد کوئینتیلیون و سی و چهار کوادریلیون و دویست و نود و نه میلیون و " + "هفصد و نود و دو هزار و چهارصد و پنجاه و هشت") + self.assertEqual( + pronounce_number(10000000000), + "ده میلیارد") + self.assertEqual( + pronounce_number(1000000000000), + "یک تریلیون") + # TODO maybe beautify this + self.assertEqual( + pronounce_number(1000001), + "یک میلیون و یک") + self.assertEqual(pronounce_number(95505896639631893), + "نود و پنج کوادریلیون و پانصد و پنج تریلیون و " + "هشصد و نود و شش میلیارد و ششصد و سی و نه میلیون و " + "ششصد و سی و یک هزار و هشصد و نود و سه") + self.assertEqual(pronounce_number(10e32, places=1), + "یک دسیلیون") + + # infinity + self.assertEqual( + pronounce_number(sys.float_info.max * 2), "بی نهایت") + self.assertEqual( + pronounce_number(float("inf")), + "بی نهایت") + self.assertEqual( + pronounce_number(float("-inf")), + "منفی بی نهایت") + def test_ordinals(self): self.assertEqual(pronounce_number(1, ordinals=True), "یکم") self.assertEqual(pronounce_number(10, ordinals=True), "دهم") - self.assertEqual(pronounce_number(15, ordinals=True), "پونزدهم") + self.assertEqual(pronounce_number(15, ordinals=True), "پانزدهم") self.assertEqual(pronounce_number(20, ordinals=True), "بیستم") self.assertEqual(pronounce_number(27, ordinals=True), "بیست و هفتم") self.assertEqual(pronounce_number(30, ordinals=True), "سیم") @@ -192,30 +256,14 @@ def test_ordinals(self): self.assertEqual(pronounce_number(10000, ordinals=True), "ده هزارم") self.assertEqual(pronounce_number(18691, ordinals=True), - "هیجده هزار و ششصد و نود و یکم") + "هجده هزار و ششصد و نود و یکم") self.assertEqual(pronounce_number(1567, ordinals=True), "هزار و پانصد و شصت و هفتم") self.assertEqual(pronounce_number(18e6, ordinals=True), - "هیجده میلیونم") + "هجده میلیونم") self.assertEqual(pronounce_number(18e9, ordinals=True), - "هیجده میلیاردم") - def test_variant(self): - self.assertEqual(pronounce_number(18691, ordinals=True, variant="formal"), - "هجده هزار و ششصد و نود و یکم") - self.assertEqual(pronounce_number(15, variant='conversational'), "پونزده") - self.assertEqual(pronounce_number(15, variant='formal'), "پانزده") - self.assertEqual(nice_number(2.176, variant='formal'), "2 و 3 هفدهم") - dt = datetime.datetime(2017, 1, 31, - 16, 22, 3) - self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True, variant='formal'), - "شانزده و بیست و دو دقیقه") - self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True, variant='conversational'), - "شونزده و بیست و دو دقیقه") - - + "هجده میلیاردم") -# def nice_time(dt, lang="en-us", speech=True, use_24hour=False, -# use_ampm=False): class TestNiceDateFormat(unittest.TestCase): @classmethod @@ -230,10 +278,9 @@ def setUpClass(cls): with (sub_dir / 'date_time_test.json').open() as f: cls.test_config[sub_dir.parts[-1]] = json.loads(f.read()) - def test_convert_times(self): dt = datetime.datetime(2017, 1, 31, - 13, 22, 3) + 13, 22, 3, tzinfo=default_timezone()) # Verify defaults haven't changed self.assertEqual(nice_time(dt), @@ -246,19 +293,19 @@ def test_convert_times(self): self.assertEqual(nice_time(dt, speech=False), "1:22") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), - "1:22 PM") + "1:22 بعد از ظهر") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:22") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "13:22") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), - "سیزده و بیست و دو دقیقه") + "سیزده و بیست و دو") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), - "سیزده و بیست و دو دقیقه") + "سیزده و بیست و دو") dt = datetime.datetime(2017, 1, 31, - 13, 0, 3) + 13, 0, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "یک") self.assertEqual(nice_time(dt, use_ampm=True), @@ -266,7 +313,7 @@ def test_convert_times(self): self.assertEqual(nice_time(dt, speech=False), "1:00") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), - "1:00 PM") + "1:00 بعد از ظهر") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:00") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, @@ -278,7 +325,7 @@ def test_convert_times(self): "سیزده") dt = datetime.datetime(2017, 1, 31, - 13, 2, 3) + 13, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "یک و دو دقیقه") self.assertEqual(nice_time(dt, use_ampm=True), @@ -286,19 +333,19 @@ def test_convert_times(self): self.assertEqual(nice_time(dt, speech=False), "1:02") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), - "1:02 PM") + "1:02 بعد از ظهر") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "13:02") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "13:02") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), - "سیزده و دو دقیقه") + "سیزده و دو") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), - "سیزده و دو دقیقه") + "سیزده و دو") dt = datetime.datetime(2017, 1, 31, - 0, 2, 3) + 0, 2, 3, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "دوازده و دو دقیقه") self.assertEqual(nice_time(dt, use_ampm=True), @@ -306,19 +353,19 @@ def test_convert_times(self): self.assertEqual(nice_time(dt, speech=False), "12:02") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), - "12:02 AM") + "12:02 قبل از ظهر") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "00:02") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "00:02") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), - "صفر و دو دقیقه") + "دو دقیقه‌ی بامداد") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), - "صفر و دو دقیقه") + "دو دقیقه‌ی بامداد") dt = datetime.datetime(2018, 2, 8, - 1, 2, 33) + 1, 2, 33, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "یک و دو دقیقه") self.assertEqual(nice_time(dt, use_ampm=True), @@ -326,53 +373,125 @@ def test_convert_times(self): self.assertEqual(nice_time(dt, speech=False), "1:02") self.assertEqual(nice_time(dt, speech=False, use_ampm=True), - "1:02 AM") + "1:02 قبل از ظهر") self.assertEqual(nice_time(dt, speech=False, use_24hour=True), "01:02") self.assertEqual(nice_time(dt, speech=False, use_24hour=True, use_ampm=True), "01:02") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=True), - "یک و دو دقیقه") + "یک و دو") self.assertEqual(nice_time(dt, use_24hour=True, use_ampm=False), - "یک و دو دقیقه") + "یک و دو") dt = datetime.datetime(2017, 1, 31, - 12, 15, 9) + 12, 15, 9, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), "دوازده و ربع") self.assertEqual(nice_time(dt, use_ampm=True), "دوازده و ربع بعد از ظهر") dt = datetime.datetime(2017, 1, 31, - 5, 30, 00) + 5, 30, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt, use_ampm=True), "پنج و نیم قبل از ظهر") dt = datetime.datetime(2017, 1, 31, - 1, 45, 00) + 1, 45, 00, tzinfo=default_timezone()) self.assertEqual(nice_time(dt), - "یه ربع به دو") - - # TODO: failed because of و - #def test_nice_duration(self): - # self.assertEqual(nice_duration(1), "یک ثانیه") - # self.assertEqual(nice_duration(3), "سه ثانیه") - # self.assertEqual(nice_duration(1, speech=False), "0:01") - # self.assertEqual(nice_duration(61), "یک دقیقه و یک ثانیه") - # self.assertEqual(nice_duration(61, speech=False), "1:01") - # self.assertEqual(nice_duration(5000), - # "یک ساعت و بیست و سه دقیقه و بیست ثانیه") - # self.assertEqual(nice_duration(5000, speech=False), "1:23:20") - # self.assertEqual(nice_duration(50000), - # "سیزده ساعت و پنجاه و سه دقیقه و بیست ثانیه") - # self.assertEqual(nice_duration(50000, speech=False), "13:53:20") - # self.assertEqual(nice_duration(500000), - # "پنج روز و هیجده ساعت و پنجاه و سه دقیقه و بیست ثانیه") # nopep8 - # self.assertEqual(nice_duration(500000, speech=False), "5d 18:53:20") - # self.assertEqual(nice_duration(datetime.timedelta(seconds=500000), - # speech=False), - # "5d 18:53:20") + "یک ربع به دو") + + def test_nice_date(self): + lang = "fa-ir" + i = 1 + while (self.test_config[lang].get('test_nice_date') and + self.test_config[lang]['test_nice_date'].get(str(i))): + p = self.test_config[lang]['test_nice_date'][str(i)] + dp = ast.literal_eval(p['datetime_param']) + np = ast.literal_eval(p['now']) + dt = datetime.datetime( + dp[0], dp[1], dp[2], dp[3], dp[4], dp[5]) + now = None if not np else datetime.datetime( + np[0], np[1], np[2], np[3], np[4], np[5]) + print('Testing for ' + lang + ' that ' + str(dt) + + ' is date ' + p['assertEqual']) + self.assertEqual(p['assertEqual'], + nice_date(dt, lang=lang, now=now)) + i = i + 1 + + for dt in (datetime.datetime(2017, 12, 30, 0, 2, 3) + + datetime.timedelta(n) for n in range(368)): + self.assertTrue(len(nice_date(dt, lang=lang)) > 0) + + def test_nice_date_time(self): + # TODO: migrate these tests (in res files) to respect the new + # language loading features. Right now, some of them break if + # their languages are not default. + lang = "fa-ir" + set_default_lang(lang) + i = 1 + while (self.test_config[lang].get('test_nice_date_time') and + self.test_config[lang]['test_nice_date_time'].get(str(i))): + p = self.test_config[lang]['test_nice_date_time'][str(i)] + dp = ast.literal_eval(p['datetime_param']) + np = ast.literal_eval(p['now']) + dt = datetime.datetime( + dp[0], dp[1], dp[2], dp[3], dp[4], dp[5], + tzinfo=default_timezone()) + now = None if not np else datetime.datetime( + np[0], np[1], np[2], np[3], np[4], np[5], + tzinfo=default_timezone()) + print('Testing for ' + lang + ' that ' + str(dt) + + ' is date time ' + p['assertEqual']) + self.assertEqual( + p['assertEqual'], + nice_date_time( + dt, lang=lang, now=now, + use_24hour=ast.literal_eval(p['use_24hour']), + use_ampm=ast.literal_eval(p['use_ampm']))) + i = i + 1 + + def test_nice_year(self): + lang = "fa-ir" + i = 1 + while (self.test_config[lang].get('test_nice_year') and + self.test_config[lang]['test_nice_year'].get(str(i))): + p = self.test_config[lang]['test_nice_year'][str(i)] + dp = ast.literal_eval(p['datetime_param']) + dt = datetime.datetime( + dp[0], dp[1], dp[2], dp[3], dp[4], dp[5]) + print('Testing for ' + lang + ' that ' + str(dt) + + ' is year ' + p['assertEqual']) + self.assertEqual(p['assertEqual'], nice_year( + dt, lang=lang, bc=ast.literal_eval(p['bc']))) + i = i + 1 + + # Test all years from 0 to 9999 for all languages, + # that some output is produced + print("Test all years in " + lang) + for i in range(1, 9999): + dt = datetime.datetime(i, 1, 31, 13, 2, 3, tzinfo=default_timezone()) + self.assertTrue(len(nice_year(dt, lang=lang)) > 0) + # Looking through the date sequence can be helpful + + def test_nice_duration(self): + self.assertEqual(nice_duration(1), "یک ثانیه") + self.assertEqual(nice_duration(3), "سه ثانیه") + self.assertEqual(nice_duration(1, speech=False), "0:01") + self.assertEqual(nice_duration(61), "یک دقیقه و یک ثانیه") + self.assertEqual(nice_duration(61, speech=False), "1:01") + self.assertEqual(nice_duration(5000), + "یک ساعت و بیست و سه دقیقه و بیست ثانیه") + self.assertEqual(nice_duration(5000, speech=False), "1:23:20") + self.assertEqual(nice_duration(50000), + "سیزده ساعت و پنجاه و سه دقیقه و بیست ثانیه") + self.assertEqual(nice_duration(50000, speech=False), "13:53:20") + self.assertEqual(nice_duration(500000), + "پنج روز و هجده ساعت و پنجاه و سه دقیقه و بیست ثانیه") + self.assertEqual(nice_duration(500000, speech=False), "5 18:53:20") + self.assertEqual(nice_duration(datetime.timedelta(seconds=500000), + speech=False), + "5 18:53:20") def test_join(self): self.assertEqual(join_list(None, "and"), "") @@ -389,6 +508,5 @@ def test_join(self): self.assertEqual(join_list([1, "ب", 3, "دال"], "یا"), "1, ب, 3 یا دال") - if __name__ == "__main__": unittest.main() diff --git a/test/test_parse_fa.py b/test/test_parse_fa.py index b87909b9..432a7e40 100644 --- a/test/test_parse_fa.py +++ b/test/test_parse_fa.py @@ -15,9 +15,11 @@ # import unittest from datetime import datetime, timedelta +from dateutil import tz from lingua_franca import load_language, unload_language, set_default_lang from lingua_franca.internal import FunctionNotLocalizedError +from lingua_franca.time import default_timezone from lingua_franca.parse import extract_datetime from lingua_franca.parse import extract_duration from lingua_franca.parse import extract_number, extract_numbers @@ -36,44 +38,142 @@ def setUpModule(): def tearDownModule(): unload_language('fa') + class TestNormalize(unittest.TestCase): + + def test_extract_number_ambiguous(self): + # test explicit ordinals + self.assertEqual(extract_number("این تست ۱ام هستش", + ordinals=True), 1) + self.assertEqual(extract_number("این تست ۲ام هست", + ordinals=False), 2) + self.assertEqual(extract_number("این تست ۴ام است", + ordinals=None), 4) + self.assertEqual(extract_number( + "این تست ۷ام هست", ordinals=True), 7) + self.assertEqual(extract_number( + "این تست ۷ام هست", ordinals=False), 7) + self.assertTrue(extract_number("این تست چندم است") is False) + self.assertEqual(extract_number("این تست ۲ام هست"), 2) + self.assertEqual(extract_number("این تست ۳۱ام هست"), 31) + + # test non ambiguous ordinals + self.assertEqual(extract_number("این تست اول هست", + ordinals=True), 1) + self.assertEqual(extract_number("این تست یکم هست", + ordinals=True), 1) + self.assertEqual(extract_number("این تست اول هست", + ordinals=False), False) + self.assertEqual(extract_number("این تست اول هست", + ordinals=None), False) + + # test ambiguous ordinal/fractional + self.assertEqual(extract_number("این تست سوم هست", + ordinals=True), 3.0) + self.assertEqual(extract_number("این تست سوم هست", + ordinals=False), False) + self.assertEqual(extract_number("این تست سوم هست", + ordinals=None), False) + + self.assertEqual(extract_number("یک سوم فنجان", + ordinals=False), 1.0 / 3.0) + self.assertEqual(extract_number("یک سوم فنجان", + ordinals=True), 3) + self.assertEqual(extract_number("یک سوم فنجان", + ordinals=None), 1 / 3) + + # test plurals + self.assertEqual(extract_number("۲ پنجم", + ordinals=True), 5) + self.assertEqual(extract_number("۲ پنجم", + ordinals=False), 2 / 5) + self.assertEqual(extract_number("۲ پنجم", + ordinals=None), 2 / 5) + + self.assertEqual(extract_number("بیست و دو و سه پنجم"), 22.6) + + # test multiple ambiguous + self.assertEqual(extract_number("ششم سوم", ordinals=None), False) + self.assertEqual(extract_number("سی و دوم", ordinals=False), 30) + self.assertEqual(extract_number("سی و دوم", ordinals=None), 30) + self.assertEqual(extract_number("سی و دوم", ordinals=True), 32) + + # test big numbers / short vs long scale + self.assertEqual(extract_number("این تست یک میلیاردم هست", + ordinals=True), 1e09) + self.assertEqual(extract_number("این تست یک میلیاردم هست", + ordinals=None), 1e-9) + + self.assertEqual(extract_number("این تست یک میلیاردم هست", + ordinals=False), 1e-9) + self.assertEqual(extract_number("این تست یک میلیاردم هست", + ordinals=True, + short_scale=False), 1e9) + self.assertEqual(extract_number("این تست یک میلیاردم هست", + ordinals=None, + short_scale=False), 1e-9) + self.assertEqual(extract_number("این تست یک میلیاردم هست", + short_scale=False), 1e-9) def test_extract_number(self): - #self.assertEqual(extract_number("این تست اول است", - # ordinals=True), 1) - self.assertEqual(extract_number("این تست دو است"), 2) - #self.assertEqual(extract_number("این تست دوم است", - # ordinals=True), 2) - #self.assertEqual(extract_number("این تست سوم است", - # ordinals=True), 3.0) - #self.assertEqual(extract_number("چهارمی", ordinals=True), 4.0) - #self.assertEqual(extract_number("سی و ششمی", ordinals=True), 36.0) - self.assertEqual(extract_number("این تست شماره چهار است"), 4) - #self.assertEqual(extract_number("یک سوم فنجان"), 1.0 / 3.0) + self.assertEqual(extract_number("این تست ۲ هست"), 2) + self.assertEqual(extract_number("این تست شماره ۴ هستش"), 4) self.assertEqual(extract_number("سه فنجان"), 3) - #self.assertEqual(extract_number("۱/۳ فنجان"), 1.0 / 3.0) - #self.assertEqual(extract_number("یک چهارم فنجان"), 0.25) - #self.assertEqual(extract_number("۱/۴ فنجان"), 0.25) - #self.assertEqual(extract_number("دو سوم فنجان"), 2.0 / 3.0) - #self.assertEqual(extract_number("سه چهارم فنجان"), 3.0 / 4.0) - #self.assertEqual(extract_number("یک و سه چهارم فنجان"), 1.75) - #self.assertEqual(extract_number("۱ فنجان و نیم"), 1.5) - #self.assertEqual(extract_number("یک فنجان و نیم"), 1.5) + self.assertEqual(extract_number("یک سوم فنجان"), 1.0 / 3.0) + self.assertEqual(extract_number("1/4 فنجان"), 0.25) + self.assertEqual(extract_number("یک چهارم فنجان"), 0.25) + self.assertEqual(extract_number("2/3 فنجان"), 2.0 / 3.0) + # self.assertEqual(extract_number("یک فنجان و نیم"), 1.5) self.assertEqual(extract_number("یک و نیم فنجان"), 1.5) self.assertEqual(extract_number("بیست و دو"), 22) - #self.assertEqual(extract_number("بیست و دو و سه پنجم"), 22.6) self.assertEqual(extract_number("دویست"), 200) self.assertEqual(extract_number("نه هزار"), 9000) - self.assertEqual(extract_number("هزار و پانصد"), 1500) self.assertEqual(extract_number("ششصد و شصت و شش"), 666) self.assertEqual(extract_number("دو میلیون"), 2000000) - self.assertEqual(extract_number("دو هزار و هفده"), 2017) - self.assertEqual(extract_number("شانزده هزار و صد و پونزده"), 16115) - self.assertEqual(extract_number("هجده میلیون و هجده هزار و دویست و هجده"), 18018218) - self.assertEqual(extract_number("دو میلیون و پانصد هزار " - "تن گوشت یخ زده"), 2500000) + self.assertEqual(extract_number("دو میلیون و پانصد هزار تن فلز چرخان"), 2500000) + self.assertEqual(extract_number("شش تریلیون"), 6000000000000.0) + self.assertEqual(extract_number("شش تریلیون", short_scale=False), + 6000000000000.0) + self.assertEqual(extract_number("یک ممیز پنج"), 1.5) + self.assertEqual(extract_number("سه ممیز چهارده"), 3.14) + self.assertEqual(extract_number("سه ممیز بیست و سه"), 3.23) + self.assertEqual(extract_number("دو دهم"), 0.2) + self.assertEqual(extract_number("صد هزار"), 100000) + self.assertEqual(extract_number("منفی ۲"), -2) + self.assertEqual(extract_number("منفی هفتاد"), -70) + + # Verify non-power multiples of ten no longer discard + # adjacent multipliers + self.assertEqual(extract_number("بیست هزار"), 20000) + self.assertEqual(extract_number("پنجاه میلیون"), 50000000) + + # Verify smaller powers of ten no longer cause miscalculation of larger + # powers of ten (see MycroftAI#86) + self.assertEqual(extract_number("بیست میلیارد و سیصد میلیون و \ + نهصد و پنجاه هزار و \ + ششصد و هفتاد و پنج و هشت دهم"), + 20300950675.8) + self.assertEqual(extract_number("نهصد و نود و نه میلیون و \ + نهصد و نود و نه هزار و \ + نهصد و نود و نه و نه دهم"), + 999999999.9) + + # TODO why does "trillion" result in xxxx.0? + self.assertEqual(extract_number("هشصد میلیارد و دویست و پنجاه و هفت"), + 800000000257.0) + + self.assertTrue(extract_number("تنیسور سریع است") is False) + self.assertTrue(extract_number("شکستنی") is False) - def test_extract_duration_en(self): + self.assertTrue(extract_number("صفر شکستنی") is not False) + self.assertEqual(extract_number("صفر شکستنی"), 0) + + self.assertTrue(extract_number("خشن 0") is not False) + self.assertEqual(extract_number("خشن 0"), 0) + + self.assertEqual(extract_number("کاملا 100%"), 100) + + def test_extract_duration_fa(self): self.assertEqual(extract_duration("10 ثانیه"), (timedelta(seconds=10.0), "")) self.assertEqual(extract_duration("5 دقیقه"), @@ -88,21 +188,49 @@ def test_extract_duration_en(self): (timedelta(hours=7), "")) self.assertEqual(extract_duration("7.5 ثانیه"), (timedelta(seconds=7.5), "")) - self.assertEqual(extract_duration("هشت و نیم روز و " - "سی و نه ثانیه"), + self.assertEqual(extract_duration("هشت و نیم روز و سی و نه ثانیه"), (timedelta(days=8.5, seconds=39), "")) - self.assertEqual(extract_duration("یک تایمر برای نیم ساعت دیگه بزار"), - (timedelta(minutes=30), "یک تایمر برای دیگه بزار")) - self.assertEqual(extract_duration("چهار و نیم دقیقه تا " - "طلوع آفتاب"), - (timedelta(minutes=4.5), "تا طلوع آفتاب")) - self.assertEqual(extract_duration("این فیلم یک ساعت و پنجاه و هفت و نیم دقیقه " - "طول می کشد"), + self.assertEqual(extract_duration("سه هفته و چهارصد و نود و هفت روز و " + "سیصد و ۹۱.۶ ثانیه در بیدارم کن"), + (timedelta(weeks=3, days=497, seconds=391.6), + "در بیدارم کن")) + self.assertEqual(extract_duration("10-ثانیه"), + (timedelta(seconds=10.0), "")) + self.assertEqual(extract_duration("5-دقیقه"), + (timedelta(minutes=5), "")) + + def test_extract_duration_case_fa(self): + self.assertEqual(extract_duration("یک تایمر برای ۳۰ دقیقه تنظیم کن"), + (timedelta(minutes=30), "1 تایمر برای تنظیم کن")) + self.assertEqual(extract_duration("این فیلم یک ساعت و " + "پنجاه و هفت و نیم دقیقه است"), (timedelta(hours=1, minutes=57.5), - "این فیلم طول می کشد")) - def test_extractdatetime_en(self): + "این فیلم است")) + self.assertEqual(extract_duration("چهار و نیم دقیقه تا غروب"), + (timedelta(minutes=4.5), "تا غروب")) + self.assertEqual(extract_duration("نوزده دقیقه از ساعت گذشته"), + (timedelta(minutes=19), "از ساعت گذشته")) + + def test_extractdatetime_fractions_fa(self): + def extractWithFormat(text): + date = datetime(2017, 6, 27, 13, 4, tzinfo=default_timezone()) + [extractedDate, leftover] = extract_datetime(text, date) + extractedDate = extractedDate.strftime("%Y-%m-%d %H:%M:%S") + return [extractedDate, leftover] + + def testExtract(text, expected_date, expected_leftover): + res = extractWithFormat(normalize(text)) + self.assertEqual(res[0], expected_date, "for=" + text) + self.assertEqual(res[1], expected_leftover, "for=" + text) + + testExtract("کمین را برای نیم ساعت تنظیم کن", + "2017-06-27 13:34:00", "کمین را تنظیم کن") + testExtract("در نیم ساعت یادم بنداز به مادرم زنگ بزنم", + "2017-06-27 13:34:00", "یادم بنداز به مادرم زنگ بزنم") + + def test_extractdatetime_fa(self): def extractWithFormat(text): - date = datetime(2017, 6, 27, 13, 4) # Tue June 27, 2017 @ 1:04pm + date = datetime(2017, 6, 27, 13, 4, tzinfo=default_timezone()) # Tue June 27, 2017 @ 1:04pm [extractedDate, leftover] = extract_datetime(text, date) extractedDate = extractedDate.strftime("%Y-%m-%d %H:%M:%S") return [extractedDate, leftover] @@ -112,59 +240,311 @@ def testExtract(text, expected_date, expected_leftover): self.assertEqual(res[0], expected_date, "for=" + text) self.assertEqual(res[1], expected_leftover, "for=" + text) - testExtract("الان ساعت اینه", - "2017-06-27 13:04:00", "ساعت اینه") - testExtract("یک ثانیه دیگه", + testExtract("الان وقتشه", + "2017-06-27 13:04:00", "وقتشه") + testExtract("یک ثانیه", "2017-06-27 13:04:01", "") - testExtract("یک دقیقه دیگه", + testExtract("یک دقیقه", "2017-06-27 13:05:00", "") - testExtract("دو دقیقه دیگه", - "2017-06-27 13:06:00", "") - testExtract("دو ساعت دیگه", - "2017-06-27 15:04:00", "") - testExtract("من یک ساعت دیگه می خوامش", - "2017-06-27 14:04:00", "من می خوامش") - testExtract("1 ثانیه دیگه", + testExtract("یک دهه", + "2027-06-27 00:00:00", "") + testExtract("دهه بعد", + "2027-06-27 00:00:00", "") + testExtract("یک قرن", + "2117-06-27 00:00:00", "") + testExtract("یک هزاره", + "3017-06-27 00:00:00", "") + testExtract("یک ساعت", + "2017-06-27 14:04:00", "") + testExtract("من آن را در عرض یک ساعت می خواهم", + "2017-06-27 14:04:00", "من آن را می خواهم") + testExtract("۱ ثانیه", "2017-06-27 13:04:01", "") - testExtract("2 ثانیه دیگه", - "2017-06-27 13:04:02", "") - testExtract("یک آلارم برای یک دقیقه بعد بزار", - "2017-06-27 13:05:00", "یک آلارم برای بزار") - testExtract("یک آلارم برای نیم ساعت دیگه بزار", - "2017-06-27 13:34:00", "یک آلارم برای بزار") - testExtract("یه آلارم برای پنج روز بعد بزار", - "2017-07-02 00:00:00", "یه آلارم برای بزار") - testExtract("پس فردا", + testExtract("کمین را برای یک دقیقه تنظیم کن", + "2017-06-27 13:05:00", "کمین را تنظیم کن") + testExtract("از امروز برای ۵ روز کمین بگذارید", + "2017-07-02 00:00:00", "کمین بگذارید") + testExtract("پسفردا", "2017-06-29 00:00:00", "") - testExtract("آب و هوا پس فردا چطوره؟", - "2017-06-29 00:00:00", "آب و هوا چطوره؟") - #testExtract("ساعت بیست و دو و چهل و پنج دقیقه بهم یادآوری کن", - # "2017-06-27 22:45:00", "بهم یادآوری کن") - testExtract("هوای جمعه صبح چطوره؟", - "2017-06-30 08:00:00", "هوای چطوره؟") - testExtract("هوای فردا چطوره؟", - "2017-06-28 00:00:00", "هوای چطوره؟") - testExtract("هوای امروز بعد از ظهر چطوره؟", - "2017-06-27 15:00:00", "هوای چطوره؟") - testExtract("یادم بنداز که هشت هفته و دو روز دیگه به مادرم زنگ بزنم", - "2017-08-24 00:00:00", "یادم بنداز که به مادرم زنگ بزنم") - #testExtract("یادم بنداز که دوازده مرداد به مادرم زنگ بزنم", - # "2017-08-03 00:00:00", "یادم بنداز که به مادرم زنگ بزنم") - #testExtract("یادم بنداز که ساعت هفت به مادرم زنگ بزنم", - # "2017-06-28 07:00:00", "یادم بنداز که به مادرم زنگ بزنم") - #testExtract("یادم بنداز که فردا ساعت بیست و دو به مادرم زنگ بزنم", - # "2017-06-28 22:00:00", "یادم بنداز که به مادرم زنگ بزنم") - # TODO: This test is imperfect due to the "at 7:00" still in the - # remainder. But let it pass for now since time is correct + testExtract("پسفردا هوا چطوره؟", + "2017-06-29 00:00:00", "هوا چطوره") + testExtract("10:45 بعد از ظهر یادم بنداز", + "2017-06-27 22:45:00", "یادم بنداز") + testExtract("جمعه صبح هوا چطوره", + "2017-06-30 08:00:00", "هوا چطوره") + testExtract("فردا هوا چطوره", + "2017-06-28 00:00:00", "هوا چطوره") + testExtract("بعد از ظهر هوا چطوره", + "2017-06-27 15:00:00", "هوا چطوره") + testExtract("عصر هوا چطوره", + "2017-06-27 19:00:00", "هوا چطوره") + testExtract("امروز صبح هوا چطور بود", + "2017-06-27 08:00:00", "هوا چطور بود") + testExtract("بعد از ۸ هفته و دو روز یادم بنداز به مادر زنگ بزنم", + "2017-08-24 00:00:00", "یادم بنداز به مادر زنگ بزنم") + testExtract("سوم اوت یادم بنداز به مادر زنگ بزنم", + "2017-08-03 00:00:00", "یادم بنداز به مادر زنگ بزنم") + testExtract("فردا ساعت ۷ صبح یادم بنداز به مادر زنگ بزنم", + "2017-06-28 07:00:00", "یادم بنداز به مادر زنگ بزنم") + testExtract("فردا ساعت ۱۰ شب یادم بنداز به مادر زنگ بزنم", + "2017-06-28 22:00:00", "یادم بنداز به مادر زنگ بزنم") + testExtract("ساعت ۷ صبح یادم بنداز به مادر زنگ بزنم", + "2017-06-28 07:00:00", "یادم بنداز به مادر زنگ بزنم") + testExtract("بعد یک ساعت یادم بنداز به مادر زنگ بزنم", + "2017-06-27 14:04:00", "یادم بنداز به مادر زنگ بزنم") + testExtract("ساعت ۱۷ و ۳۰ یادم بنداز به مادر زنگ بزنم", + "2017-06-27 17:30:00", "یادم بنداز به مادر زنگ بزنم") + testExtract("ساعت ۰۶ و ۳۰ یادم بنداز به مادر زنگ بزنم", + "2017-06-28 06:30:00", "یادم بنداز به مادر زنگ بزنم") + testExtract("ساعت ۰۶ ۳۰ یادم بنداز به مادر زنگ بزنم", + "2017-06-28 06:30:00", "یادم بنداز به مادر زنگ بزنم") + testExtract("پنجشنبه صبح ساعت ۷ یادم بنداز به مادر زنگ بزنم", + "2017-06-29 07:00:00", "یادم بنداز به مادر زنگ بزنم") + testExtract("پنجشنبه عصر ساعت ۷ یادم بنداز به مادر زنگ بزنم", + "2017-06-29 19:00:00", "یادم بنداز به مادر زنگ بزنم") + testExtract("بعد ۲ ساعت یادم بنداز به مادر زنگ بزنم", + "2017-06-27 15:04:00", "یادم بنداز به مادر زنگ بزنم") + testExtract("۲ ساعت دیگه یادم بنداز به مادر زنگ بزنم", + "2017-06-27 15:04:00", "یادم بنداز به مادر زنگ بزنم") + testExtract("پانزده دقیقه دیگه یادم بنداز به مادر زنگ بزنم", + "2017-06-27 13:19:00", "یادم بنداز به مادر زنگ بزنم") + testExtract("بعد پانزده دقیقه یادم بنداز به مادر زنگ بزنم", + "2017-06-27 13:19:00", "یادم بنداز به مادر زنگ بزنم") + testExtract("پنجشنبه ساعت ۳:۴۵ بعد از ظهر حمله رو شروع کنید", + "2017-06-29 15:45:00", "حمله رو شروع کنید") + testExtract("دوشنبه از نانوایی نان سفارش بده", + "2017-07-03 00:00:00", "از نانوایی نان سفارش بده") + testExtract("پنجشنبه بعد ساعت ۱۲:۴۵ به مامان زنگ بزن", + "2017-07-06 12:45:00", "به مامان زنگ بزن") + testExtract("جمعه بعد هوا چطوره؟", + "2017-06-30 00:00:00", "هوا چطوره") + testExtract("چهارشنبه بعد هوا چطوره؟", + "2017-07-05 00:00:00", "هوا چطوره") + testExtract("جمعه بعد صبح هوا چطوره؟", + "2017-06-30 08:00:00", "هوا چطوره") + testExtract("جمعه بعد عصر هوا چطوره؟", + "2017-06-30 19:00:00", "هوا چطوره") + testExtract("جمعه بعد بعد از ظهر هوا چطوره", + "2017-06-30 15:00:00", "هوا چطوره") + testExtract("۴ام جولای آتش بازی بخر", + "2017-07-04 00:00:00", "آتش بازی بخر") + testExtract("۴ام ژوئیه آتش بازی بخر", + "2017-07-04 00:00:00", "آتش بازی بخر") + testExtract("این پنجشنبه هوا چطوره", + "2017-06-29 00:00:00", "هوا چطوره") + testExtract("پنجشنبه ساعت ۸ بعد از ظهر حمله رو شروع کنید", + "2017-06-29 20:00:00", "حمله رو شروع کنید") + testExtract("ظهر پنجشنبه حمله رو شروع کنید", + "2017-06-29 12:00:00", "حمله رو شروع کنید") + testExtract("نصف شب پنجشنبه حمله رو شروع کنید", + "2017-06-29 00:00:00", "حمله رو شروع کنید") + testExtract("۴ سال بعد یادم بنداز بیدار شم", + "2021-06-27 00:00:00", "یادم بنداز بیدار شم") + testExtract("۴ سال و ۴ روز بعد یادم بنداز بیدار شم", + "2021-07-01 00:00:00", "یادم بنداز بیدار شم") + testExtract("۳ دسامبر", + "2017-12-03 00:00:00", "") + testExtract("امشب ساعت ۸:۰۰ ملاقات کنیم", + "2017-06-27 20:00:00", "ملاقات کنیم") + testExtract("۵ عصر ملاقات کنیم", + "2017-06-27 17:00:00", "ملاقات کنیم") + testExtract("۸ صبح ملاقات کنیم", + "2017-06-28 08:00:00", "ملاقات کنیم") + testExtract("ساعت ۸ صبح بیدارم کن", + "2017-06-28 08:00:00", "بیدارم کن") + testExtract("پنجشنبه هوا چطوره", + "2017-06-29 00:00:00", "هوا چطوره") + testExtract("دوشنبه هوا چطوره", + "2017-07-03 00:00:00", "هوا چطوره") + testExtract("این چهارشنبه هوا چطوره", + "2017-06-28 00:00:00", "هوا چطوره") + testExtract("دوشنبه قبل هوا چطور بود", + "2017-06-26 00:00:00", "هوا چطور بود") + testExtract("برای چهارشنبه ساعت ۸ عصر هشدار تنظیم کن", + "2017-06-28 20:00:00", "هشدار تنظیم کن") + testExtract("عصر پنجم ژوئن 2017 یادم بنداز به مامان زنگ بزنم", + "2017-06-05 19:00:00", "یادم بنداز به مامان زنگ بزنم") + testExtract("بعد ۳ هفته یادم بنداز به مامان زنگ بزنم", + "2017-07-18 00:00:00", "یادم بنداز به مامان زنگ بزنم") + testExtract("بعد ۸ هفته و دو روز یادم بنداز به مامان زنگ بزنم", + "2017-08-24 00:00:00", "یادم بنداز به مامان زنگ بزنم") + testExtract("بعد ۴ روز یادم بنداز به مامان زنگ بزنم", + "2017-07-01 00:00:00", "یادم بنداز به مامان زنگ بزنم") + testExtract("بعد ۳ ماه یادم بنداز به مامان زنگ بزنم", + "2017-09-27 00:00:00", "یادم بنداز به مامان زنگ بزنم") + testExtract("بعد ۲ سال و دو روز یادم بنداز به مامان زنگ بزنم", + "2019-06-29 00:00:00", "یادم بنداز به مامان زنگ بزنم") + testExtract("هفته بعد یادم بنداز به مامان زنگ بزنم", + "2017-07-04 00:00:00", "یادم بنداز به مامان زنگ بزنم") + testExtract("شنبه ۱۰ صبح یادم بنداز به مامان زنگ بزنم", + "2017-07-01 10:00:00", "یادم بنداز به مامان زنگ بزنم") + testExtract("شنبه بعد ساعت ۱۰ صبح یادم بنداز به مامان زنگ بزنم", + "2017-07-01 10:00:00", "یادم بنداز به مامان زنگ بزنم") + + # test yesterday + testExtract("دیروز چه روزی بود", + "2017-06-26 00:00:00", "چه روزی بود") + testExtract("دیروز ساعت ۶ شام خوردم", + "2017-06-26 06:00:00", "شام خوردم") + testExtract("دیروز ساعت ۶ صبح شام خوردم", + "2017-06-26 06:00:00", "شام خوردم") + testExtract("دیروز ساعت ۶ عصر شام خوردم", + "2017-06-26 18:00:00", "شام خوردم") + + testExtract("امشب ۸", + "2017-06-27 20:00:00", "") + testExtract("امشب ۸:۳۰", + "2017-06-27 20:30:00", "") + # Tests a time with ':' & without am/pm + testExtract("برای امشب ۹:۳۰ هشدار تنظیم کن", + "2017-06-27 21:30:00", "هشدار تنظیم کن") + testExtract("برای امشب ۹:۰۰ هشدار تنظیم کن", + "2017-06-27 21:00:00", "هشدار تنظیم کن") + + # "# days ago>" + testExtract("تولدم ۱ روز پیش بود", + "2017-06-26 00:00:00", "تولدم بود") + testExtract("تولدم ۲ روز پیش بود", + "2017-06-25 00:00:00", "تولدم بود") + + def test_extract_ambiguous_time_fa(self): + morning = datetime(2017, 6, 27, 8, 1, 2, tzinfo=default_timezone()) + evening = datetime(2017, 6, 27, 20, 1, 2, tzinfo=default_timezone()) + noonish = datetime(2017, 6, 27, 12, 1, 2, tzinfo=default_timezone()) + self.assertEqual( + extract_datetime('به ماهی غذا بده'), None) + self.assertEqual( + extract_datetime('روز'), None) + self.assertEqual( + extract_datetime('هفته'), None) + self.assertEqual( + extract_datetime('ماه'), None) + self.assertEqual( + extract_datetime('سال'), None) + self.assertEqual( + extract_datetime(' '), None) + + def test_extract_relativedatetime_fa(self): + def extractWithFormat(text): + date = datetime(2017, 6, 27, 10, 1, 2, tzinfo=default_timezone()) + [extractedDate, leftover] = extract_datetime(text, date) + extractedDate = extractedDate.strftime("%Y-%m-%d %H:%M:%S") + return [extractedDate, leftover] + + def testExtract(text, expected_date, expected_leftover): + res = extractWithFormat(normalize(text)) + self.assertEqual(res[0], expected_date, "for=" + text) + self.assertEqual(res[1], expected_leftover, "for=" + text) + + testExtract("۵ دقیقه بعد ملاقات کنیم", + "2017-06-27 10:06:02", "ملاقات کنیم") + testExtract("۵دقیقه بعد ملاقات کنیم", + "2017-06-27 10:06:02", "ملاقات کنیم") + testExtract("۵ ثانیه بعد ملاقات کنیم", + "2017-06-27 10:01:07", "ملاقات کنیم") + testExtract("یک ساعت بعد ملاقات کنیم", + "2017-06-27 11:01:02", "ملاقات کنیم") + testExtract("۲ساعت بعد ملاقات کنیم", + "2017-06-27 12:01:02", "ملاقات کنیم") + testExtract("۵ثانیه بعد ملاقات کنیم", + "2017-06-27 10:01:07", "ملاقات کنیم") + + def test_normalize_numbers(self): + self.assertEqual(normalize("ساعت دو به دو یادم بنداز"), + "ساعت 2 به 2 یادم بنداز") + self.assertEqual(normalize('دو دقیقه بعد ساعت چند میشه'), + '2 دقیقه بعد ساعت چند میشه') + self.assertEqual(normalize('بیست و دو دقیقه بعد ساعت چند میشه'), + '22 دقیقه بعد ساعت چند میشه') + self.assertEqual(normalize("بیست به دو یادم بنداز"), + "20 به 2 یادم بنداز") + + # test ordinals + self.assertEqual(normalize('این اولیه'), + 'این اول') + self.assertEqual(normalize('این اول دومیه'), + 'این یکم دوم') + + def test_extract_date_with_number_words(self): + now = datetime(2019, 7, 4, 8, 1, 2, tzinfo=default_timezone()) + self.assertEqual( + extract_datetime('۲ دقیقه بعد ساعت چند میشه', now)[0], + datetime(2019, 7, 4, 8, 3, 2, tzinfo=default_timezone())) + self.assertEqual( + extract_datetime('دو دقیقه بعد ساعت چند میشه', now)[0], + datetime(2019, 7, 4, 8, 3, 2, tzinfo=default_timezone())) + self.assertEqual( + extract_datetime('دویست دقیقه بعد چی میشه', now)[0], + datetime(2019, 7, 4, 11, 21, 2, tzinfo=default_timezone())) + + def test_numbers(self): + self.assertEqual(normalize("این تست یک دو سه است"), + "این تست 1 2 3 است") + self.assertEqual(normalize(" این تست چهار پنج شش است"), + "این تست 4 5 6 است") + self.assertEqual(normalize("این تست هفت هشت نه است"), + "این تست 7 8 9 است") + self.assertEqual(normalize("این تست ده یازده دوازده است"), + "این تست 10 11 12 است") + self.assertEqual(normalize("این تست سیزده چهارده است"), + "این تست 13 14 است") + self.assertEqual(normalize("این پانزده شانزده هفده است"), + "این 15 16 17 است") + self.assertEqual(normalize("این هجده نوزده بیست است"), + "این 18 19 20 است") + self.assertEqual(normalize("این یک نوزده بیست و دو است"), + "این 1 19 22 است") + self.assertEqual(normalize("این صد است"), + "این 100 است") + self.assertEqual(normalize("این یک دو بیست و دو است"), + "این 1 2 22 است") + self.assertEqual(normalize("این یک و نیم است"), + "این 1.5 است") + self.assertEqual(normalize("این یک و نیم و پنج و شش است"), + "این 1.5 5 6 است") def test_multiple_numbers(self): - self.assertEqual(extract_numbers("یک دو سه"), + self.assertEqual(extract_numbers("این تست یک دو سه است"), [1.0, 2.0, 3.0]) - self.assertEqual(extract_numbers("ده بیست سه پونزده هزار و شصت و شونزده"), - [10, 20, 3, 15060, 16]) - - + self.assertEqual(extract_numbers("این تست چهار پنج شش است"), + [4.0, 5.0, 6.0]) + self.assertEqual(extract_numbers("این تست ده یازده دوازده است"), + [10.0, 11.0, 12.0]) + self.assertEqual(extract_numbers("این تست یک بیست و یک است"), + [1.0, 21.0]) + self.assertEqual(extract_numbers("۱ سگ، هفت خوک، مک دونالد مزرعه دارد " + "۳ ضرب در ۵"), + [1.0, 7.0, 3.0, 5.0]) + self.assertEqual(extract_numbers("دو آبجو برای دو خرس"), + [2.0, 2.0]) + self.assertEqual(extract_numbers("بیست ۲۰ بیست"), + [20, 20, 20]) + self.assertEqual(extract_numbers("بیست ۲۰ ۲۲"), + [20.0, 20.0, 22.0]) + self.assertEqual(extract_numbers("بیست بیست و دو بیست"), + [20, 22, 20]) + self.assertEqual(extract_numbers("بیست و ۲"), + [22.0]) + self.assertEqual(extract_numbers("بیست ۲۰ بیست و ۲"), + [20, 20, 22]) + self.assertEqual(extract_numbers("یک سوم یک"), + [1 / 3, 1]) + self.assertEqual(extract_numbers("سومی", ordinals=True), [3]) + self.assertEqual(extract_numbers("شش تریلیون", short_scale=True), + [6e12]) + self.assertEqual(extract_numbers("شش تریلیون", short_scale=False), + [6e12]) + self.assertEqual(extract_numbers("دو خوک و شش تریلیون باکتری", + short_scale=True), [2, 6e12]) + self.assertEqual(extract_numbers("دو خوک و شش تریلیون باکتری", + short_scale=False), [2, 6e12]) + self.assertEqual(extract_numbers("سی و دوم یا یک", + ordinals=True), [32.0]) + self.assertEqual(extract_numbers("این تست هفت هشت نه و نیم است"), + [7.0, 8.0, 9.5]) + # def test_contractions(self): + # pass if __name__ == "__main__": unittest.main()