From 9690ffa3e8fe5b28be60ad0ff84446c4fde2014c Mon Sep 17 00:00:00 2001 From: jarbasal Date: Sun, 9 May 2021 13:18:37 +0100 Subject: [PATCH] feat/support_decimal_markers rebase of https://github.com/MycroftAI/lingua-franca/pull/69 --- lingua_nostra/lang/parse_cs.py | 31 ++++++++---- lingua_nostra/lang/parse_da.py | 48 +++++++++++------- lingua_nostra/lang/parse_de.py | 59 +++++++++++++-------- lingua_nostra/lang/parse_en.py | 17 ++++++- lingua_nostra/lang/parse_es.py | 44 +++++++++++----- lingua_nostra/lang/parse_fa.py | 93 +++++++++++++++------------------- lingua_nostra/lang/parse_fr.py | 47 ++++++++++++----- lingua_nostra/lang/parse_it.py | 32 ++++++++---- lingua_nostra/lang/parse_nl.py | 37 +++++++++----- lingua_nostra/lang/parse_pl.py | 35 +++++++++---- lingua_nostra/lang/parse_pt.py | 21 ++++++-- lingua_nostra/lang/parse_sv.py | 11 +++- lingua_nostra/parse.py | 31 ++++++++++-- test/test_parse.py | 11 ++++ test/test_parse_fa.py | 4 +- 15 files changed, 350 insertions(+), 171 deletions(-) diff --git a/lingua_nostra/lang/parse_cs.py b/lingua_nostra/lang/parse_cs.py index 601a06b7..cc00db8b 100644 --- a/lingua_nostra/lang/parse_cs.py +++ b/lingua_nostra/lang/parse_cs.py @@ -23,7 +23,7 @@ _LONG_ORDINAL_CS, _LONG_SCALE_CS, _SHORT_SCALE_CS, _SHORT_ORDINAL_CS, \ _FRACTION_STRING_CS, _MONTHS_CONVERSION, _MONTHS_CZECH, _TIME_UNITS_CONVERSION, \ _ORDINAL_BASE_CS # _ARTICLES_CS - +from lingua_nostra.parse import normalize_decimals import re import json from lingua_nostra import resolve_resource_file @@ -579,7 +579,7 @@ def _initialize_number_data(short_scale): return multiplies, string_num_ordinal_cs, string_num_scale_cs -def extract_number_cs(text, short_scale=True, ordinals=False): +def extract_number_cs(text, short_scale=True, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -590,11 +590,17 @@ def extract_number_cs(text, short_scale=True, ordinals=False): text (str): the string 
to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) return _extract_number_with_text_cs(tokenize(text.lower()), short_scale, ordinals).value @@ -1560,20 +1566,25 @@ def isFractional_cs(input_str, short_scale=True): return False -def extract_numbers_cs(text, short_scale=True, ordinals=False): +def extract_numbers_cs(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + text (str): the string to extract numbers from + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + list: list of extracted numbers + as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified.
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_cs(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_nostra/lang/parse_da.py b/lingua_nostra/lang/parse_da.py index 15871fc9..227e672c 100644 --- a/lingua_nostra/lang/parse_da.py +++ b/lingua_nostra/lang/parse_da.py @@ -20,22 +20,31 @@ from lingua_nostra.lang.common_data_da import _DA_NUMBERS from lingua_nostra.lang.format_da import pronounce_number_da from lingua_nostra.time import now_local +from lingua_nostra.parse import normalize_decimals -def extract_number_da(text, short_scale=True, ordinals=False): +def extract_number_da(text, short_scale=True, ordinals=False, decimal='.'): """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (int) or (float): The value of extracted number - - - undefined articles cannot be suppressed in German: - 'ein Pferd' means 'one horse' and 'a horse' + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. 
@@ -869,20 +878,25 @@ def normalize_da(text, remove_articles=True): return normalized[1:] # strip the initial space -def extract_numbers_da(text, short_scale=True, ordinals=False): +def extract_numbers_da(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Args: + text (str): the string to extract numbers from + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + list: list of the numbers + extracted from the text + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified.
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_da, extract_number_da, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_nostra/lang/parse_de.py b/lingua_nostra/lang/parse_de.py index 315a1398..ba9101ff 100644 --- a/lingua_nostra/lang/parse_de.py +++ b/lingua_nostra/lang/parse_de.py @@ -21,6 +21,7 @@ from lingua_nostra.lang.common_data_de import _DE_NUMBERS from lingua_nostra.lang.format_de import pronounce_number_de from lingua_nostra.time import now_local +from lingua_nostra.parse import normalize_decimals de_numbers = { @@ -143,20 +144,28 @@ def repl(match): return (duration, text) -def extract_number_de(text, short_scale=True, ordinals=False): +def extract_number_de(text, short_scale=True, ordinals=False, decimal='.'): """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (int) or (float): The value of extracted number - - - undefined articles cannot be suppressed in German: - 'ein Pferd' means 'one horse' and 'a horse' + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. 
@@ -1003,20 +1012,28 @@ def normalize_de(text, remove_articles=True): return normalized[1:] # strip the initial space -def extract_numbers_de(text, short_scale=True, ordinals=False): - """ - Takes in a string and extracts a list of numbers. - - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 - Returns: - list: list of extracted numbers as floats +def extract_numbers_de(text, short_scale=True, ordinals=False, decimal='.'): """ + This function extracts a list of numbers from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + + Args: + text (str): the string to extract numbers from + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' + Returns: + list: list of the numbers + extracted from the text + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified.
+ + """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_de, extract_number_de, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_nostra/lang/parse_en.py b/lingua_nostra/lang/parse_en.py index 44d1c2eb..829b5adc 100644 --- a/lingua_nostra/lang/parse_en.py +++ b/lingua_nostra/lang/parse_en.py @@ -30,6 +30,7 @@ import re import json from lingua_nostra.internal import resolve_resource_file +from lingua_nostra.parse import normalize_decimals def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False): @@ -530,7 +531,7 @@ def _initialize_number_data_en(short_scale, speech=True): return multiplies, string_num_ordinal_en, string_num_scale_en -def extract_number_en(text, short_scale=True, ordinals=False): +def extract_number_en(text, short_scale=True, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -541,11 +542,17 @@ def extract_number_en(text, short_scale=True, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) return _extract_number_with_text_en(tokenize(text.lower()), short_scale, ordinals).value @@ -1453,7 +1460,7 @@ def is_fractional_en(input_str, short_scale=True, spoken=True): return False -def extract_numbers_en(text, short_scale=True, ordinals=False): +def extract_numbers_en(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. 
@@ -1464,9 +1471,15 @@ def extract_numbers_en(text, short_scale=True, ordinals=False): is now common in most English speaking countries. See https://en.wikipedia.org/wiki/Names_of_large_numbers ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_en(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_nostra/lang/parse_es.py b/lingua_nostra/lang/parse_es.py index ead3c875..3fdf2586 100644 --- a/lingua_nostra/lang/parse_es.py +++ b/lingua_nostra/lang/parse_es.py @@ -20,6 +20,7 @@ from lingua_nostra.lang.format_es import pronounce_number_es from lingua_nostra.lang.parse_common import * from lingua_nostra.lang.common_data_es import _ARTICLES_ES, _STRING_NUM_ES +from lingua_nostra.parse import normalize_decimals def is_fractional_es(input_str, short_scale=True): @@ -56,16 +57,28 @@ def is_fractional_es(input_str, short_scale=True): return False -def extract_number_es(text, short_scale=True, ordinals=False): +def extract_number_es(text, short_scale=True, ordinals=False, decimal='.'): """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' 
Returns: - (int) or (float): The value of extracted number + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. @@ -268,20 +281,25 @@ def es_number(i): return es_number(i) -def extract_numbers_es(text, short_scale=True, ordinals=False): +def extract_numbers_es(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Args: + text (str): the string to extract numbers from + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + list: list of the numbers + extracted from the text + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified.
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_es, extract_number_es, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_nostra/lang/parse_fa.py b/lingua_nostra/lang/parse_fa.py index 30e41fc1..cf2897aa 100644 --- a/lingua_nostra/lang/parse_fa.py +++ b/lingua_nostra/lang/parse_fa.py @@ -21,6 +21,7 @@ import json from lingua_nostra.internal import resolve_resource_file +from lingua_nostra.parse import normalize_decimals def _is_number(s): @@ -30,6 +31,7 @@ def _is_number(s): except ValueError: return False + def _parse_sentence(text): for key, value in _FORMAL_VARIANT.items(): text = text.replace(key, value) @@ -40,6 +42,7 @@ def _parse_sentence(text): s = 0 step = 10 mode = 'init' + def finish_num(): nonlocal current_number nonlocal s @@ -53,13 +56,14 @@ def finish_num(): current_number = 0 current_words = [] mode = 'init' + for x in ar: if x == "و": if mode == 'num_ten' or mode == 'num_hundred' or mode == 'num_one': mode += '_va' current_words.append(x) elif mode == 'num': - current_words.append(x) + current_words.append(x) else: finish_num() result.append(x) @@ -70,7 +74,7 @@ def finish_num(): elif x in _FARSI_ONES: t = _FARSI_ONES.index(x) if mode != 'init' and mode != 'num_hundred_va' and mode != 'num': - if not(t < 10 and mode == 'num_ten_va'): + if not (t < 10 and mode == 'num_ten_va'): finish_num() current_words.append(x) s += t @@ -79,20 +83,20 @@ def finish_num(): if mode != 'init' and mode != 'num_hundred_va' and mode != 'num': finish_num() current_words.append(x) - s += _FARSI_TENS.index(x)*10 + s += _FARSI_TENS.index(x) * 10 mode = 'num_ten' elif x in _FARSI_HUNDREDS: if mode != 'init' and mode != 'num': finish_num() current_words.append(x) - s += _FARSI_HUNDREDS.index(x)*100 + s += _FARSI_HUNDREDS.index(x) * 100 mode = 'num_hundred' elif x in _FARSI_BIG: current_words.append(x) d = _FARSI_BIG.index(x) if mode == 'init' and d == 1: s = 1 - s *= 10**(3*d) + s *= 10 
** (3 * d) current_number += s s = 0 mode = 'num' @@ -119,6 +123,7 @@ def finish_num(): 'هفته': timedelta(weeks=1), } + def extract_duration_fa(text): """ Convert an english phrase into a number of seconds @@ -207,9 +212,8 @@ def extract_datetime_fa(text, anchorDate=None, default_time=None): .replace('سه شنبه', 'سهشنبه') \ .replace('چهار شنبه', 'چهارشنبه') \ .replace('پنج شنبه', 'پنجشنبه') \ - .replace('بعد از ظهر', 'بعدازظهر') \ - - + .replace('بعد از ظهر', 'بعدازظهر') + if not anchorDate: anchorDate = datetime.now() today = anchorDate.replace(hour=0, minute=0, second=0, microsecond=0) @@ -224,11 +228,11 @@ def extract_datetime_fa(text, anchorDate=None, default_time=None): 'یکشنبه', ] daysDict = { - 'پریروز': today + timedelta(days= -2), - 'دیروز': today + timedelta(days= -1), + 'پریروز': today + timedelta(days=-2), + 'دیروز': today + timedelta(days=-1), 'امروز': today, - 'فردا': today + timedelta(days= 1), - 'پسفردا': today + timedelta(days= 2), + 'فردا': today + timedelta(days=1), + 'پسفردا': today + timedelta(days=2), } timesDict = { 'صبح': timedelta(hours=8), @@ -306,49 +310,26 @@ def extract_datetime_fa(text, anchorDate=None, default_time=None): remainder.append(x) return (result, " ".join(remainder)) -def is_fractional_fa(input_str, short_scale=True): - """ - This function takes the given text and checks if it is a fraction. - - Args: - input_str (str): the string to check if fractional - short_scale (bool): use short scale if True, long scale if False - Returns: - (bool) or (float): False if not a fraction, otherwise the fraction - - """ - if input_str.endswith('s', -1): - input_str = input_str[:len(input_str) - 1] # e.g. 
"fifths" - - fracts = {"whole": 1, "half": 2, "halve": 2, "quarter": 4} - if short_scale: - for num in _SHORT_ORDINAL_FA: - if num > 2: - fracts[_SHORT_ORDINAL_FA[num]] = num - else: - for num in _LONG_ORDINAL_FA: - if num > 2: - fracts[_LONG_ORDINAL_FA[num]] = num - - if input_str.lower() in fracts: - return 1.0 / fracts[input_str.lower()] - return False - -def extract_numbers_fa(text, short_scale=True, ordinals=False): +def extract_numbers_fa(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Args: + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) ar = _parse_sentence(text) result = [] @@ -358,7 +339,7 @@ def extract_numbers_fa(text, short_scale=True, ordinals=False): return result -def extract_number_fa(text, ordinals=False): +def extract_number_fa(text, short_scale=True, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -369,21 +350,29 @@ def extract_number_fa(text, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) x = extract_numbers_fa(text, ordinals=ordinals) if (len(x) == 0): return False return x[0] -class EnglishNormalizer(Normalizer): + +class FarsiNormalizer(Normalizer): + # TODO with open(resolve_resource_file("text/en-us/normalize.json")) as f: _default_config = json.load(f) def normalize_fa(text, remove_articles=True): """ English string normalization """ - return EnglishNormalizer().normalize(text, remove_articles) + return FarsiNormalizer().normalize(text, remove_articles) diff --git a/lingua_nostra/lang/parse_fr.py b/lingua_nostra/lang/parse_fr.py index 3264d016..2ae0cf41 100644 --- a/lingua_nostra/lang/parse_fr.py +++ b/lingua_nostra/lang/parse_fr.py @@ -23,6 +23,7 @@ from lingua_nostra.lang.common_data_fr import _ARTICLES_FR, _NUMBERS_FR, \ _ORDINAL_ENDINGS_FR from lingua_nostra.time import now_local +from lingua_nostra.parse import normalize_decimals def extract_duration_fr(text): @@ -369,13 +370,28 @@ def _number_ordinal_fr(words, i): return None -def 
extract_number_fr(text, short_scale=True, ordinals=False): - """Takes in a string and extracts a number. +def extract_number_fr(text, short_scale=True, ordinals=False, decimal='.'): + """ + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: - text (str): the string to extract a number from + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (str): The number extracted or the original text. + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. @@ -1068,20 +1084,25 @@ def normalize_fr(text, remove_articles=True): return normalized[1:] # strip the initial space -def extract_numbers_fr(text, short_scale=True, ordinals=False): +def extract_numbers_fr(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. 
third=3 instead of 1/3 + Args: + text (str): the string to extract numbers from + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + list: list of the numbers + extracted from the text + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_fr, extract_number_fr, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_nostra/lang/parse_it.py b/lingua_nostra/lang/parse_it.py index c4e77263..e76f663b 100644 --- a/lingua_nostra/lang/parse_it.py +++ b/lingua_nostra/lang/parse_it.py @@ -28,6 +28,7 @@ pronounce_number_it from lingua_nostra.lang.common_data_it import _SHORT_ORDINAL_STRING_IT, \ _ARTICLES_IT, _LONG_ORDINAL_STRING_IT, _STRING_NUM_IT +from lingua_nostra.parse import normalize_decimals def is_fractional_it(input_str, short_scale=False): @@ -224,7 +225,7 @@ def _extract_number_long_it(word): return value -def extract_number_it(text, short_scale=False, ordinals=False): +def extract_number_it(text, short_scale=False, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -235,11 +236,17 @@ def extract_number_it(text, short_scale=False, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.'
Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) text = text.lower() string_num_ordinal_it = {} @@ -1148,20 +1155,25 @@ def get_gender_it(word, context=""): return gender -def extract_numbers_it(text, short_scale=False, ordinals=False): +def extract_numbers_it(text, short_scale=False, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. - Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + Args: + text (str): the string to extract numbers from + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + list: list of the numbers + extracted from the text + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified.
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_it, extract_number_it, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_nostra/lang/parse_nl.py b/lingua_nostra/lang/parse_nl.py index ea149654..277341a5 100644 --- a/lingua_nostra/lang/parse_nl.py +++ b/lingua_nostra/lang/parse_nl.py @@ -25,6 +25,7 @@ _NEGATIVES_NL, _SHORT_SCALE_NL, _STRING_LONG_ORDINAL_NL, _STRING_NUM_NL, \ _STRING_SHORT_ORDINAL_NL, _SUMS_NL from lingua_nostra.time import now_local +from lingua_nostra.parse import normalize_decimals import re @@ -414,10 +415,10 @@ def _initialize_number_data_nl(short_scale): return multiplies, string_num_ordinal_nl, string_num_scale_nl -def extract_number_nl(text, short_scale=True, ordinals=False): - """Extract a number from a text string - - The function handles pronunciations in long scale and short scale +def extract_number_nl(text, short_scale=True, ordinals=False, decimal='.'): + """ + This function extracts a number from a text string, + handles pronunciations in long scale and short scale https://en.wikipedia.org/wiki/Names_of_large_numbers @@ -425,10 +426,17 @@ def extract_number_nl(text, short_scale=True, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) return _extract_number_with_text_nl(tokenize(text.lower()), short_scale, ordinals).value @@ -1294,19 +1302,24 @@ def is_fractional_nl(input_str, short_scale=True): return False -def extract_numbers_nl(text, short_scale=True, ordinals=False): +def extract_numbers_nl(text, short_scale=True, ordinals=False, decimal='.'): """Takes in a string and extracts a list of numbers. Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + text (str): the string to extract numbers from + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + list: list of extracted numbers + as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified.
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_nl(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_nostra/lang/parse_pl.py b/lingua_nostra/lang/parse_pl.py index 6ae8e9a1..bf0ae736 100644 --- a/lingua_nostra/lang/parse_pl.py +++ b/lingua_nostra/lang/parse_pl.py @@ -23,6 +23,7 @@ _SHORT_SCALE_PL, _SHORT_ORDINAL_PL, _FRACTION_STRING_PL, _TIME_UNITS_CONVERSION, \ _TIME_UNITS_NORMALIZATION, _MONTHS_TO_EN, _DAYS_TO_EN, _ORDINAL_BASE_PL, \ _ALT_ORDINALS_PL +from lingua_nostra.parse import normalize_decimals from lingua_nostra.time import now_local import re @@ -576,7 +577,7 @@ def _initialize_number_data(short_scale): return multiplies, _STRING_SHORT_ORDINAL_PL, string_num_scale -def extract_number_pl(text, short_scale=True, ordinals=False): +def extract_number_pl(text, short_scale=True, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -587,11 +588,17 @@ def extract_number_pl(text, short_scale=True, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
""" + if decimal != '.': + text = normalize_decimals(text, decimal) return _extract_number_with_text_pl(tokenize(text.lower()), True, ordinals).value @@ -1333,20 +1340,28 @@ def isFractional_pl(input_str, short_scale=True): return False -def extract_numbers_pl(text, short_scale=True, ordinals=False): +def extract_numbers_pl(text, short_scale=True, ordinals=False, decimal='.'): """ - Takes in a string and extracts a list of numbers. + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers Args: - text (str): the string to extract a number from - short_scale (bool): Use "short scale" or "long scale" for large - numbers -- over a million. The default is short scale, which - is now common in most English speaking countries. - See https://en.wikipedia.org/wiki/Names_of_large_numbers - ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - list: list of extracted numbers as floats + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_pl(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_nostra/lang/parse_pt.py b/lingua_nostra/lang/parse_pt.py index ac5572fb..8cc140f7 100644 --- a/lingua_nostra/lang/parse_pt.py +++ b/lingua_nostra/lang/parse_pt.py @@ -29,6 +29,7 @@ from lingua_nostra.internal import resolve_resource_file from lingua_nostra.lang.parse_common import Normalizer from lingua_nostra.time import now_local +from lingua_nostra.parse import normalize_decimals import json import re @@ -67,16 +68,28 @@ def is_fractional_pt(input_str, short_scale=True): return False -def extract_number_pt(text, short_scale=True, ordinals=False): +def extract_number_pt(text, short_scale=True, ordinals=False, decimal='.'): """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (int) or (float): The value of extracted number + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. 
diff --git a/lingua_nostra/lang/parse_sv.py b/lingua_nostra/lang/parse_sv.py index 43fbf4c7..6bf38563 100644 --- a/lingua_nostra/lang/parse_sv.py +++ b/lingua_nostra/lang/parse_sv.py @@ -16,18 +16,27 @@ from datetime import datetime from dateutil.relativedelta import relativedelta from lingua_nostra.time import now_local +from lingua_nostra.parse import normalize_decimals from .parse_common import is_numeric, look_for_fractions, Normalizer -def extract_number_sv(text, short_scale=True, ordinals=False): +def extract_number_sv(text, short_scale=True, ordinals=False, decimal='.'): """ This function prepares the given text for parsing by making numbers consistent, getting rid of contractions, etc. Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float): The value of extracted number + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. diff --git a/lingua_nostra/parse.py b/lingua_nostra/parse.py index 479afa06..6c7c6fbd 100644 --- a/lingua_nostra/parse.py +++ b/lingua_nostra/parse.py @@ -13,13 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# - +import re from difflib import SequenceMatcher from warnings import warn from lingua_nostra.time import now_local from lingua_nostra.internal import populate_localized_function_dict, \ get_active_langs, get_full_lang_code, get_primary_lang_code, \ - get_default_lang, localized_function, _raise_unsupported_language + get_default_lang, localized_function, FunctionNotLocalizedError _REGISTERED_FUNCTIONS = ("extract_numbers", "extract_number", @@ -33,6 +33,19 @@ populate_localized_function_dict("parse", langs=get_active_langs()) +@localized_function(run_own_code_on=[FunctionNotLocalizedError]) +def normalize_decimals(text, decimal, lang=""): + """ + Replace 'decimal' with decimal periods so Python can floatify them + """ + regex = r"\b\d+" + decimal + r"{1}\d+\b" + sanitize_decimals = re.compile(regex) + for _, match in enumerate(re.finditer(sanitize_decimals, text)): + text = text.replace(match.group( + 0), match.group(0).replace(decimal, '.')) + return text + + def fuzzy_match(x: str, against: str) -> float: """Perform a 'fuzzy' comparison between two strings. @@ -74,7 +87,8 @@ def match_one(query, choices): @localized_function() -def extract_numbers(text, short_scale=True, ordinals=False, lang=''): +def extract_numbers(text, short_scale=True, ordinals=False, lang='', + decimal='.'): """ Takes in a string and extracts a list of numbers. @@ -87,13 +101,18 @@ def extract_numbers(text, short_scale=True, ordinals=False, lang=''): ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 lang (str, optional): an optional BCP-47 language code, if omitted the default language will be used. + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats, or empty list if none found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
""" @localized_function() -def extract_number(text, short_scale=True, ordinals=False, lang=''): +def extract_number(text, short_scale=True, ordinals=False, lang='', + decimal='.'): """Takes in a string and extracts a number. Args: @@ -105,9 +124,13 @@ def extract_number(text, short_scale=True, ordinals=False, lang=''): ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 lang (str, optional): an optional BCP-47 language code, if omitted the default language will be used. + decimal (str): character to use as decimal point. defaults to '.' Returns: (int, float or False): The number extracted or False if the input text contains no numbers + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ diff --git a/test/test_parse.py b/test/test_parse.py index a5dba551..d0043878 100644 --- a/test/test_parse.py +++ b/test/test_parse.py @@ -72,6 +72,17 @@ def test_articles(self): remove_articles=False), "this is an extra test") + def test_extract_number_decimal_markers(self): + # Test decimal normalization + self.assertEqual(extract_number("4,4", decimal=','), 4.4) + self.assertEqual(extract_number("we have 3,5 kilometers to go", + decimal=','), 3.5) + self.assertEqual(extract_numbers("this is a seven eight 9,5 test", + decimal=','), + [7.0, 8.0, 9.5]) + self.assertEqual(extract_numbers("this is a 7,0 8.0 9,6 test", + decimal=','), [7.0, 8.0, 9.6]) + def test_extract_number_priority(self): # sanity check self.assertEqual(extract_number("third", ordinals=True), 3) diff --git a/test/test_parse_fa.py b/test/test_parse_fa.py index d1121ea0..adaf6651 100644 --- a/test/test_parse_fa.py +++ b/test/test_parse_fa.py @@ -73,7 +73,7 @@ def test_extract_number(self): self.assertEqual(extract_number("دو میلیون و پانصد هزار " "تن گوشت یخ زده"), 2500000) - def test_extract_duration_en(self): + def test_extract_duration_farsi(self): self.assertEqual(extract_duration("10 ثانیه"), 
(timedelta(seconds=10.0), "")) self.assertEqual(extract_duration("5 دقیقه"), @@ -102,7 +102,7 @@ def test_extract_duration_en(self): (timedelta(hours=1, minutes=57.5), "این فیلم طول می کشد")) - def test_extractdatetime_en(self): + def test_extractdatetime_farsi(self): def extractWithFormat(text): date = datetime(2017, 6, 27, 13, 4, tzinfo=default_timezone()) # Tue June 27, 2017 @ 1:04pm [extractedDate, leftover] = extract_datetime(text, date)