# Helper introduced by patch 811550cd3cbe2711f6402ab070df874b56ee1d30
# ("support decimal markers", rebase of
# https://github.com/MycroftAI/lingua-franca/pull/69) in
# lingua_franca/lang/parse_common.py.
def normalize_decimals(text, decimal, lang=""):
    """
    Replace a locale specific decimal marker inside numbers with a period
    so Python can floatify them, e.g. '3,5' -> '3.5' for decimal=','.

    Only markers that sit directly between two runs of digits, with the
    whole number delimited by word boundaries, are rewritten. Every other
    occurrence of the marker in the text is left untouched.

    Args:
        text (str): the string to normalize
        decimal (str): marker used as decimal point in 'text'. It is
                       matched literally, never as a regular expression.
        lang (str): unused, kept so existing callers keep working

    Returns:
        (str): 'text' with the decimal markers inside numbers replaced
               by '.'
    """
    # An empty marker would glue/split plain integers and a period is
    # already what float() expects, so there is nothing to rewrite.
    if not decimal or decimal == '.':
        return text

    # re.escape() keeps markers such as '*', '+' or '|' from being read as
    # regex syntax (which used to raise re.error or match the wrong text).
    number_with_marker = re.compile(
        r"\b(\d+)" + re.escape(decimal) + r"(\d+)\b")

    # Substitute at the matched positions only; a global str.replace() of
    # the matched substring would also rewrite look-alike text that the
    # word boundaries intentionally excluded (e.g. 'b3,5').
    return number_with_marker.sub(r"\g<1>.\g<2>", text)
a/lingua_franca/lang/parse_cs.py b/lingua_franca/lang/parse_cs.py index e0144b02..13d66317 100644 --- a/lingua_franca/lang/parse_cs.py +++ b/lingua_franca/lang/parse_cs.py @@ -23,7 +23,7 @@ _LONG_ORDINAL_CS, _LONG_SCALE_CS, _SHORT_SCALE_CS, _SHORT_ORDINAL_CS, \ _FRACTION_STRING_CS, _MONTHS_CONVERSION, _MONTHS_CZECH, _TIME_UNITS_CONVERSION, \ _ORDINAL_BASE_CS # _ARTICLES_CS - +from lingua_franca.lang.parse_common import normalize_decimals import re import json from lingua_franca import resolve_resource_file @@ -579,7 +579,7 @@ def _initialize_number_data(short_scale): return multiplies, string_num_ordinal_cs, string_num_scale_cs -def extract_number_cs(text, short_scale=True, ordinals=False): +def extract_number_cs(text, short_scale=True, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -590,11 +590,17 @@ def extract_number_cs(text, short_scale=True, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) return _extract_number_with_text_cs(tokenize(text.lower()), short_scale, ordinals).value @@ -1560,7 +1566,7 @@ def isFractional_cs(input_str, short_scale=True): return False -def extract_numbers_cs(text, short_scale=True, ordinals=False): +def extract_numbers_cs(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. @@ -1571,9 +1577,16 @@ def extract_numbers_cs(text, short_scale=True, ordinals=False): is now common in most English speaking countries. 
See https://en.wikipedia.org/wiki/Names_of_large_numbers ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_cs(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_franca/lang/parse_da.py b/lingua_franca/lang/parse_da.py index 14b18132..5cd15be9 100644 --- a/lingua_franca/lang/parse_da.py +++ b/lingua_franca/lang/parse_da.py @@ -20,22 +20,32 @@ from lingua_franca.lang.common_data_da import _DA_NUMBERS from lingua_franca.lang.format_da import pronounce_number_da from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals -def extract_number_da(text, short_scale=True, ordinals=False): +def extract_number_da(text, short_scale=True, ordinals=False, decimal='.'): """ This function prepares the given text for parsing by making numbers consistent, getting rid of contractions, etc. Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (int) or (float): The value of extracted number + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. undefined articles cannot be suppressed in German: 'ein Pferd' means 'one horse' and 'a horse' """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. 
# The parameters are present in the function signature for API compatibility # reasons. @@ -869,7 +879,7 @@ def normalize_da(text, remove_articles=True): return normalized[1:] # strip the initial space -def extract_numbers_da(text, short_scale=True, ordinals=False): +def extract_numbers_da(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. @@ -880,9 +890,16 @@ def extract_numbers_da(text, short_scale=True, ordinals=False): is now common in most English speaking countries. See https://en.wikipedia.org/wiki/Names_of_large_numbers ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_da, extract_number_da, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_de.py b/lingua_franca/lang/parse_de.py index 95fda48e..3a7bbbd5 100644 --- a/lingua_franca/lang/parse_de.py +++ b/lingua_franca/lang/parse_de.py @@ -21,6 +21,7 @@ from lingua_franca.lang.common_data_de import _DE_NUMBERS from lingua_franca.lang.format_de import pronounce_number_de from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals de_numbers = { @@ -143,20 +144,31 @@ def repl(match): return (duration, text) -def extract_number_de(text, short_scale=True, ordinals=False): +def extract_number_de(text, short_scale=True, ordinals=False, decimal='.'): """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. 
+ This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (int) or (float): The value of extracted number - + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. - undefined articles cannot be suppressed in German: - 'ein Pferd' means 'one horse' and 'a horse' + undefined articles cannot be suppressed in German: + 'ein Pferd' means 'one horse' and 'a horse' """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. @@ -1003,7 +1015,7 @@ def normalize_de(text, remove_articles=True): return normalized[1:] # strip the initial space -def extract_numbers_de(text, short_scale=True, ordinals=False): +def extract_numbers_de(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. @@ -1014,9 +1026,12 @@ def extract_numbers_de(text, short_scale=True, ordinals=False): is now common in most English speaking countries. See https://en.wikipedia.org/wiki/Names_of_large_numbers ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' 
Returns: list: list of extracted numbers as floats """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_de, extract_number_de, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_en.py b/lingua_franca/lang/parse_en.py index a51ee02c..d7d4902c 100644 --- a/lingua_franca/lang/parse_en.py +++ b/lingua_franca/lang/parse_en.py @@ -29,6 +29,7 @@ from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \ invert_dict, ReplaceableNumber, partition_list, tokenize, Token, Normalizer from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals def _convert_words_to_numbers_en(text, short_scale=True, ordinals=False): @@ -529,7 +530,7 @@ def _initialize_number_data_en(short_scale, speech=True): return multiplies, string_num_ordinal_en, string_num_scale_en -def extract_number_en(text, short_scale=True, ordinals=False): +def extract_number_en(text, short_scale=True, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -540,11 +541,17 @@ def extract_number_en(text, short_scale=True, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
""" + if decimal != '.': + text = normalize_decimals(text, decimal) return _extract_number_with_text_en(tokenize(text.lower()), short_scale, ordinals).value @@ -1655,7 +1662,7 @@ def is_fractional_en(input_str, short_scale=True, spoken=True): return False -def extract_numbers_en(text, short_scale=True, ordinals=False): +def extract_numbers_en(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. @@ -1666,9 +1673,15 @@ def extract_numbers_en(text, short_scale=True, ordinals=False): is now common in most English speaking countries. See https://en.wikipedia.org/wiki/Names_of_large_numbers ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_en(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_franca/lang/parse_es.py b/lingua_franca/lang/parse_es.py index 0a810cc4..aa59d093 100644 --- a/lingua_franca/lang/parse_es.py +++ b/lingua_franca/lang/parse_es.py @@ -20,6 +20,7 @@ from lingua_franca.lang.format_es import pronounce_number_es from lingua_franca.lang.parse_common import * from lingua_franca.lang.common_data_es import _ARTICLES_ES, _STRING_NUM_ES +from lingua_franca.lang.parse_common import normalize_decimals def is_fractional_es(input_str, short_scale=True): @@ -56,16 +57,28 @@ def is_fractional_es(input_str, short_scale=True): return False -def extract_number_es(text, short_scale=True, ordinals=False): +def extract_number_es(text, short_scale=True, ordinals=False, decimal='.'): """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of 
contractions, etc. + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (int) or (float): The value of extracted number + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. @@ -268,7 +281,7 @@ def es_number(i): return es_number(i) -def extract_numbers_es(text, short_scale=True, ordinals=False): +def extract_numbers_es(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. @@ -279,9 +292,16 @@ def extract_numbers_es(text, short_scale=True, ordinals=False): is now common in most English speaking countries. See https://en.wikipedia.org/wiki/Names_of_large_numbers ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_es, extract_number_es, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_eu.py b/lingua_franca/lang/parse_eu.py index 8d10162b..2da45533 100644 --- a/lingua_franca/lang/parse_eu.py +++ b/lingua_franca/lang/parse_eu.py @@ -23,6 +23,7 @@ from lingua_franca.lang.format_eu import pronounce_number_eu from lingua_franca.lang.parse_common import * from lingua_franca.lang.common_data_eu import _NUM_STRING_EU +from lingua_franca.lang.parse_common import normalize_decimals def isFractional_eu(input_str): @@ -283,7 +284,7 @@ def eu_number(i): return eu_number(i) -def extract_numbers_eu(text, short_scale=True, ordinals=False): +def extract_numbers_eu(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. @@ -294,9 +295,16 @@ def extract_numbers_eu(text, short_scale=True, ordinals=False): is now common in most English speaking countries. See https://en.wikipedia.org/wiki/Names_of_large_numbers ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_eu, extract_number_eu, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_fa.py b/lingua_franca/lang/parse_fa.py index 753ac8eb..a15116cc 100644 --- a/lingua_franca/lang/parse_fa.py +++ b/lingua_franca/lang/parse_fa.py @@ -19,6 +19,7 @@ _FARSI_ONES, _FARSI_TENS, _FORMAL_VARIANT) from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals def _is_number(s): @@ -307,7 +308,7 @@ def extract_datetime_fa(text, anchorDate=None, default_time=None): return (result, " ".join(remainder)) -def extract_numbers_fa(text, short_scale=True, ordinals=False): +def extract_numbers_fa(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. @@ -318,9 +319,16 @@ def extract_numbers_fa(text, short_scale=True, ordinals=False): is now common in most English speaking countries. See https://en.wikipedia.org/wiki/Names_of_large_numbers ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) ar = _parse_sentence(text) result = [] @@ -330,7 +338,7 @@ def extract_numbers_fa(text, short_scale=True, ordinals=False): return result -def extract_number_fa(text, ordinals=False): +def extract_number_fa(text, short_scale=True, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -341,11 +349,17 @@ def extract_number_fa(text, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) x = extract_numbers_fa(text, ordinals=ordinals) if (len(x) == 0): return False diff --git a/lingua_franca/lang/parse_fr.py b/lingua_franca/lang/parse_fr.py index 9728653f..9e211e29 100644 --- a/lingua_franca/lang/parse_fr.py +++ b/lingua_franca/lang/parse_fr.py @@ -23,6 +23,7 @@ from lingua_franca.lang.common_data_fr import _ARTICLES_FR, _NUMBERS_FR, \ _ORDINAL_ENDINGS_FR from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals def extract_duration_fr(text): @@ -369,13 +370,28 @@ def _number_ordinal_fr(words, i): return None -def extract_number_fr(text, short_scale=True, ordinals=False): - """Takes in a string and extracts a number. 
+def extract_number_fr(text, short_scale=True, ordinals=False, decimal='.'): + """ + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: - text (str): the string to extract a number from + text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: - (str): The number extracted or the original text. + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. @@ -1067,7 +1083,7 @@ def normalize_fr(text, remove_articles=True): return normalized[1:] # strip the initial space -def extract_numbers_fr(text, short_scale=True, ordinals=False): +def extract_numbers_fr(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. @@ -1078,9 +1094,16 @@ def extract_numbers_fr(text, short_scale=True, ordinals=False): is now common in most English speaking countries. See https://en.wikipedia.org/wiki/Names_of_large_numbers ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_fr, extract_number_fr, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_it.py b/lingua_franca/lang/parse_it.py index 88c7455d..922057d3 100644 --- a/lingua_franca/lang/parse_it.py +++ b/lingua_franca/lang/parse_it.py @@ -28,6 +28,7 @@ pronounce_number_it from lingua_franca.lang.common_data_it import _SHORT_ORDINAL_STRING_IT, \ _ARTICLES_IT, _LONG_ORDINAL_STRING_IT, _STRING_NUM_IT +from lingua_franca.lang.parse_common import normalize_decimals def is_fractional_it(input_str, short_scale=False): @@ -224,7 +225,7 @@ def _extract_number_long_it(word): return value -def extract_number_it(text, short_scale=False, ordinals=False): +def extract_number_it(text, short_scale=False, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -235,11 +236,17 @@ def extract_number_it(text, short_scale=False, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) text = text.lower() string_num_ordinal_it = {} @@ -1148,7 +1155,7 @@ def get_gender_it(word, context=""): return gender -def extract_numbers_it(text, short_scale=False, ordinals=False): +def extract_numbers_it(text, short_scale=False, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. 
@@ -1159,9 +1166,16 @@ def extract_numbers_it(text, short_scale=False, ordinals=False): is now common in most English speaking countries. See https://en.wikipedia.org/wiki/Names_of_large_numbers ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) return extract_numbers_generic(text, pronounce_number_it, extract_number_it, short_scale=short_scale, ordinals=ordinals) diff --git a/lingua_franca/lang/parse_nl.py b/lingua_franca/lang/parse_nl.py index ba197704..c67a95d1 100644 --- a/lingua_franca/lang/parse_nl.py +++ b/lingua_franca/lang/parse_nl.py @@ -25,6 +25,7 @@ _NEGATIVES_NL, _SHORT_SCALE_NL, _STRING_LONG_ORDINAL_NL, _STRING_NUM_NL, \ _STRING_SHORT_ORDINAL_NL, _SUMS_NL from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals import re @@ -414,7 +415,7 @@ def _initialize_number_data_nl(short_scale): return multiplies, string_num_ordinal_nl, string_num_scale_nl -def extract_number_nl(text, short_scale=True, ordinals=False): +def extract_number_nl(text, short_scale=True, ordinals=False, decimal='.'): """Extract a number from a text string The function handles pronunciations in long scale and short scale @@ -425,10 +426,17 @@ def extract_number_nl(text, short_scale=True, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' 
Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) return _extract_number_with_text_nl(tokenize(text.lower()), short_scale, ordinals).value @@ -1294,7 +1302,7 @@ def is_fractional_nl(input_str, short_scale=True): return False -def extract_numbers_nl(text, short_scale=True, ordinals=False): +def extract_numbers_nl(text, short_scale=True, ordinals=False, decimal='.'): """Takes in a string and extracts a list of numbers. Args: @@ -1304,9 +1312,16 @@ def extract_numbers_nl(text, short_scale=True, ordinals=False): is now common in most English speaking countries. See https://en.wikipedia.org/wiki/Names_of_large_numbers ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_nl(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_franca/lang/parse_pl.py b/lingua_franca/lang/parse_pl.py index 84f83bc8..57b1245e 100644 --- a/lingua_franca/lang/parse_pl.py +++ b/lingua_franca/lang/parse_pl.py @@ -24,6 +24,7 @@ _TIME_UNITS_NORMALIZATION, _MONTHS_TO_EN, _DAYS_TO_EN, _ORDINAL_BASE_PL, \ _ALT_ORDINALS_PL from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals import re @@ -576,7 +577,7 @@ def _initialize_number_data(short_scale): return multiplies, _STRING_SHORT_ORDINAL_PL, string_num_scale -def extract_number_pl(text, short_scale=True, ordinals=False): +def extract_number_pl(text, short_scale=True, ordinals=False, decimal='.'): """ This function extracts a number from a text string, handles pronunciations in long scale and short scale @@ -587,11 +588,17 @@ def extract_number_pl(text, short_scale=True, ordinals=False): text (str): the string to normalize short_scale (bool): use short scale if True, long scale if False ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float) or False: The extracted number or False if no number was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) return _extract_number_with_text_pl(tokenize(text.lower()), True, ordinals).value @@ -1333,7 +1340,7 @@ def isFractional_pl(input_str, short_scale=True): return False -def extract_numbers_pl(text, short_scale=True, ordinals=False): +def extract_numbers_pl(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. 
@@ -1344,9 +1351,16 @@ def extract_numbers_pl(text, short_scale=True, ordinals=False): is now common in most English speaking countries. See https://en.wikipedia.org/wiki/Names_of_large_numbers ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. + """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_pl(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_franca/lang/parse_pt.py b/lingua_franca/lang/parse_pt.py index 356c1e83..c4533063 100644 --- a/lingua_franca/lang/parse_pt.py +++ b/lingua_franca/lang/parse_pt.py @@ -29,6 +29,7 @@ from lingua_franca.internal import resolve_resource_file from lingua_franca.lang.parse_common import Normalizer from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals import json import re import unicodedata @@ -77,16 +78,28 @@ def is_fractional_pt(input_str, short_scale=True): return False -def extract_number_pt(text, short_scale=True, ordinals=False): +def extract_number_pt(text, short_scale=True, ordinals=False, decimal='.'): """ - This function prepares the given text for parsing by making - numbers consistent, getting rid of contractions, etc. + This function extracts a number from a text string, + handles pronunciations in long scale and short scale + + https://en.wikipedia.org/wiki/Names_of_large_numbers + Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' 
Returns: - (int) or (float): The value of extracted number + (int) or (float) or False: The extracted number or False if no number + was found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API compatibility # reasons. diff --git a/lingua_franca/lang/parse_ru.py b/lingua_franca/lang/parse_ru.py index cd041ec7..035e2810 100644 --- a/lingua_franca/lang/parse_ru.py +++ b/lingua_franca/lang/parse_ru.py @@ -28,6 +28,7 @@ import json from lingua_franca import resolve_resource_file from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals def generate_plurals_ru(originals): @@ -1577,7 +1578,7 @@ def is_fractional_ru(input_str, short_scale=True): return False -def extract_numbers_ru(text, short_scale=True, ordinals=False): +def extract_numbers_ru(text, short_scale=True, ordinals=False, decimal='.'): """ Takes in a string and extracts a list of numbers. @@ -1588,9 +1589,16 @@ def extract_numbers_ru(text, short_scale=True, ordinals=False): is now common in most English speaking countries. See https://en.wikipedia.org/wiki/Names_of_large_numbers ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
+ """ + if decimal != '.': + text = normalize_decimals(text, decimal) results = _extract_numbers_with_text_ru(tokenize(text), short_scale, ordinals) return [float(result.value) for result in results] diff --git a/lingua_franca/lang/parse_sv.py b/lingua_franca/lang/parse_sv.py index 02164111..bb23f2ee 100644 --- a/lingua_franca/lang/parse_sv.py +++ b/lingua_franca/lang/parse_sv.py @@ -17,6 +17,7 @@ from dateutil.relativedelta import relativedelta from lingua_franca.time import now_local +from lingua_franca.lang.parse_common import normalize_decimals from .parse_common import (is_numeric, look_for_fractions, Normalizer, tokenize, Token) @@ -156,15 +157,23 @@ def extract_duration_sv(text): return (td, remainder) if valid else None -def extract_number_sv(text, short_scale=True, ordinals=False): +def extract_number_sv(text, short_scale=True, ordinals=False, decimal='.'): """ This function prepares the given text for parsing by making numbers consistent, getting rid of contractions, etc. Args: text (str): the string to normalize + short_scale (bool): use short scale if True, long scale if False + ordinals (bool): consider ordinal numbers, third=3 instead of 1/3 + decimal (str): character to use as decimal point. defaults to '.' Returns: (int) or (float): The value of extracted number + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ + if decimal != '.': + text = normalize_decimals(text, decimal) # TODO: short_scale and ordinals don't do anything here. # The parameters are present in the function signature for API # compatibility reasons. 
diff --git a/lingua_franca/parse.py b/lingua_franca/parse.py index f1602717..1b4590f0 100644 --- a/lingua_franca/parse.py +++ b/lingua_franca/parse.py @@ -56,7 +56,8 @@ def extract_langcode(text, lang=""): @localized_function() -def extract_numbers(text, short_scale=True, ordinals=False, lang=''): +def extract_numbers(text, short_scale=True, ordinals=False, lang='', + decimal='.'): """ Takes in a string and extracts a list of numbers. @@ -69,13 +70,18 @@ def extract_numbers(text, short_scale=True, ordinals=False, lang=''): ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 lang (str, optional): an optional BCP-47 language code, if omitted the default language will be used. + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats, or empty list if none found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ @localized_function() -def extract_number(text, short_scale=True, ordinals=False, lang=''): +def extract_number(text, short_scale=True, ordinals=False, lang='', + decimal='.'): """Takes in a string and extracts a number. Args: @@ -87,9 +93,13 @@ def extract_number(text, short_scale=True, ordinals=False, lang=''): ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 lang (str, optional): an optional BCP-47 language code, if omitted the default language will be used. + decimal (str): character to use as decimal point. defaults to '.' Returns: (int, float or False): The number extracted or False if the input text contains no numbers + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. 
""" diff --git a/test/unittests/test_parse_en.py b/test/unittests/test_parse_en.py index caae8999..7aeb3df9 100644 --- a/test/unittests/test_parse_en.py +++ b/test/unittests/test_parse_en.py @@ -290,6 +290,17 @@ def test_combinations(self): class TestExtractNumber(unittest.TestCase): + def test_extract_number_decimal_markers(self): + # Test decimal normalization + self.assertEqual(extract_number("4,4", decimal=','), 4.4) + self.assertEqual(extract_number("we have 3,5 kilometers to go", + decimal=','), 3.5) + self.assertEqual(extract_numbers("this is a seven eight 9,5 test", + decimal=','), + [7.0, 8.0, 9.5]) + self.assertEqual(extract_numbers("this is a 7,0 8.0 9,6 test", + decimal=','), [7.0, 8.0, 9.6]) + def test_extract_number_priority(self): # sanity check self.assertEqual(extract_number("third", ordinals=True), 3)