From 12453ab0bb99666f2df63072a83542dc9ac5b4e9 Mon Sep 17 00:00:00 2001 From: ChanceNCounter Date: Fri, 10 Jan 2020 17:40:05 -0800 Subject: [PATCH 1/6] Add logic to normalize comma-delimited decimals --- lingua_franca/parse.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lingua_franca/parse.py b/lingua_franca/parse.py index 303baedd..76e580ac 100644 --- a/lingua_franca/parse.py +++ b/lingua_franca/parse.py @@ -14,6 +14,8 @@ # limitations under the License. # from difflib import SequenceMatcher +import re + from lingua_franca.time import now_local from lingua_franca.lang import get_primary_lang_code @@ -93,6 +95,13 @@ def extract_numbers(text, short_scale=True, ordinals=False, lang=None): Returns: list: list of extracted numbers as floats, or empty list if none found """ + # Replace decimal commas with decimal periods so Python can floatify them + sanitize_decimals = re.compile(r".*\d+,{1}\d+") + match = sanitize_decimals.match(text) + while match: + text = text.replace(match[0], match[0].replace(',', '.')) + match = sanitize_decimals.match(text) + lang_code = get_primary_lang_code(lang) if lang_code == "en": return extract_numbers_en(text, short_scale, ordinals) From b7c8ad6feaf6cc2894ef16bf337e26289a465fa0 Mon Sep 17 00:00:00 2001 From: ChanceNCounter Date: Sat, 11 Jan 2020 02:13:38 -0800 Subject: [PATCH 2/6] spin off normalize_decimal logic create function for both extract_number and extract_numbers to call --- lingua_franca/parse.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/lingua_franca/parse.py b/lingua_franca/parse.py index 76e580ac..800e5a3f 100644 --- a/lingua_franca/parse.py +++ b/lingua_franca/parse.py @@ -80,6 +80,18 @@ def match_one(query, choices): return best +def normalize_decimals(text): + """ + Replace decimal commas with decimal periods so Python can floatify them + """ + sanitize_decimals = re.compile(r".*\d+,{1}\d+") + match = sanitize_decimals.match(text) + while match: + text = text.replace(match[0], match[0].replace(',', '.')) + match = sanitize_decimals.match(text) + return text + + def extract_numbers(text, short_scale=True, ordinals=False, lang=None): """ Takes in a string and extracts a list of numbers. @@ -95,13 +107,7 @@ def extract_numbers(text, short_scale=True, ordinals=False, lang=None): Returns: list: list of extracted numbers as floats, or empty list if none found """ - # Replace decimal commas with decimal periods so Python can floatify them - sanitize_decimals = re.compile(r".*\d+,{1}\d+") - match = sanitize_decimals.match(text) - while match: - text = text.replace(match[0], match[0].replace(',', '.')) - match = sanitize_decimals.match(text) - + text = normalize_decimals(text) lang_code = get_primary_lang_code(lang) if lang_code == "en": return extract_numbers_en(text, short_scale, ordinals) @@ -136,6 +142,7 @@ def extract_number(text, short_scale=True, ordinals=False, lang=None): (int, float or False): The number extracted or False if the input text contains no numbers """ + text = normalize_decimals(text) lang_code = get_primary_lang_code(lang) if lang_code == "en": return extractnumber_en(text, short_scale=short_scale, From f7e8f5b4b2c46be5d22d7cae8939b0a03fef4499 Mon Sep 17 00:00:00 2001 From: ChanceNCounter Date: Sat, 11 Jan 2020 09:30:54 -0800 Subject: [PATCH 3/6] iterate over regex the python.regex way --- lingua_franca/parse.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lingua_franca/parse.py b/lingua_franca/parse.py index 800e5a3f..950d80d6 100644 --- a/lingua_franca/parse.py +++ b/lingua_franca/parse.py @@ -84,11 +84,9 @@ def normalize_decimals(text): """ Replace decimal commas with decimal periods so Python can floatify them """ - sanitize_decimals = re.compile(r".*\d+,{1}\d+") - match = sanitize_decimals.match(text) - while match: + sanitize_decimals = re.compile(r"\b\d+,{1}\d+\b") + for _, match in enumerate(re.finditer(sanitize_decimals, text)): text = text.replace(match[0], match[0].replace(',', '.')) - match = sanitize_decimals.match(text) return text From 9375e550c00ebc7a58d6b2cc88afa38c5b5dbc48 Mon Sep 17 00:00:00 2001 From: ChanceNCounter Date: Sat, 11 Jan 2020 09:38:06 -0800 Subject: [PATCH 4/6] add tests for decimal normalization --- test/test_parse.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/test_parse.py b/test/test_parse.py index 01aec528..915e9b8a 100644 --- a/test/test_parse.py +++ b/test/test_parse.py @@ -123,6 +123,11 @@ def test_extract_number(self): short_scale=False), 1e12) self.assertEqual(extract_number("this is the billionth test", short_scale=False), 1e-12) + + # Test decimal normalization + self.assertEqual(extract_number("4,4"), 4.4) + self.assertEqual(extract_number("we have 3,5 kilometers to go"), 3.5) + # TODO handle this case # self.assertEqual( # extract_number("6 dot six six six"), @@ -703,6 +708,9 @@ def test_multiple_numbers(self): self.assertEqual(extract_numbers("this is a seven eight nine and a" " half test"), [7.0, 8.0, 9.5]) + self.assertEqual(extract_numbers("this is a seven eight 9,5 test"), + [7.0, 8.0, 9.5]) + self.assertEqual(extract_numbers("this is a 7,0 8.0 9,6 test"), [7.0, 8.0, 9.6]) def test_contractions(self): self.assertEqual(normalize("ain't"), "is not") From 246855d5d6b3cf567bfa09754f9b9388852a3a35 Mon Sep 17 00:00:00 2001 From: ChanceNCounter Date: Sat, 11 Jan 2020 10:36:52 -0800 Subject: [PATCH 5/6] fix regex to support py3.5 --- lingua_franca/parse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingua_franca/parse.py b/lingua_franca/parse.py index 950d80d6..b906e729 100644 --- a/lingua_franca/parse.py +++ b/lingua_franca/parse.py @@ -86,7 +86,7 @@ def normalize_decimals(text): """ sanitize_decimals = re.compile(r"\b\d+,{1}\d+\b") for _, match in enumerate(re.finditer(sanitize_decimals, text)): - text = text.replace(match[0], match[0].replace(',', '.')) + text = text.replace(match.group(0), match.group(0).replace(',', '.')) return text From 402c1f271e70b1a354018bb296751762d216f2eb Mon Sep 17 00:00:00 2001 From: ChanceNCounter Date: Sun, 2 Feb 2020 10:15:49 -0800 Subject: [PATCH 6/6] replace comma-decimal handling with param Alternate decimal points now specified with function parameter --- lingua_franca/parse.py | 32 ++++++++++++++++++++++++-------- test/test_parse.py | 11 +++++++---- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/lingua_franca/parse.py b/lingua_franca/parse.py index b906e729..cf5a6fb0 100644 --- a/lingua_franca/parse.py +++ b/lingua_franca/parse.py @@ -80,17 +80,20 @@ def match_one(query, choices): return best -def normalize_decimals(text): +def normalize_decimals(text, decimal): """ - Replace decimal commas with decimal periods so Python can floatify them + Replace 'decimal' with decimal periods so Python can floatify them """ - sanitize_decimals = re.compile(r"\b\d+,{1}\d+\b") + regex = r"\b\d+" + decimal + r"{1}\d+\b" + sanitize_decimals = re.compile(regex) for _, match in enumerate(re.finditer(sanitize_decimals, text)): - text = text.replace(match.group(0), match.group(0).replace(',', '.')) + text = text.replace(match.group( + 0), match.group(0).replace(decimal, '.')) return text -def extract_numbers(text, short_scale=True, ordinals=False, lang=None): +def extract_numbers(text, short_scale=True, ordinals=False, lang=None, + decimal='.'): """ Takes in a string and extracts a list of numbers. @@ -102,10 +105,16 @@ def extract_numbers(text, short_scale=True, ordinals=False, lang=None): See https://en.wikipedia.org/wiki/Names_of_large_numbers ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 lang (str): the BCP-47 code for the language to use, None uses default + decimal (str): character to use as decimal point. defaults to '.' Returns: list: list of extracted numbers as floats, or empty list if none found + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ - text = normalize_decimals(text) + if decimal != '.': + text = normalize_decimals(text, decimal) + lang_code = get_primary_lang_code(lang) if lang_code == "en": return extract_numbers_en(text, short_scale, ordinals) @@ -125,7 +134,8 @@ def extract_numbers(text, short_scale=True, ordinals=False, lang=None): return [] -def extract_number(text, short_scale=True, ordinals=False, lang=None): +def extract_number(text, short_scale=True, ordinals=False, lang=None, + decimal='.'): """Takes in a string and extracts a number. Args: @@ -136,11 +146,17 @@ def extract_number(text, short_scale=True, ordinals=False, lang=None): See https://en.wikipedia.org/wiki/Names_of_large_numbers ordinals (bool): consider ordinal numbers, e.g. third=3 instead of 1/3 lang (str): the BCP-47 code for the language to use, None uses default + decimal (str): character to use as decimal point. defaults to '.' Returns: (int, float or False): The number extracted or False if the input text contains no numbers + Note: + will always extract numbers formatted with a decimal dot/full stop, + such as '3.5', even if 'decimal' is specified. """ - text = normalize_decimals(text) + if decimal != '.': + text = normalize_decimals(text, decimal) + lang_code = get_primary_lang_code(lang) if lang_code == "en": return extractnumber_en(text, short_scale=short_scale, diff --git a/test/test_parse.py b/test/test_parse.py index 915e9b8a..afc1fddc 100644 --- a/test/test_parse.py +++ b/test/test_parse.py @@ -125,8 +125,9 @@ def test_extract_number(self): short_scale=False), 1e-12) # Test decimal normalization - self.assertEqual(extract_number("4,4"), 4.4) - self.assertEqual(extract_number("we have 3,5 kilometers to go"), 3.5) + self.assertEqual(extract_number("4,4", decimal=','), 4.4) + self.assertEqual(extract_number("we have 3,5 kilometers to go", + decimal=','), 3.5) # TODO handle this case # self.assertEqual( @@ -708,9 +709,11 @@ def test_multiple_numbers(self): self.assertEqual(extract_numbers("this is a seven eight nine and a" " half test"), [7.0, 8.0, 9.5]) - self.assertEqual(extract_numbers("this is a seven eight 9,5 test"), + self.assertEqual(extract_numbers("this is a seven eight 9,5 test", + decimal=','), [7.0, 8.0, 9.5]) - self.assertEqual(extract_numbers("this is a 7,0 8.0 9,6 test"), [7.0, 8.0, 9.6]) + self.assertEqual(extract_numbers("this is a 7,0 8.0 9,6 test", + decimal=','), [7.0, 8.0, 9.6]) def test_contractions(self): self.assertEqual(normalize("ain't"), "is not")