def extract_duration_de(text):
    """
    Convert a German phrase into a duration.

    Recognises a number written in digits (optionally with a decimal
    point) followed by whitespace or a hyphen and a German time unit,
    for example:
        "10 Minuten"
        "3 Tage 8 Stunden 10 Minuten und 49 Sekunden"
        "5-Minuten"
    Every matched "<number> <unit>" span is removed from the text and the
    remainder is returned. As an example, "starte timer für 30 minuten"
    returns (timedelta(minutes=30), "starte timer für").

    Spelled-out numbers ("sieben Stunden") are not handled yet.

    Args:
        text (str): string containing a duration
    Returns:
        (timedelta, str):
            A tuple containing the duration and the remaining text
            not consumed in the parsing. The first value will
            be None if no duration is found (or if all quantities
            add up to zero). The returned text is lower-cased and has
            whitespace stripped from the ends; inner whitespace left
            behind by removed spans is not collapsed.
        None:
            Returned instead of a tuple when text is empty or None.
    """
    if not text:
        return None

    text = text.lower()

    # timedelta() keyword -> German unit stem. The stem is the plural
    # without its final letter, so one suffix rule covers singular and
    # plural forms of every unit.
    unit_stems = (
        ('microseconds', 'mikrosekunde'),
        ('milliseconds', 'millisekunde'),
        ('seconds', 'sekunde'),
        ('minutes', 'minute'),
        ('hours', 'stunde'),
        ('days', 'tag'),
        ('weeks', 'woche'),
    )

    # number, separator (whitespace or hyphen), unit stem, then an optional
    # "e"/"n"/"en" ending: "tag", "tage", "tagen", "stunde", "stunden", ...
    pattern = r"(\d+(?:\.?\d+)?)(?:\s+|\-){unit}e?n?"

    # TODO: entry point for text-to-number conversion
    # (e.g. "sieben stunden") once it exists for German.

    totals = {}
    for unit, stem in unit_stems:
        unit_pattern = pattern.format(unit=stem)
        # findall() yields the single capture group, i.e. the number text.
        totals[unit] = sum(map(float, re.findall(unit_pattern, text)))
        # Consume the matched spans so they are absent from the remainder.
        text = re.sub(unit_pattern, '', text)

    text = text.strip()
    duration = timedelta(**totals) if any(totals.values()) else None

    return (duration, text)
6cb87fc1a758369caa3d7e88b12083805f8a3c2a Mon Sep 17 00:00:00 2001 From: emphasize Date: Thu, 6 Aug 2020 23:37:35 +0200 Subject: [PATCH 2/5] typo cs to de --- lingua_franca/parse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingua_franca/parse.py b/lingua_franca/parse.py index a393c6ac..edc09364 100644 --- a/lingua_franca/parse.py +++ b/lingua_franca/parse.py @@ -202,7 +202,7 @@ def extract_duration(text, lang=None): if lang_code == "cs": return extract_duration_cs(text) if lang_code == "de": - return extract_duration_cs(text) + return extract_duration_de(text) # TODO: extract_duration for other languages _log_unsupported_language(lang_code, ['en','cs', 'de']) From caedbbe1e67ffa74a06f8f7a128eaecb31b295d2 Mon Sep 17 00:00:00 2001 From: emphasize Date: Thu, 6 Aug 2020 23:41:56 +0200 Subject: [PATCH 3/5] adding commentation --- lingua_franca/lang/parse_de.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lingua_franca/lang/parse_de.py b/lingua_franca/lang/parse_de.py index fdaf9370..2e0daf5d 100644 --- a/lingua_franca/lang/parse_de.py +++ b/lingua_franca/lang/parse_de.py @@ -121,7 +121,7 @@ def extract_duration_de(text): #### Einzahl und Mehrzahl pattern = r"(?P\d+(?:\.?\d+)?)(?:\s+|\-){unit}[ne]?" 
def test_extract_duration_de(self):
    """Exercise German duration extraction via the public extract_duration().

    Each call passes lang="de-de" explicitly; without it,
    extract_duration(text, lang=None) does not select the German parser
    by itself and the German phrases below would not be recognised.
    """
    def parse(phrase):
        # Route every phrase through the German implementation.
        return extract_duration(phrase, lang="de-de")

    # Single unit, number written in digits.
    self.assertEqual(parse("10 sekunden"),
                     (timedelta(seconds=10.0), ""))
    self.assertEqual(parse("5 minuten"),
                     (timedelta(minutes=5), ""))
    self.assertEqual(parse("2 stunden"),
                     (timedelta(hours=2), ""))
    self.assertEqual(parse("3 tage"),
                     (timedelta(days=3), ""))
    self.assertEqual(parse("25 wochen"),
                     (timedelta(weeks=25), ""))

    # Decimal quantity.
    self.assertEqual(parse("7.5 sekunden"),
                     (timedelta(seconds=7.5), ""))

    # Unconsumed words are returned with outer whitespace stripped.
    self.assertEqual(parse("starte timer für 30 minuten"),
                     (timedelta(minutes=30), "starte timer für"))

    # Several units in one phrase. Matched spans are cut out without
    # collapsing the whitespace around them, hence the inner spaces in
    # the expected remainder.
    self.assertEqual(parse("weck mich in 3 wochen, "
                           " 497 tage und"
                           " 391.6 sekunden"),
                     (timedelta(weeks=3, days=497, seconds=391.6),
                      "weck mich in ,   und"))

    # Hyphen between number and unit.
    self.assertEqual(parse("10-sekunden"),
                     (timedelta(seconds=10.0), ""))
    self.assertEqual(parse("5-minuten"),
                     (timedelta(minutes=5), ""))

    # TODO: add spelled-out quantities (e.g. "sieben stunden",
    # "achteinhalb tage") once German text-to-number parsing exists.
# import unittest -from datetime import datetime, time +from datetime import datetime, time, timedelta from lingua_franca.parse import extract_datetime +from lingua_franca.parse import extract_duration from lingua_franca.parse import extract_number from lingua_franca.parse import normalize @@ -167,25 +168,25 @@ def test_extractdatetime_default_de(self): self.assertEqual(default, res[0].time()) def test_extract_duration_de(self): - self.assertEqual(extract_duration("10 sekunden"), + self.assertEqual(extract_duration("10 sekunden", lang="de-de"), (timedelta(seconds=10.0), "")) - self.assertEqual(extract_duration("5 minuten"), + self.assertEqual(extract_duration("5 minuten", lang="de-de"), (timedelta(minutes=5), "")) - self.assertEqual(extract_duration("2 stunden"), + self.assertEqual(extract_duration("2 stunden", lang="de-de"), (timedelta(hours=2), "")) - self.assertEqual(extract_duration("3 tage"), + self.assertEqual(extract_duration("3 tage", lang="de-de"), (timedelta(days=3), "")) - self.assertEqual(extract_duration("25 wochen"), + self.assertEqual(extract_duration("25 wochen", lang="de-de"), (timedelta(weeks=25), "")) # TODO no german text to number parsing yet #self.assertEqual(extract_duration("sieben stunden"), # (timedelta(hours=7), "")) - self.assertEqual(extract_duration("7.5 sekunden"), + self.assertEqual(extract_duration("7.5 sekunden", lang="de-de"), (timedelta(seconds=7.5), "")) #self.assertEqual(extract_duration("eight and a half days thirty" # " nine seconds"), # (timedelta(days=8.5, seconds=39), "")) - self.assertEqual(extract_duration("starte timer für 30 minuten"), + self.assertEqual(extract_duration("starte timer für 30 minuten", lang="de-de"), (timedelta(minutes=30), "starte timer für")) #self.assertEqual(extract_duration("Four and a half minutes until" # " sunset"), @@ -193,17 +194,17 @@ def test_extract_duration_de(self): #self.assertEqual(extract_duration("Nineteen minutes past the hour"), # (timedelta(minutes=19), "past the hour")) 
self.assertEqual(extract_duration("weck mich in 3 wochen, " - " 497 tagen und" - " 391.6 sekunden"), + " 497 tage und" + " 391.6 sekunden", lang="de-de"), (timedelta(weeks=3, days=497, seconds=391.6), - "weck mich in , , und")) + "weck mich in , und")) #self.assertEqual(extract_duration("The movie is one hour, fifty seven" # " and a half minutes long"), # (timedelta(hours=1, minutes=57.5), # "the movie is , long")) - self.assertEqual(extract_duration("10-sekunden"), + self.assertEqual(extract_duration("10-sekunden", lang="de-de"), (timedelta(seconds=10.0), "")) - self.assertEqual(extract_duration("5-minuten"), + self.assertEqual(extract_duration("5-minuten", lang="de-de"), (timedelta(minutes=5), "")) def test_spaces(self):