def extract_duration_de(text):
    """Extract a duration from a German phrase.

    Recognises numeric amounts followed by a German time unit, e.g.
        "10 Minuten"
        "3 Tage 8 Stunden 10 Minuten und 49 Sekunden"
    and sums them into a single timedelta. The words that make up the
    duration are removed from the text and the remainder is returned.
    As an example, "starte timer für 5 minuten" returns
    (timedelta(minutes=5), "starte timer für").

    Only digits are understood; number words such as "sieben" are not
    converted yet.

    Args:
        text (str): string containing a duration
    Returns:
        (timedelta, str):
            A tuple containing the duration and the remaining text
            not consumed in the parsing. The first value will
            be None if no duration is found (or if every amount found
            is zero). The returned text is lower-cased and has
            whitespace stripped from the ends.
            For empty or None input the function returns a bare None
            instead of a tuple; this is kept for backward compatibility.
    """
    if not text:
        return None

    text = text.lower()

    # timedelta keyword -> German singular stem. The optional suffix in
    # the pattern below adds the plural / dative endings.
    unit_stems = {
        'microseconds': 'mikrosekunde',
        'milliseconds': 'millisekunde',
        'seconds': 'sekunde',
        'minutes': 'minute',
        'hours': 'stunde',
        'days': 'tag',
        'weeks': 'woche',
    }

    # <number><whitespace or hyphen><unit stem><optional ending>
    # "en" is tried first so that the dative plural "tagen" is consumed
    # completely; the trailing \b keeps compounds such as "tagesordnung"
    # from being mistaken for a unit.
    pattern = r"(?P<value>\d+(?:\.\d+)?)(?:\s+|\-){unit}(?:en|e|n)?\b"

    # TODO: hook for a German words-to-numbers conversion of `text`

    amounts = {}
    for unit, stem in unit_stems.items():
        unit_re = re.compile(pattern.format(unit=stem))
        # The pattern has exactly one group, so findall yields the
        # numeric strings.
        amounts[unit] = sum(map(float, unit_re.findall(text)))
        text = unit_re.sub('', text)

    text = text.strip()
    duration = timedelta(**amounts) if any(amounts.values()) else None

    return (duration, text)
def test_extract_duration_de(self):
    """Check German duration extraction through the public dispatcher.

    Each case is (phrase, expected duration, expected remaining text).
    The remainder keeps the whitespace that surrounded the removed
    duration words; only the ends are stripped.
    """
    # TODO: add number-word cases (e.g. "sieben stunden") once German
    # text-to-number conversion is available for durations.
    cases = [
        ("10 sekunden", timedelta(seconds=10.0), ""),
        ("5 minuten", timedelta(minutes=5), ""),
        ("2 stunden", timedelta(hours=2), ""),
        ("3 tage", timedelta(days=3), ""),
        ("25 wochen", timedelta(weeks=25), ""),
        ("7.5 sekunden", timedelta(seconds=7.5), ""),
        ("starte timer für 30 minuten",
         timedelta(minutes=30), "starte timer für"),
        # Removing the three durations leaves the original gaps behind:
        # ", " + " " + " und" -> three spaces between "," and "und".
        ("weck mich in 3 wochen, "
         " 497 tage und"
         " 391.6 sekunden",
         timedelta(weeks=3, days=497, seconds=391.6),
         "weck mich in ,   und"),
        # A hyphen between number and unit is accepted as well.
        ("10-sekunden", timedelta(seconds=10.0), ""),
        ("5-minuten", timedelta(minutes=5), ""),
    ]
    for phrase, duration, remainder in cases:
        self.assertEqual(extract_duration(phrase, lang="de-de"),
                         (duration, remainder))