Skip to content

Commit

Permalink
Merge pull request #119 from emphasize/fix/german_duration_parsing
Browse files Browse the repository at this point in the history
Small fix to enable german duration parsing
  • Loading branch information
krisgesling authored Aug 7, 2020
2 parents d8f9840 + 3d5a5ef commit 086b712
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 3 deletions.
59 changes: 58 additions & 1 deletion lingua_franca/lang/parse_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from datetime import datetime
import re
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from lingua_franca.lang.parse_common import is_numeric, look_for_fractions, \
extract_numbers_generic
Expand Down Expand Up @@ -81,6 +82,62 @@
# The parameters are present in the function signature for API compatibility
# reasons.

def extract_duration_de(text):
    """
    Convert a German phrase into a duration.

    Converts things like:
        "10 Minuten"
        "3 Tage 8 Stunden 10 Minuten und 49 Sekunden"
    into a timedelta. The number/unit pairs that make up the duration are
    consumed, and the remainder of the text is returned.

    As an example, "starte timer für 5 minuten" would return
    (timedelta(minutes=5), "starte timer für").

    Only numbers written with digits (optionally with a decimal point)
    are recognised; spelled-out numbers are not converted yet.

    Args:
        text (str): string containing a duration
    Returns:
        (timedelta, str):
            A tuple containing the duration and the remaining text
            not consumed in the parsing. The first value will
            be None if no duration is found. The text returned
            is lower-cased and has whitespace stripped from the ends.
            If `text` is empty or None, None is returned instead of
            a tuple.
    """
    if not text:
        return None

    text = text.lower()

    # Maps each timedelta() keyword to the German plural of that unit.
    unit_names = {
        'microseconds': 'mikrosekunden',
        'milliseconds': 'millisekunden',
        'seconds': 'sekunden',
        'minutes': 'minuten',
        'hours': 'stunden',
        'days': 'tage',
        'weeks': 'wochen',
    }

    # A number, then whitespace or a hyphen, then the unit stem followed by
    # an optional 'n'/'e' so that singular and plural forms both match.
    pattern = r"(?P<value>\d+(?:\.?\d+)?)(?:\s+|\-){unit}[ne]?"

    # TODO: entry point for word-to-number conversion
    # (e.g. "sieben stunden"), once a German converter is available.

    # Totals are collected in their own dict instead of overwriting the
    # unit-name mapping while it is being iterated.
    amounts = {}
    for unit, unit_de in unit_names.items():
        # Drop the trailing 'n'/'e' of the plural to get the stem.
        unit_regex = re.compile(pattern.format(unit=unit_de[:-1]))
        amounts[unit] = sum(map(float, unit_regex.findall(text)))
        # Consume the matched number/unit pairs from the text.
        text = unit_regex.sub('', text)

    text = text.strip()
    duration = timedelta(**amounts) if any(amounts.values()) else None

    return (duration, text)


def extractnumber_de(text, short_scale=True, ordinals=False):
"""
Expand Down
5 changes: 4 additions & 1 deletion lingua_franca/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from lingua_franca.lang.parse_de import extractnumber_de
from lingua_franca.lang.parse_de import extract_numbers_de
from lingua_franca.lang.parse_de import extract_datetime_de
from lingua_franca.lang.parse_de import extract_duration_de
from lingua_franca.lang.parse_de import normalize_de
from lingua_franca.lang.parse_fr import extractnumber_fr
from lingua_franca.lang.parse_fr import extract_numbers_fr
Expand Down Expand Up @@ -200,9 +201,11 @@ def extract_duration(text, lang=None):
return extract_duration_en(text)
if lang_code == "cs":
return extract_duration_cs(text)
if lang_code == "de":
return extract_duration_de(text)

# TODO: extract_duration for other languages
_log_unsupported_language(lang_code, ['en','cs'])
_log_unsupported_language(lang_code, ['en','cs', 'de'])
return None


Expand Down
43 changes: 42 additions & 1 deletion test/test_parse_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@
# limitations under the License.
#
import unittest
from datetime import datetime, time
from datetime import datetime, time, timedelta

from lingua_franca.parse import extract_datetime
from lingua_franca.parse import extract_duration
from lingua_franca.parse import extract_number
from lingua_franca.parse import normalize

Expand Down Expand Up @@ -170,6 +171,46 @@ def test_extractdatetime_default_de(self):
anchor, lang='de-de', default_time=default)
self.assertEqual(default, res[0].time())

def test_extract_duration_de(self):
    """Check German duration extraction through extract_duration().

    Every case asserts the full (timedelta, remaining text) tuple.
    """
    # One unit each, with a whitespace separator.
    self.assertEqual(extract_duration("10 sekunden", lang="de-de"),
                     (timedelta(seconds=10.0), ""))
    self.assertEqual(extract_duration("5 minuten", lang="de-de"),
                     (timedelta(minutes=5), ""))
    self.assertEqual(extract_duration("2 stunden", lang="de-de"),
                     (timedelta(hours=2), ""))
    self.assertEqual(extract_duration("3 tage", lang="de-de"),
                     (timedelta(days=3), ""))
    self.assertEqual(extract_duration("25 wochen", lang="de-de"),
                     (timedelta(weeks=25), ""))

    # Decimal values.
    self.assertEqual(extract_duration("7.5 sekunden", lang="de-de"),
                     (timedelta(seconds=7.5), ""))

    # Surrounding words are returned as the remainder.
    self.assertEqual(extract_duration("starte timer für 30 minuten",
                                      lang="de-de"),
                     (timedelta(minutes=30), "starte timer für"))

    # Several units in one phrase. Only the number/unit spans are removed,
    # so the separators around them stay in the remainder: two spaces
    # before "497 tage" plus one after it leave three spaces before "und".
    self.assertEqual(extract_duration("weck mich in 3 wochen, "
                                      " 497 tage und"
                                      " 391.6 sekunden", lang="de-de"),
                     (timedelta(weeks=3, days=497, seconds=391.6),
                      "weck mich in ,   und"))

    # Hyphen between number and unit.
    self.assertEqual(extract_duration("10-sekunden", lang="de-de"),
                     (timedelta(seconds=10.0), ""))
    self.assertEqual(extract_duration("5-minuten", lang="de-de"),
                     (timedelta(minutes=5), ""))

    # TODO: no German text-to-number parsing yet. Once it exists, add
    # cases with spelled-out numbers such as "sieben stunden"
    # (expected: timedelta(hours=7), ""), including fractional and
    # multi-unit phrases.

def test_spaces(self):
self.assertEqual(normalize(" dies ist ein test", lang="de-de"),
"dies ist 1 test")
Expand Down

0 comments on commit 086b712

Please sign in to comment.