Skip to content

Commit

Permalink
reduce runtime for duration parsing (#124)
Browse files Browse the repository at this point in the history
  • Loading branch information
maxbachmann authored Oct 9, 2020
1 parent 2e1cb15 commit 30e084d
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 38 deletions.
36 changes: 17 additions & 19 deletions lingua_franca/lang/parse_cs.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,31 +633,29 @@ def extract_duration_cs(text):
# Czech inflection for time: minuta,minuty,minut - safe to use minut as pattern
# For day: den, dny, dnů - short pattern not applicable, list all

time_units_en = {
'microseconds': None,
'milliseconds': None,
'seconds': None,
'minutes': None,
'hours': None,
'days': None,
'weeks': None
}
time_units = {
'microseconds': 0,
'milliseconds': 0,
'seconds': 0,
'minutes': 0,
'hours': 0,
'days': 0,
'weeks': 0
}

pattern = r"(?P<value>\d+(?:\.?\d+)?)(?:\s+|\-){unit}[ay]?"
text = _convert_words_to_numbers_cs(text)

for unit in _TIME_UNITS_CONVERSION:
unit_pattern = pattern.format(unit=unit)
matches = re.findall(unit_pattern, text)
value = sum(map(float, matches))
unit_en= _TIME_UNITS_CONVERSION.get(unit) # Find unit in english
# Check if it is necessary to write value - handle (days, weeks)
if time_units_en[unit_en] is None or time_units_en.get(unit_en) == 0:
time_units_en[unit_en] = value # Write value to english unit
text = re.sub(unit_pattern, '', text)
for (unit_cs, unit_en) in _TIME_UNITS_CONVERSION.items():
unit_pattern = pattern.format(unit=unit_cs)

def repl(match):
time_units[unit_en] += float(match.group(1))
return ''
text = re.sub(unit_pattern, repl, text)

text = text.strip()
duration = timedelta(**time_units_en) if any(time_units_en.values()) else None
duration = timedelta(**time_units) if any(time_units.values()) else None

return (duration, text)

Expand Down
13 changes: 7 additions & 6 deletions lingua_franca/lang/parse_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,13 +124,14 @@ def extract_duration_de(text):
#### TODO Einstiegspunkt für Text-zu-Zahlen Konversion
#text = _convert_words_to_numbers_de(text)

for unit in time_units:
unit_de = time_units[unit]
for (unit_en, unit_de) in time_units.items():
unit_pattern = pattern.format(unit=unit_de[:-1]) # remove 'n'/'e' from unit
matches = re.findall(unit_pattern, text)
value = sum(map(float, matches))
time_units[unit] = value
text = re.sub(unit_pattern, '', text)
time_units[unit_en] = 0

def repl(match):
time_units[unit_en] += float(match.group(1))
return ''
text = re.sub(unit_pattern, repl, text)

text = text.strip()
duration = timedelta(**time_units) if any(time_units.values()) else None
Expand Down
27 changes: 14 additions & 13 deletions lingua_franca/lang/parse_en.py
Original file line number Diff line number Diff line change
Expand Up @@ -610,24 +610,25 @@ def extract_duration_en(text):
return None

time_units = {
'microseconds': None,
'milliseconds': None,
'seconds': None,
'minutes': None,
'hours': None,
'days': None,
'weeks': None
'microseconds': 0,
'milliseconds': 0,
'seconds': 0,
'minutes': 0,
'hours': 0,
'days': 0,
'weeks': 0
}

pattern = r"(?P<value>\d+(?:\.?\d+)?)(?:\s+|\-){unit}s?"
text = _convert_words_to_numbers_en(text)

for unit in time_units:
unit_pattern = pattern.format(unit=unit[:-1]) # remove 's' from unit
matches = re.findall(unit_pattern, text)
value = sum(map(float, matches))
time_units[unit] = value
text = re.sub(unit_pattern, '', text)
for unit_en in time_units:
unit_pattern = pattern.format(unit=unit_en[:-1]) # remove 's' from unit

def repl(match):
time_units[unit_en] += float(match.group(1))
return ''
text = re.sub(unit_pattern, repl, text)

text = text.strip()
duration = timedelta(**time_units) if any(time_units.values()) else None
Expand Down

0 comments on commit 30e084d

Please sign in to comment.