Skip to content

Commit

Permalink
feat/ german json normalizer (MycroftAI#22)
Browse files Browse the repository at this point in the history
Co-authored-by: jarbasal <jarbasai@mailfence.com>
  • Loading branch information
JarbasAl and JarbasAl committed May 9, 2021
1 parent 2812840 commit 66fd95b
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 64 deletions.
72 changes: 8 additions & 64 deletions lingua_nostra/lang/parse_de.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
# limitations under the License.
#
import re
import json
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from lingua_nostra.lang.parse_common import is_numeric, look_for_fractions, \
Expand All @@ -22,69 +23,7 @@
from lingua_nostra.lang.format_de import pronounce_number_de
from lingua_nostra.time import now_local
from lingua_nostra.parse import normalize_decimals


# Lookup table from spelled-out German number words to their integer values.
# Only the words listed here are covered; the table is not exhaustive.
de_numbers = {
    # zero, and the inflected forms of "one"
    "null": 0,
    "ein": 1, "eins": 1, "eine": 1, "einer": 1,
    "einem": 1, "einen": 1, "eines": 1,
    # 2 - 12
    "zwei": 2, "drei": 3, "vier": 4, "fünf": 5, "sechs": 6,
    "sieben": 7, "acht": 8, "neun": 9, "zehn": 10, "elf": 11,
    "zwölf": 12,
    # 13 - 19
    "dreizehn": 13, "vierzehn": 14, "fünfzehn": 15, "sechzehn": 16,
    "siebzehn": 17, "achtzehn": 18, "neunzehn": 19,
    # 20 - 31
    "zwanzig": 20,
    "einundzwanzig": 21, "zweiundzwanzig": 22, "dreiundzwanzig": 23,
    "vierundzwanzig": 24, "fünfundzwanzig": 25, "sechsundzwanzig": 26,
    "siebenundzwanzig": 27, "achtundzwanzig": 28, "neunundzwanzig": 29,
    "dreißig": 30,
    "einunddreißig": 31,
    # remaining tens
    "vierzig": 40, "fünfzig": 50, "sechzig": 60,
    "siebzig": 70, "achtzig": 80, "neunzig": 90,
    # hundreds
    "hundert": 100,
    "zweihundert": 200, "dreihundert": 300, "vierhundert": 400,
    "fünfhundert": 500, "sechshundert": 600, "siebenhundert": 700,
    "achthundert": 800, "neunhundert": 900,
    # large magnitudes
    "tausend": 1000,
    "million": 1000000,
}

# TODO: short_scale and ordinals don't do anything here.
# The parameters are present in the function signature for API compatibility
# reasons.
from lingua_nostra.internal import resolve_resource_file


def extract_duration_de(text):
Expand Down Expand Up @@ -1039,4 +978,9 @@ def extract_numbers_de(text, short_scale=True, ordinals=False, decimal='.'):


class GermanNormalizer(Normalizer):
    """Normalizer for German text, configured from a JSON resource.

    The class attribute ``_default_config`` is populated once, when this
    class body executes, by parsing the ``text/de-de/normalize.json``
    resource located through ``resolve_resource_file``.
    """
    # The resource holds non-ASCII German words (e.g. "fünf", "dreißig"),
    # so decode it explicitly as UTF-8 rather than relying on the platform's
    # locale-dependent default encoding.
    with open(resolve_resource_file("text/de-de/normalize.json"),
              encoding="utf-8") as f:
        _default_config = json.load(f)


def normalize_de(text, remove_articles=True):
    """Normalize German text with a freshly created ``GermanNormalizer``.

    Args:
        text: the text to normalize
        remove_articles: forwarded as the second positional argument of
            ``GermanNormalizer.normalize``

    Returns:
        the result of ``GermanNormalizer.normalize`` for these arguments
    """
    normalizer = GermanNormalizer()
    return normalizer.normalize(text, remove_articles)
80 changes: 80 additions & 0 deletions lingua_nostra/res/text/de-de/normalize.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
{
"lowercase": false,
"numbers_to_digits": true,
"expand_contractions": true,
"remove_symbols": false,
"remove_accents": false,
"remove_articles": false,
"remove_stopwords": false,
"contractions": {
"am": "an dem",
"ans": "an das",
"aufs": "auf das",
"beim": "bei dem",
"durchs": "durch das",
"fürs": "für das",
"hinterm": "hinter dem",
"ins": "in das",
"im": "in dem",
"übers": "über das",
"unters": "unter das",
"unterm": "unter dem",
"vom": "von dem",
"vors": "vor das",
"vorm": "vor dem",
"zum": "zu dem"
},
"word_replacements": {
"m": "meter",
"km": "kilometer"
},
"number_replacements": {
"null": "0",
"ein": "1",
"eins": "1",
"eine": "1",
"einer": "1",
"einem": "1",
"einen": "1",
"eines": "1",
"zwei": "2",
"drei": "3",
"vier": "4",
"fünf": "5",
"sechs": "6",
"sieben": "7",
"acht": "8",
"neun": "9",
"zehn": "10",
"elf": "11",
"zwölf": "12",
"dreizehn": "13",
"vierzehn": "14",
"fünfzehn": "15",
"sechzehn": "16",
"siebzehn": "17",
"achtzehn": "18",
"neunzehn": "19",
"zwanzig": "20",
"einundzwanzig": "21",
"zweiundzwanzig": "22",
"dreiundzwanzig": "23",
"vierundzwanzig": "24",
"dreißig": "30",
"vierzig": "40",
"fünfzig": "50",
"sechzig": "60",
"siebzig": "70",
"achtzig": "80",
"neunzig": "90"
},
"stopwords": [],
"articles": [
"der",
"die",
"das",
"dem",
"den",
"des"
]
}

0 comments on commit 66fd95b

Please sign in to comment.