From 66fd95b6cbee29d8fe69b4052d98a8a5b13ed343 Mon Sep 17 00:00:00 2001
From: JarbasAI <33701864+JarbasAl@users.noreply.github.com>
Date: Sun, 9 May 2021 14:48:32 +0100
Subject: [PATCH] feat/ german json normalizer (#22)

Co-authored-by: jarbasal
---
 lingua_nostra/lang/parse_de.py              | 78 ++++----------------
 lingua_nostra/res/text/de-de/normalize.json | 80 +++++++++++++++++++++
 2 files changed, 94 insertions(+), 64 deletions(-)
 create mode 100644 lingua_nostra/res/text/de-de/normalize.json

diff --git a/lingua_nostra/lang/parse_de.py b/lingua_nostra/lang/parse_de.py
index ba9101ff..02d525b8 100644
--- a/lingua_nostra/lang/parse_de.py
+++ b/lingua_nostra/lang/parse_de.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 #
 import re
+import json
 from datetime import datetime, timedelta
 from dateutil.relativedelta import relativedelta
 from lingua_nostra.lang.parse_common import is_numeric, look_for_fractions, \
@@ -22,69 +23,7 @@ from lingua_nostra.lang.format_de import pronounce_number_de
 from lingua_nostra.time import now_local
 from lingua_nostra.parse import normalize_decimals
 
-
-
-de_numbers = {
-    'null': 0,
-    'ein': 1,
-    'eins': 1,
-    'eine': 1,
-    'einer': 1,
-    'einem': 1,
-    'einen': 1,
-    'eines': 1,
-    'zwei': 2,
-    'drei': 3,
-    'vier': 4,
-    'fünf': 5,
-    'sechs': 6,
-    'sieben': 7,
-    'acht': 8,
-    'neun': 9,
-    'zehn': 10,
-    'elf': 11,
-    'zwölf': 12,
-    'dreizehn': 13,
-    'vierzehn': 14,
-    'fünfzehn': 15,
-    'sechzehn': 16,
-    'siebzehn': 17,
-    'achtzehn': 18,
-    'neunzehn': 19,
-    'zwanzig': 20,
-    'einundzwanzig': 21,
-    'zweiundzwanzig': 22,
-    'dreiundzwanzig': 23,
-    'vierundzwanzig': 24,
-    'fünfundzwanzig': 25,
-    'sechsundzwanzig': 26,
-    'siebenundzwanzig': 27,
-    'achtundzwanzig': 28,
-    'neunundzwanzig': 29,
-    'dreißig': 30,
-    'einunddreißig': 31,
-    'vierzig': 40,
-    'fünfzig': 50,
-    'sechzig': 60,
-    'siebzig': 70,
-    'achtzig': 80,
-    'neunzig': 90,
-    'hundert': 100,
-    'zweihundert': 200,
-    'dreihundert': 300,
-    'vierhundert': 400,
-    'fünfhundert': 500,
-    'sechshundert': 600,
-    'siebenhundert': 700,
-    'achthundert': 800,
-    'neunhundert': 900,
-    'tausend': 1000,
-    'million': 1000000
-}
-
-# TODO: short_scale and ordinals don't do anything here.
-# The parameters are present in the function signature for API compatibility
-# reasons.
+from lingua_nostra.internal import resolve_resource_file
 
 
 def extract_duration_de(text):
@@ -1039,4 +978,15 @@ def extract_numbers_de(text, short_scale=True, ordinals=False, decimal='.'):
 
 
 class GermanNormalizer(Normalizer):
-    """ TODO implement language specific normalizer"""
+    """Normalizer whose defaults are read from text/de-de/normalize.json."""
+
+    # The resource contains umlauts and "ß"; decode it explicitly as UTF-8
+    # rather than relying on the platform's locale default encoding.
+    with open(resolve_resource_file("text/de-de/normalize.json"),
+              encoding="utf-8") as f:
+        _default_config = json.load(f)
+
+
+def normalize_de(text, remove_articles=True):
+    """Return text normalized by a fresh GermanNormalizer instance."""
+    return GermanNormalizer().normalize(text, remove_articles)
diff --git a/lingua_nostra/res/text/de-de/normalize.json b/lingua_nostra/res/text/de-de/normalize.json
new file mode 100644
index 00000000..020e8e05
--- /dev/null
+++ b/lingua_nostra/res/text/de-de/normalize.json
@@ -0,0 +1,80 @@
+{
+    "lowercase": false,
+    "numbers_to_digits": true,
+    "expand_contractions": true,
+    "remove_symbols": false,
+    "remove_accents": false,
+    "remove_articles": false,
+    "remove_stopwords": false,
+    "contractions": {
+        "am": "an dem",
+        "ans": "an das",
+        "aufs": "auf das",
+        "beim": "bei dem",
+        "durchs": "durch das",
+        "fürs": "für das",
+        "hinterm": "hinter dem",
+        "ins": "in das",
+        "im": "in dem",
+        "übers": "über das",
+        "unters": "unter das",
+        "unterm": "unter dem",
+        "vom": "von dem",
+        "vors": "vor das",
+        "vorm": "vor dem",
+        "zum": "zu dem"
+    },
+    "word_replacements": {
+        "m": "meter",
+        "km": "kilometer"
+    },
+    "number_replacements": {
+        "null": "0",
+        "ein": "1",
+        "eins": "1",
+        "eine": "1",
+        "einer": "1",
+        "einem": "1",
+        "einen": "1",
+        "eines": "1",
+        "zwei": "2",
+        "drei": "3",
+        "vier": "4",
+        "fünf": "5",
+        "sechs": "6",
+        "sieben": "7",
+        "acht": "8",
+        "neun": "9",
+        "zehn": "10",
+        "elf": "11",
+        "zwölf": "12",
+        "dreizehn": "13",
+        "vierzehn": "14",
+        "fünfzehn": "15",
+        "sechzehn": "16",
+        "siebzehn": "17",
+        "achtzehn": "18",
+        "neunzehn": "19",
+        "zwanzig": "20",
+        "einundzwanzig": "21",
+        "zweiundzwanzig": "22",
+        "dreiundzwanzig": "23",
+        "vierundzwanzig": "24",
+        "dreißig": "30",
+        "vierzig": "40",
+        "fünfzig": "50",
+        "sechzig": "60",
+        "siebzig": "70",
+        "achtzig": "80",
+        "neunzig": "90"
+    },
+    "stopwords": [],
+    "articles": [
+        "der",
+        "die",
+        "das",
+        "dem",
+        "den",
+        "des"
+    ]
+}