From 13963bb66d031da6453113fa8bf734aa9c57e5a9 Mon Sep 17 00:00:00 2001 From: Benedikt Heinrichs <49311227+BenediktHeinrichs@users.noreply.github.com> Date: Wed, 3 Jul 2024 09:10:11 +0000 Subject: [PATCH] Improved Config Overwriting --- .../Extractors/Text/SummaryExtract.py | 2 +- MetadataExtractor/_version.py | 2 +- README.md | 3 ++- server.py | 21 +++++++++++++++++-- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/MetadataExtractor/Extractors/Text/SummaryExtract.py b/MetadataExtractor/Extractors/Text/SummaryExtract.py index d389c1a..793c603 100644 --- a/MetadataExtractor/Extractors/Text/SummaryExtract.py +++ b/MetadataExtractor/Extractors/Text/SummaryExtract.py @@ -29,7 +29,7 @@ def text_extract(self, text, fileInfo): ) ], "values": [ - {"predicate": "text:summarizedBy", "object": formattedSummary} + {"predicate": "text:summary", "object": formattedSummary} ], }, ) diff --git a/MetadataExtractor/_version.py b/MetadataExtractor/_version.py index 574c066..3d18726 100644 --- a/MetadataExtractor/_version.py +++ b/MetadataExtractor/_version.py @@ -1 +1 @@ -__version__ = "0.4.9" +__version__ = "0.5.0" diff --git a/README.md b/README.md index 0eb0b67..5529a71 100644 --- a/README.md +++ b/README.md @@ -67,12 +67,13 @@ The service exposes several endpoints: - This endpoint accepts form-data with a download url or a file along with optional parameters: - `identifier`: A unique identifier for the file. - - `config`: Configuration object for extraction settings. + - `config`: Configuration object for extraction settings. (Example value: `{ "Extractors": { "Text": [ "SummaryExtract" ] } }`) - `creation_date`: File's creation date. - `modification_date`: File's modification date. - `url`: Download URL of the file. - `dynamic_url`: Download URL of the file (loading dynamic JavaScript). - `file`: The file to be processed. + - `accept`: The Accept header has to be set (default is JSON, recommended is Turtle) - Returns extracted metadata in the requested format. 
# From https://stackoverflow.com/a/7205672
def mergedicts(dict1, dict2):
    """Recursively merge two dictionaries, yielding ``(key, value)`` pairs.

    Wrap the call in ``dict(...)`` to obtain the merged mapping.  Neither
    input is modified.

    Merge rules, applied per key:

    * key only in one input -> that input's value is yielded unchanged;
    * key in both and both values are dicts -> the values are merged
      recursively into a new dict;
    * key in both otherwise -> the value from ``dict2`` wins.

    Pairs are yielded in a deterministic order: every key of ``dict1`` in
    its insertion order, followed by the keys that exist only in ``dict2``
    in their insertion order.  (Iterating a ``set`` of the keys, as the
    original recipe does, makes the order depend on string hashing and
    therefore vary between interpreter runs.)

    Args:
        dict1: Base mapping (e.g. the default configuration).
        dict2: Overriding mapping whose values take precedence.

    Yields:
        ``(key, value)`` tuples describing the merged mapping.
    """
    for k in dict1:
        if k not in dict2:
            # Nothing to override: keep the base value as-is.
            yield (k, dict1[k])
        elif isinstance(dict1[k], dict) and isinstance(dict2[k], dict):
            # Both sides are mappings, so descend and merge them key by key.
            yield (k, dict(mergedicts(dict1[k], dict2[k])))
        else:
            # If one of the values is not a dict, merging cannot continue;
            # the value from the second dict overrides the first.
            yield (k, dict2[k])
    for k in dict2:
        # Keys introduced only by the overriding mapping come last.
        if k not in dict1:
            yield (k, dict2[k])