Skip to content

Commit

Permalink
Improved Config Overwriting
Browse files Browse the repository at this point in the history
  • Loading branch information
BenediktHeinrichs committed Jul 3, 2024
1 parent fed9aa5 commit 13963bb
Show file tree
Hide file tree
Showing 4 changed files with 23 additions and 5 deletions.
2 changes: 1 addition & 1 deletion MetadataExtractor/Extractors/Text/SummaryExtract.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def text_extract(self, text, fileInfo):
)
],
"values": [
{"predicate": "text:summarizedBy", "object": formattedSummary}
{"predicate": "text:summary", "object": formattedSummary}
],
},
)
Expand Down
2 changes: 1 addition & 1 deletion MetadataExtractor/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.4.9"
__version__ = "0.5.0"
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,13 @@ The service exposes several endpoints:
- This endpoint accepts form-data with a download url or a file along with optional parameters:
- `identifier`: A unique identifier for the file.
- `config`: Configuration object for extraction settings.
- `config`: Configuration object for extraction settings. (Example value: `{ "Extractors": { "Text": [ "SummaryExtract" ] } }`)
- `creation_date`: File's creation date.
- `modification_date`: File's modification date.
- `url`: Download URL of the file.
- `dynamic_url`: Download URL of the file (loading dynamic JavaScript).
- `file`: The file to be processed.
- `accept`: The Accept header has to be set (default is JSON, recommended is Turtle).
- Returns extracted metadata in the requested format. (JSON, Turtle, RDF/XML, JSON-LD, TriG)
### GET /defaultConfig
Expand Down
21 changes: 19 additions & 2 deletions server.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import re
import base64
import quopri
import json
from urllib.parse import urlparse
from urllib.request import urlretrieve, urlopen
from playwright.sync_api import sync_playwright
Expand Down Expand Up @@ -54,7 +55,7 @@ def encoded_words_to_text(encoded_words):
parser.add_argument(
"config",
type=object,
help='Object defining the utilized configuration (try "/defaultConfig" to get the structure)',
help='Object defining the overwriting configuration (try "/defaultConfig" to get the structure)',
location="form",
)
parser.add_argument(
Expand Down Expand Up @@ -95,6 +96,22 @@ def encoded_words_to_text(encoded_words):
},
)

# From https://stackoverflow.com/a/7205672
def mergedicts(dict1, dict2):
    """Recursively merge two dicts, yielding ``(key, value)`` pairs.

    This is a generator; wrap the call in ``dict(...)`` to obtain the merged
    mapping. Neither input is modified.

    Merge rules, applied per key:
      * key in both and both values are dicts -> merged recursively
        (the nested result is a new ``dict``);
      * key in both otherwise -> the value from ``dict2`` overrides;
      * key in only one input -> that value is passed through unchanged.

    Keys are yielded in a deterministic order: first every key of ``dict1``
    in its insertion order, then the keys that exist only in ``dict2`` in
    their insertion order. (Iterating over a set union of the keys would
    make the order depend on string hashing, which varies between runs.)

    Args:
        dict1: Base mapping (e.g. the default configuration).
        dict2: Overriding mapping whose values win on conflicts.

    Yields:
        Tuples ``(key, merged_value)``.
    """
    for key, base_value in dict1.items():
        if key not in dict2:
            # Only present in the base mapping: keep as is.
            yield (key, base_value)
            continue

        override_value = dict2[key]
        if isinstance(base_value, dict) and isinstance(override_value, dict):
            # Both sides are dicts, so descend and merge them key by key.
            yield (key, dict(mergedicts(base_value, override_value)))
        else:
            # If one of the values is not a dict, merging cannot continue;
            # the value from the second dict overrides the first.
            yield (key, override_value)

    # Keys that only the overriding mapping defines are appended afterwards.
    for key, override_value in dict2.items():
        if key not in dict1:
            yield (key, override_value)

def writeDynamicUrlToSystem(dynamicUrl, fileName):
with sync_playwright() as p:
browser = p.chromium.launch()
Expand Down Expand Up @@ -181,7 +198,7 @@ def post(self):
pipelineInput.append({"identifier": identifier, "file": fileName})

if "config" in data:
config = data["config"]
config = dict(mergedicts(getDefaultConfig(), json.loads(data["config"])))
else:
config = getDefaultConfig()
if "Settings" not in config:
Expand Down

0 comments on commit 13963bb

Please sign in to comment.