diff --git a/MetadataExtractor/_version.py b/MetadataExtractor/_version.py index 7225152..43a1e95 100644 --- a/MetadataExtractor/_version.py +++ b/MetadataExtractor/_version.py @@ -1 +1 @@ -__version__ = "0.5.2" +__version__ = "0.5.3" diff --git a/README.md b/README.md index c437f4b..b86ac56 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,6 @@ The service exposes several endpoints: - `creation_date`: File's creation date. - `modification_date`: File's modification date. - `url`: Download URL of the file. - - `dynamic_url`: Download URL of the file (loading dynamic JavaScript). - `file`: The file to be processed. - `accept`: The Accept header has to be set (default is JSON, recommended is Turtle) - Returns extracted metadata in the requested format. (JSON, Turtle, RDF/XML, JSON-LD, TriG) diff --git a/installDependencies.sh b/installDependencies.sh index 604554c..77f509a 100644 --- a/installDependencies.sh +++ b/installDependencies.sh @@ -1,8 +1,6 @@ apt-get update apt-get install -y --no-install-recommends git python3-opencv default-jre tesseract-ocr build-essential default-libmysqlclient-dev pkg-config wget libmagic1 libcairo2-dev pip install --no-cache-dir --ignore-installed -r requirements.txt -playwright install chromium -playwright install-deps python -c "import nltk; nltk.download('punkt')" wget -O ./tika-server.jar https://archive.apache.org/dist/tika/2.7.0/tika-server-standard-2.7.0.jar chmod +x run.sh diff --git a/requirements.txt b/requirements.txt index 011f299..8722614 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,7 +14,6 @@ wget>=3.2 chardet>=3.0.4 tika>=2.6.0 urllib3>=1.26.6 -playwright==1.44.0 seaborn>=0.11.2 scikit-learn>=1.2.1 adjustText>=0.7.3 diff --git a/server.py b/server.py index 444f4b1..4fbac89 100644 --- a/server.py +++ b/server.py @@ -11,7 +11,6 @@ import json from urllib.parse import urlparse from urllib.request import urlretrieve, urlopen -from playwright.sync_api import sync_playwright import filedate import logging from defaultConfigs import setDefaultLogging, getDefaultConfig @@ -71,7 +70,6 @@ def encoded_words_to_text(encoded_words): location="form", ) parser.add_argument("url", type=str, help="Download URL of file", location="form") -parser.add_argument("dynamic_url", type=str, help="Dynamic download URL of file (JavaScript)", location="form") parser.add_argument("file", type=FileStorage, location="files") parser.add_argument( "accept", @@ -112,14 +110,6 @@ def mergedicts(dict1, dict2): else: yield (k, dict2[k]) -def writeDynamicUrlToSystem(dynamicUrl, fileName): - with sync_playwright() as p: - browser = p.chromium.launch() - page = browser.new_page() - page.goto(dynamicUrl) - with open(fileName, 'w') as file: - file.write(page.content()) - @api.route("/") class MetadataExtractorWorker(Resource): """Performs the Metadata Extraction""" @@ -162,17 +152,6 @@ def post(self): if int(site.getheader("Content-Length")) > app.config["MAX_CONTENT_LENGTH"]: return "File too big", 400 urlretrieve(data["url"], fileName) - elif "dynamic_url" in data: - parsedUrl = urlparse(data["dynamic_url"]) - fileIdentifier = os.path.basename(parsedUrl.path) - if fileIdentifier == "": - fileIdentifier = "temp.html" - fileName = os.path.join(folder, fileIdentifier) - dirName = fileName[: fileName.rindex(os.sep)] - if not os.path.exists(dirName): - os.makedirs(dirName) - - writeDynamicUrlToSystem(data["dynamic_url"], fileName) else: return "No file sent", 400