diff --git a/MetadataExtractor/_version.py b/MetadataExtractor/_version.py index a3a9bd5..574c066 100644 --- a/MetadataExtractor/_version.py +++ b/MetadataExtractor/_version.py @@ -1 +1 @@ -__version__ = "0.4.8" +__version__ = "0.4.9" diff --git a/README.md b/README.md index 9b48772..0eb0b67 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,7 @@ The service exposes several endpoints: - `creation_date`: File's creation date. - `modification_date`: File's modification date. - `url`: Download URL of the file. + - `dynamic_url`: URL of a page to download after rendering it in a browser, so that JavaScript-generated content is included. - `file`: The file to be processed. - Returns extracted metadata in the requested format. (JSON, Turtle, RDF/XML, JSON-LD, TriG) diff --git a/installDependencies.sh b/installDependencies.sh index 77f509a..604554c 100644 --- a/installDependencies.sh +++ b/installDependencies.sh @@ -1,6 +1,8 @@ apt-get update apt-get install -y --no-install-recommends git python3-opencv default-jre tesseract-ocr build-essential default-libmysqlclient-dev pkg-config wget libmagic1 libcairo2-dev pip install --no-cache-dir --ignore-installed -r requirements.txt +playwright install chromium +playwright install-deps python -c "import nltk; nltk.download('punkt')" wget -O ./tika-server.jar https://archive.apache.org/dist/tika/2.7.0/tika-server-standard-2.7.0.jar chmod +x run.sh diff --git a/requirements.txt b/requirements.txt index b68aaca..a89eb72 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,6 +14,7 @@ wget>=3.2 chardet>=3.0.4 tika>=2.6.0 urllib3>=1.26.6 +playwright==1.44.0 seaborn>=0.11.2 scikit-learn>=1.2.1 adjustText>=0.7.3 diff --git a/server.py b/server.py index b7a239a..c2accba 100644 --- a/server.py +++ b/server.py @@ -10,6 +10,7 @@ import quopri from urllib.parse import urlparse from urllib.request import urlretrieve, urlopen +from playwright.sync_api import sync_playwright import filedate import logging from defaultConfigs import setDefaultLogging, getDefaultConfig @@ -69,6 +70,7 @@ def 
encoded_words_to_text(encoded_words): location="form", ) parser.add_argument("url", type=str, help="Download URL of file", location="form") +parser.add_argument("dynamic_url", type=str, help="Dynamic download URL of file (JavaScript)", location="form") parser.add_argument("file", type=FileStorage, location="files") parser.add_argument( "accept", @@ -93,6 +95,20 @@ def encoded_words_to_text(encoded_words): }, ) +def writeDynamicUrlToSystem(dynamicUrl, fileName): + """Render a page in Chromium via Playwright and save its HTML to a file. + + Args: + dynamicUrl: URL handed to page.goto(). + fileName: Path of the file that receives page.content(). + """ + with sync_playwright() as p: + browser = p.chromium.launch() + page = browser.new_page() + page.goto(dynamicUrl) + # page.content() is a str that may hold any Unicode character, so encode explicitly instead of relying on the platform default. + with open(fileName, 'w', encoding='utf-8') as file: + file.write(page.content()) @api.route("/") class MetadataExtractorWorker(Resource): @@ -133,6 +149,15 @@ def post(self): if int(site.getheader("Content-Length")) > app.config["MAX_CONTENT_LENGTH"]: return "File too big", 400 urlretrieve(data["url"], fileName) + elif "dynamic_url" in data: + parsedUrl = urlparse(data["dynamic_url"]) + fileIdentifier = os.path.basename(parsedUrl.path) + fileName = os.path.join(folder, fileIdentifier) + dirName = fileName[: fileName.rindex(os.sep)] + if not os.path.exists(dirName): + os.makedirs(dirName) + + writeDynamicUrlToSystem(data["dynamic_url"], fileName) else: return "No file sent", 400