Skip to content

Commit

Permalink
New: Add Dynamic Url Server Parameter
Browse files Browse the repository at this point in the history
  • Loading branch information
BenediktHeinrichs committed Jul 1, 2024
1 parent 025b3aa commit c6259bd
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 1 deletion.
2 changes: 1 addition & 1 deletion MetadataExtractor/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.4.8"
__version__ = "0.4.9"
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ The service exposes several endpoints:
- `creation_date`: File's creation date.
- `modification_date`: File's modification date.
- `url`: Download URL of the file.
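- `dynamic_url`: URL of a page whose content is generated by JavaScript; the page is loaded in a browser (Chromium via Playwright) and the rendered HTML is processed as the file.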
- `file`: The file to be processed.
- Returns extracted metadata in the requested format. (JSON, Turtle, RDF/XML, JSON-LD, TriG)
Expand Down
2 changes: 2 additions & 0 deletions installDependencies.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
apt-get update
apt-get install -y --no-install-recommends git python3-opencv default-jre tesseract-ocr build-essential default-libmysqlclient-dev pkg-config wget libmagic1 libcairo2-dev
pip install --no-cache-dir --ignore-installed -r requirements.txt
playwright install chromium
playwright install-deps
python -c "import nltk; nltk.download('punkt')"
wget -O ./tika-server.jar https://archive.apache.org/dist/tika/2.7.0/tika-server-standard-2.7.0.jar
chmod +x run.sh
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ wget>=3.2
chardet>=3.0.4
tika>=2.6.0
urllib3>=1.26.6
playwright==1.44.0
seaborn>=0.11.2
scikit-learn>=1.2.1
adjustText>=0.7.3
Expand Down
18 changes: 18 additions & 0 deletions server.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import quopri
from urllib.parse import urlparse
from urllib.request import urlretrieve, urlopen
from playwright.sync_api import sync_playwright
import filedate
import logging
from defaultConfigs import setDefaultLogging, getDefaultConfig
Expand Down Expand Up @@ -69,6 +70,7 @@ def encoded_words_to_text(encoded_words):
location="form",
)
parser.add_argument("url", type=str, help="Download URL of file", location="form")
parser.add_argument("dynamic_url", type=str, help="Dynamic download URL of file (JavaScript)", location="form")
parser.add_argument("file", type=FileStorage, location="files")
parser.add_argument(
"accept",
Expand All @@ -93,6 +95,13 @@ def encoded_words_to_text(encoded_words):
},
)

def writeDynamicUrlToSystem(dynamicUrl, fileName):
    """Render ``dynamicUrl`` in Chromium and save the resulting HTML.

    The page is opened with Playwright so that JavaScript on the page runs
    before the document is serialized; the serialized HTML is then written
    to ``fileName``.

    Args:
        dynamicUrl: URL of the page to load in the browser.
        fileName: Path of the file the rendered HTML is written to. An
            existing file is overwritten.

    Raises:
        playwright.sync_api.Error: If the browser cannot be started or the
            navigation fails (propagated unchanged from Playwright).
        OSError: If ``fileName`` cannot be opened for writing.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()
            page.goto(dynamicUrl)
            # Grab the markup before touching the file system so a failed
            # render does not leave an empty output file behind.
            content = page.content()
        finally:
            # Release the browser process even when navigation raises.
            browser.close()

    # page.content() returns a Unicode string; write it as UTF-8 explicitly
    # rather than relying on the platform's locale encoding, which may not
    # be able to represent every character on the page.
    with open(fileName, "w", encoding="utf-8") as file:
        file.write(content)

@api.route("/")
class MetadataExtractorWorker(Resource):
Expand Down Expand Up @@ -133,6 +142,15 @@ def post(self):
if int(site.getheader("Content-Length")) > app.config["MAX_CONTENT_LENGTH"]:
return "File too big", 400
urlretrieve(data["url"], fileName)
elif "dynamic_url" in data:
parsedUrl = urlparse(data["dynamic_url"])
fileIdentifier = os.path.basename(parsedUrl.path)
fileName = os.path.join(folder, fileIdentifier)
dirName = fileName[: fileName.rindex(os.sep)]
if not os.path.exists(dirName):
os.makedirs(dirName)

writeDynamicUrlToSystem(data["dynamic_url"], fileName)
else:
return "No file sent", 400

Expand Down

0 comments on commit c6259bd

Please sign in to comment.