Skip to content

Commit

Permalink
Remove dynamic_url again
Browse files Browse the repository at this point in the history
  • Loading branch information
BenediktHeinrichs committed Jul 17, 2024
1 parent 7996bee commit f6e7417
Show file tree
Hide file tree
Showing 5 changed files with 1 addition and 26 deletions.
2 changes: 1 addition & 1 deletion MetadataExtractor/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.5.2"
__version__ = "0.5.3"
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ The service exposes several endpoints:
- `creation_date`: File's creation date.
- `modification_date`: File's modification date.
- `url`: Download URL of the file.
- `dynamic_url`: Download URL of the file (loading dynamic JavaScript).
- `file`: The file to be processed.
- `accept`: The Accept header has to be set (default is JSON, recommended is Turtle)
- Returns extracted metadata in the requested format. (JSON, Turtle, RDF/XML, JSON-LD, TriG)
Expand Down
2 changes: 0 additions & 2 deletions installDependencies.sh
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
# Install the system and Python dependencies, then fetch the runtime assets.
# The steps are order-dependent: system packages first, then the Python
# requirements, then the tools those requirements provide.

# Refresh the package index and install the system packages.
apt-get update
apt-get install -y --no-install-recommends git python3-opencv default-jre tesseract-ocr build-essential default-libmysqlclient-dev pkg-config wget libmagic1 libcairo2-dev
# Install the Python requirements without using the pip cache.
pip install --no-cache-dir --ignore-installed -r requirements.txt
# Download the Chromium build used by Playwright and its OS-level dependencies.
playwright install chromium
playwright install-deps
# Download the NLTK "punkt" resource.
python -c "import nltk; nltk.download('punkt')"
# Fetch the Apache Tika 2.7.0 standard server jar into the working directory.
wget -O ./tika-server.jar https://archive.apache.org/dist/tika/2.7.0/tika-server-standard-2.7.0.jar
# Make the start script executable.
chmod +x run.sh
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ wget>=3.2
chardet>=3.0.4
tika>=2.6.0
urllib3>=1.26.6
playwright==1.44.0
seaborn>=0.11.2
scikit-learn>=1.2.1
adjustText>=0.7.3
Expand Down
21 changes: 0 additions & 21 deletions server.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
import json
from urllib.parse import urlparse
from urllib.request import urlretrieve, urlopen
from playwright.sync_api import sync_playwright
import filedate
import logging
from defaultConfigs import setDefaultLogging, getDefaultConfig
Expand Down Expand Up @@ -71,7 +70,6 @@ def encoded_words_to_text(encoded_words):
location="form",
)
parser.add_argument("url", type=str, help="Download URL of file", location="form")
parser.add_argument("dynamic_url", type=str, help="Dynamic download URL of file (JavaScript)", location="form")
parser.add_argument("file", type=FileStorage, location="files")
parser.add_argument(
"accept",
Expand Down Expand Up @@ -112,14 +110,6 @@ def mergedicts(dict1, dict2):
else:
yield (k, dict2[k])

def writeDynamicUrlToSystem(dynamicUrl, fileName):
    """Load ``dynamicUrl`` in Chromium via Playwright and save the page HTML.

    The page is opened in a browser so that the content returned by
    ``page.content()`` is the document as the browser holds it after
    navigation, rather than the raw HTTP response body.

    Args:
        dynamicUrl: URL to navigate to.
        fileName: Path of the file the HTML is written to (overwritten if
            it already exists).

    Raises:
        Whatever Playwright raises when launching the browser or navigating
        fails; in that case no output file is created.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch()
        try:
            page = browser.new_page()
            page.goto(dynamicUrl)
            html = page.content()
        finally:
            # Release the browser even when navigation fails.
            browser.close()
    # Write only after the content was fetched, so a failed navigation does
    # not leave an empty file behind. The encoding is explicit because the
    # platform default may be unable to represent characters in the page.
    with open(fileName, "w", encoding="utf-8") as file:
        file.write(html)

@api.route("/")
class MetadataExtractorWorker(Resource):
"""Performs the Metadata Extraction"""
Expand Down Expand Up @@ -162,17 +152,6 @@ def post(self):
if int(site.getheader("Content-Length")) > app.config["MAX_CONTENT_LENGTH"]:
return "File too big", 400
urlretrieve(data["url"], fileName)
elif "dynamic_url" in data:
parsedUrl = urlparse(data["dynamic_url"])
fileIdentifier = os.path.basename(parsedUrl.path)
if fileIdentifier == "":
fileIdentifier = "temp.html"
fileName = os.path.join(folder, fileIdentifier)
dirName = fileName[: fileName.rindex(os.sep)]
if not os.path.exists(dirName):
os.makedirs(dirName)

writeDynamicUrlToSystem(data["dynamic_url"], fileName)
else:
return "No file sent", 400

Expand Down

0 comments on commit f6e7417

Please sign in to comment.