Skip to content

Commit

Permalink
Add ECVA support; more testing is needed
Browse files Browse the repository at this point in the history
  • Loading branch information
MarkHershey committed Oct 14, 2024
1 parent 9d4ec93 commit 873c9c9
Show file tree
Hide file tree
Showing 7 changed files with 292 additions and 17 deletions.
43 changes: 29 additions & 14 deletions arxiv_dl/arxiv_dl.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
get_download_dest,
process_arxiv_target,
process_cvf_target,
process_ecva_target,
process_nips_target,
process_openreview_target,
)
Expand All @@ -19,6 +20,7 @@
from .scrapers import (
scrape_metadata_arxiv,
scrape_metadata_cvf,
scrape_metadata_ecva,
scrape_metadata_nips,
scrape_metadata_openreview,
)
Expand Down Expand Up @@ -76,6 +78,8 @@ def main(
paper_data: PaperData = process_nips_target(target)
elif "openreview.net" in target: # assume target is an OpenReview URL
paper_data: PaperData = process_openreview_target(target)
elif "ecva.net" in target: # assume target is an ECCV URL
paper_data: PaperData = process_ecva_target(target)
elif target.endswith(".pdf"): # assume target is a PDF file
# TODO: download the pdf file only
...
Expand All @@ -88,18 +92,24 @@ def main(

# start scraping from source website
try:
if paper_data.src_website == "ArXiv":
scrape_metadata_arxiv(paper_data)
elif paper_data.src_website == "CVF":
scrape_metadata_cvf(paper_data)
elif paper_data.src_website == "NeurIPS":
scrape_metadata_nips(paper_data)
elif paper_data.src_website == "OpenReview":
scrape_metadata_openreview(paper_data)
if paper_data.abs_url:
if paper_data.src_website == "ArXiv":
scrape_metadata_arxiv(paper_data)
elif paper_data.src_website == "CVF":
scrape_metadata_cvf(paper_data)
elif paper_data.src_website == "ECVA":
scrape_metadata_ecva(paper_data)
elif paper_data.src_website == "NeurIPS":
scrape_metadata_nips(paper_data)
elif paper_data.src_website == "OpenReview":
scrape_metadata_openreview(paper_data)
else:
# TODO: check here
logger.error(f"Invalid source website: '{paper_data.src_website}'")
return False
else:
# TODO: check here
logger.error(f"Invalid source website: '{paper_data.src_website}'")
return False
# TODO: think how to handle this; maybe do nothing
logger.warning("[Warn] No abstract URL")
except Exception as err:
logger.exception(err)
logger.error("[Abort] Error while getting paper")
Expand All @@ -112,9 +122,13 @@ def main(

# download paper
try:
download_pdf(
paper_data, download_dir=download_dir, parallel_connections=n_threads
)
if paper_data.pdf_url:
download_pdf(
paper_data, download_dir=download_dir, parallel_connections=n_threads
)
else:
# TODO: think how to handle this; maybe improve error message
logger.warning("[Warn] No PDF URL found")
except Exception as err:
logger.exception(err)
logger.error("[Abort] Error while downloading paper")
Expand Down Expand Up @@ -143,6 +157,7 @@ def main(
if __name__ == "__main__":
root_dir = Path(__file__).resolve().parent.parent
tmp_dir = root_dir / "tmp"
tmp_dir.mkdir(exist_ok=True)

from puts import timeitprint

Expand Down
4 changes: 3 additions & 1 deletion arxiv_dl/dl_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

# --- workarounds for Python misbehavior ---


# enable passing unicode arguments from command line in Python 2.x
# https://stackoverflow.com/questions/846850/read-unicode-characters
def win32_utf8_argv():
Expand Down Expand Up @@ -446,7 +447,6 @@ def bar_adaptive(current, total, width=80):
# render
output = ""
for field in selected:

if field == "percent":
# fixed size width for percentage
output += ("%s%%" % (100 * current // total)).rjust(min_width["percent"])
Expand All @@ -468,6 +468,8 @@ def bar_adaptive(current, total, width=80):


__current_size = 0 # global state variable, which exists solely as a


# workaround against Python 3.3.0 regression
# http://bugs.python.org/issue16409
# fixed in Python 3.3.1
Expand Down
70 changes: 70 additions & 0 deletions arxiv_dl/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,76 @@ def process_cvf_target(target: str) -> PaperData:
)


###############################################################################
### ECVA (ECCV) Helper Functions


def process_ecva_target(target: str) -> PaperData:
    """Build a ``PaperData`` record from an ECVA (ECCV) paper URL.

    Two URL shapes are recognised, for example:

    * abstract page:
      https://www.ecva.net/papers/eccv_2024/papers_ECCV/html/6863_ECCV_2024_paper.php
    * PDF file:
      https://www.ecva.net/papers/eccv_2024/papers_ECCV/papers/06863.pdf

    Args:
        target: URL that has ``www.ecva.net`` as one of its ``/``-separated
            tokens.

    Returns:
        A ``PaperData`` with ``src_website``, ``paper_venue`` and ``year``
        set, plus whichever of ``paper_id``, ``abs_url`` and ``pdf_url`` can
        be derived from ``target``. For ECCV 2020/2022 abstract pages the
        ``pdf_url`` is left unset, and for PDF links from 2022 or earlier
        the ``abs_url`` and ``paper_id`` are left unset.

    Raises:
        AssertionError: if ``target`` does not contain ``www.ecva.net`` or
            the file name lacks the expected ``.php`` / ``.pdf`` extension.
        Exception: if the URL layout matches none of the known patterns.
            ``ValueError`` / ``IndexError`` may also propagate when the URL
            is too short or the year token is not of the form ``eccv_YYYY``.
    """
    assert "www.ecva.net" in target
    paper_data = PaperData(
        src_website="ECVA",
        paper_venue="ECCV",
    )

    # Drop everything in front of the host name so that the remaining
    # tokens sit at fixed positions:
    #   0 www.ecva.net
    #   1 papers
    #   2 eccv_2024
    #   3 papers_ECCV
    #   4 html / papers
    #   5 xxxxxxxxxxxx.php / xxxxx.pdf
    tokens = target.split("/")
    tokens = tokens[tokens.index("www.ecva.net"):]

    year = int(tokens[2].split("_")[1])
    paper_data.year = year

    papers_root = f"https://www.ecva.net/papers/eccv_{year}/papers_ECCV"
    section = tokens[4]

    if section == "html" and year >= 2024:
        assert tokens[5].endswith(".php")
        # PDF file names use the numeric id zero-padded to five digits.
        paper_id = tokens[5].split("_")[0].zfill(5)
        paper_data.paper_id = paper_id
        paper_data.abs_url = target
        paper_data.pdf_url = f"{papers_root}/papers/{paper_id}.pdf"
    elif section == "html" and year == 2018:
        assert tokens[5].endswith(".php")
        # 2018 pages share their base name with the PDF; just drop ".php".
        paper_id = tokens[5][:-4]
        paper_data.paper_id = paper_id
        paper_data.abs_url = target
        paper_data.pdf_url = f"{papers_root}/papers/{paper_id}.pdf"
    elif section == "html" and year <= 2022:
        assert tokens[5].endswith(".php")
        paper_data.paper_id = tokens[5].split("_")[0].zfill(5)
        paper_data.abs_url = target
        # unable to infer pdf_url from abs_url for ECCV 2022 and 2020
    elif section == "papers" and year >= 2024:
        assert tokens[5].endswith(".pdf")
        padded_id = tokens[5].split(".")[0]
        paper_data.paper_id = padded_id
        # Abstract page names use the id without leading zeros. The year in
        # the file name follows the edition instead of being fixed to 2024.
        short_id = padded_id.lstrip("0")
        paper_data.abs_url = f"{papers_root}/html/{short_id}_ECCV_{year}_paper.php"
        paper_data.pdf_url = target
    elif section == "papers" and year <= 2022:
        paper_data.pdf_url = target
        print(f"Currently unable to infer abs_url from pdf_url for ECCV {year}")
    else:
        # f-string so the offending URL actually appears in the message.
        raise Exception(f"Unexpected ECVA URL: {target}")

    return paper_data


###############################################################################
### NeurIPS Helper Functions

Expand Down
1 change: 1 addition & 0 deletions arxiv_dl/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ class PaperData(BaseModel):
abs_url: str = None
pdf_url: str = None
supp_url: str = None
doi_url: str = None
src_website: str = None
download_name: str = None

Expand Down
76 changes: 76 additions & 0 deletions arxiv_dl/scrapers.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,82 @@ def scrape_metadata_cvf(paper_data: PaperData) -> None:
return None


def scrape_metadata_ecva(paper_data: PaperData) -> None:
    """Fill ``paper_data`` with metadata scraped from its ECVA abstract page.

    Downloads ``paper_data.abs_url`` and reads the title, authors and
    abstract from the ``papertitle``, ``authors`` and ``abstract`` divs.
    When the page contains exactly one link labelled ``pdf``, ``DOI`` or
    ``supplementary material``, the matching ``pdf_url`` / ``doi_url`` /
    ``supp_url`` is recorded as well. Finally ``download_name`` is built
    from the year, venue, paper id and normalized title.

    Args:
        paper_data: record updated in place. ``abs_url``, ``year``,
            ``paper_venue`` and ``paper_id`` are read from it.

    Raises:
        Exception: if the page does not answer with HTTP 200, if one of the
            title / authors / abstract divs is missing, or if the title or
            authors div contains no text.
    """
    # TODO
    logger.setLevel(logging.DEBUG)
    logger.debug("[Processing] Retrieving paper metadata...")
    logger.setLevel(logging.WARNING)

    # NOTE(review): no timeout is passed, so a stalled server blocks this
    # call indefinitely; consider adding one together with the other scrapers.
    response = requests.get(paper_data.abs_url)
    if response.status_code != 200:
        logger.error(f"Cannot connect to {paper_data.abs_url}")
        raise Exception(f"Cannot connect to {paper_data.abs_url}")
    # make soup
    soup = BeautifulSoup(response.text, "html.parser")

    site_root = "https://www.ecva.net/"
    relative_root = "../../../../"  # how the page refers to the site root

    def _div_strings(div_id: str) -> list:
        """Return the stripped text of each direct child of ``<div id=...>``."""
        div = soup.find("div", id=div_id)
        if div is None:
            # find() returns None when nothing matches; fail with a message
            # that names the missing section instead of an opaque TypeError.
            raise Exception(
                f"Cannot find '{div_id}' section on {paper_data.abs_url}"
            )
        return [child.string.strip() for child in div if child.string]

    def _first_string(div_id: str) -> str:
        """Return the first text chunk of a div, which must not be empty."""
        chunks = _div_strings(div_id)
        if not chunks:
            raise Exception(
                f"Empty '{div_id}' section on {paper_data.abs_url}"
            )
        return chunks[0].strip()  # NOTE: hardcoded

    def _unique_href(label: str):
        """Return the href of the only ``<a>`` labelled ``label``, else None."""
        anchors = soup.find_all("a", string=label)
        if len(anchors) != 1:
            # TODO: check here
            return None
        # .get() yields None for an anchor without an href attribute.
        return anchors[0].get("href")

    # get TITLE
    paper_data.title = _first_string("papertitle")

    # get AUTHORS
    authors_str = _first_string("authors")
    paper_data.authors = [
        name.strip(" *") for name in authors_str.split(",") if name
    ]

    # get ABSTRACT
    paper_abstract = "".join(_div_strings("abstract"))
    paper_data.abstract = paper_abstract.strip(' "')

    # get pdf path
    pdf_url = _unique_href("pdf")
    if pdf_url and pdf_url.startswith(relative_root):
        paper_data.pdf_url = pdf_url.replace(relative_root, site_root)

    # get doi url
    doi_url = _unique_href("DOI")
    if doi_url and doi_url.startswith("https"):
        paper_data.doi_url = doi_url

    # get supplementary path
    supp_url = _unique_href("supplementary material")
    if supp_url and supp_url.startswith(relative_root):
        paper_data.supp_url = supp_url.replace(relative_root, site_root)

    # construct filename
    paper_data.download_name = f"{paper_data.year}_{paper_data.paper_venue}_{paper_data.paper_id}_{normalize_paper_title(paper_data.title)}.pdf"

    return None


def scrape_metadata_nips(paper_data: PaperData) -> None:
# TODO
...
Expand Down
2 changes: 0 additions & 2 deletions tests/test_process_cvf_target.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@


class TestProcessCVFTarget(unittest.TestCase):

################################################################
# 2024
# content/venue
Expand Down Expand Up @@ -450,7 +449,6 @@ def test_CVPR2018(self):
self.assertEqual(paper_data.year, 2018)

def test_CVPR2018W(self):

abs_url = "https://openaccess.thecvf.com/content_cvpr_2018_workshops/w3/html/Naphade_The_2018_NVIDIA_CVPR_2018_paper.html"
pdf_url = "https://openaccess.thecvf.com/content_cvpr_2018_workshops/papers/w3/Naphade_The_2018_NVIDIA_CVPR_2018_paper.pdf"
paper_data = process_cvf_target(abs_url)
Expand Down
Loading

0 comments on commit 873c9c9

Please sign in to comment.