diff --git a/arxiv_dl/arxiv_dl.py b/arxiv_dl/arxiv_dl.py
index c7aa001..7983b07 100644
--- a/arxiv_dl/arxiv_dl.py
+++ b/arxiv_dl/arxiv_dl.py
@@ -11,6 +11,7 @@
     get_download_dest,
     process_arxiv_target,
     process_cvf_target,
+    process_ecva_target,
     process_nips_target,
     process_openreview_target,
 )
@@ -19,6 +20,7 @@
 from .scrapers import (
     scrape_metadata_arxiv,
     scrape_metadata_cvf,
+    scrape_metadata_ecva,
     scrape_metadata_nips,
     scrape_metadata_openreview,
 )
@@ -76,6 +78,8 @@ def main(
         paper_data: PaperData = process_nips_target(target)
     elif "openreview.net" in target:  # assume target is an OpenReview URL
         paper_data: PaperData = process_openreview_target(target)
+    elif "ecva.net" in target:  # assume target is an ECCV URL
+        paper_data: PaperData = process_ecva_target(target)
     elif target.endswith(".pdf"):  # assume target is a PDF file
         # TODO: download the pdf file only
         ...
@@ -88,18 +92,24 @@ def main(
 
     # start scraping from source website
     try:
-        if paper_data.src_website == "ArXiv":
-            scrape_metadata_arxiv(paper_data)
-        elif paper_data.src_website == "CVF":
-            scrape_metadata_cvf(paper_data)
-        elif paper_data.src_website == "NeurIPS":
-            scrape_metadata_nips(paper_data)
-        elif paper_data.src_website == "OpenReview":
-            scrape_metadata_openreview(paper_data)
+        if paper_data.abs_url:
+            if paper_data.src_website == "ArXiv":
+                scrape_metadata_arxiv(paper_data)
+            elif paper_data.src_website == "CVF":
+                scrape_metadata_cvf(paper_data)
+            elif paper_data.src_website == "ECVA":
+                scrape_metadata_ecva(paper_data)
+            elif paper_data.src_website == "NeurIPS":
+                scrape_metadata_nips(paper_data)
+            elif paper_data.src_website == "OpenReview":
+                scrape_metadata_openreview(paper_data)
+            else:
+                # TODO: check here
+                logger.error(f"Invalid source website: '{paper_data.src_website}'")
+                return False
         else:
-            # TODO: check here
-            logger.error(f"Invalid source website: '{paper_data.src_website}'")
-            return False
+            # TODO: think how to handle this; maybe do nothing
+            logger.warning("[Warn] No abstract URL")
     except Exception as err:
         logger.exception(err)
         logger.error("[Abort] Error while getting paper")
@@ -112,9 +122,13 @@ def main(
 
     # download paper
     try:
-        download_pdf(
-            paper_data, download_dir=download_dir, parallel_connections=n_threads
-        )
+        if paper_data.pdf_url:
+            download_pdf(
+                paper_data, download_dir=download_dir, parallel_connections=n_threads
+            )
+        else:
+            # TODO: think how to handle this; maybe improve error message
+            logger.warning("[Warn] No PDF URL found")
     except Exception as err:
         logger.exception(err)
         logger.error("[Abort] Error while downloading paper")
@@ -143,6 +157,7 @@ def main(
 if __name__ == "__main__":
     root_dir = Path(__file__).resolve().parent.parent
     tmp_dir = root_dir / "tmp"
+    tmp_dir.mkdir(exist_ok=True)
 
     from puts import timeitprint
 
diff --git a/arxiv_dl/dl_utils.py b/arxiv_dl/dl_utils.py
index f1603ba..e4aa0a6 100644
--- a/arxiv_dl/dl_utils.py
+++ b/arxiv_dl/dl_utils.py
@@ -25,6 +25,7 @@
 
 # --- workarounds for Python misbehavior ---
 
+
 # enable passing unicode arguments from command line in Python 2.x
 # https://stackoverflow.com/questions/846850/read-unicode-characters
 def win32_utf8_argv():
@@ -446,7 +447,6 @@ def bar_adaptive(current, total, width=80):
     # render
     output = ""
     for field in selected:
-
         if field == "percent":
             # fixed size width for percentage
             output += ("%s%%" % (100 * current // total)).rjust(min_width["percent"])
@@ -468,6 +468,8 @@ def bar_adaptive(current, total, width=80):
 
 
 __current_size = 0  # global state variable, which exists solely as a
+
+
 # workaround against Python 3.3.0 regression
 # http://bugs.python.org/issue16409
 # fixed in Python 3.3.1
diff --git a/arxiv_dl/helpers.py b/arxiv_dl/helpers.py
index f0eddf5..4bc8c53 100644
--- a/arxiv_dl/helpers.py
+++ b/arxiv_dl/helpers.py
@@ -515,6 +515,83 @@ def process_cvf_target(target: str) -> PaperData:
     )
 
 
+###############################################################################
+### ECVA (ECCV) Helper Functions
+
+
+def process_ecva_target(target: str) -> PaperData:
+    """Build a PaperData record from an ECVA (ECCV) abstract-page or PDF URL.
+
+    The URL must contain "www.ecva.net". Depending on the year and on whether
+    an html or pdf link is given, abs_url or pdf_url may be left unset.
+
+    Raises Exception when the URL matches none of the handled layouts.
+    """
+    # https://www.ecva.net/papers/eccv_2024/papers_ECCV/html/6863_ECCV_2024_paper.php
+    # https://www.ecva.net/papers/eccv_2024/papers_ECCV/papers/06863.pdf
+    assert "www.ecva.net" in target
+    paper_data = PaperData(
+        src_website="ECVA",
+        paper_venue="ECCV",
+    )
+
+    tokens = target.split("/")
+    _start = tokens.index("www.ecva.net")
+    tokens = tokens[_start:]
+    """
+    0 www.ecva.net
+    1 papers
+    2 eccv_2024
+    3 papers_ECCV
+    4 html / papers
+    5 xxxxxxxxxxxx.php
+    """
+    year = int(tokens[2].split("_")[1])
+    paper_data.year = year
+
+    if tokens[4] == "html" and year >= 2024:
+        assert tokens[5].endswith(".php")
+        paper_id: str = tokens[5].split("_")[0]
+        # pad paper_id with zeros
+        paper_id: str = paper_id.zfill(5)
+        paper_data.paper_id = paper_id
+        paper_data.abs_url = target
+        paper_data.pdf_url = (
+            f"https://www.ecva.net/papers/eccv_{year}/papers_ECCV/papers/{paper_id}.pdf"
+        )
+    elif tokens[4] == "html" and year == 2018:
+        assert tokens[5].endswith(".php")
+        paper_id: str = tokens[5][:-4]  # remove ".php"
+        paper_data.paper_id = paper_id
+        paper_data.abs_url = target
+        paper_data.pdf_url = (
+            f"https://www.ecva.net/papers/eccv_{year}/papers_ECCV/papers/{paper_id}.pdf"
+        )
+    elif tokens[4] == "html" and year <= 2022:
+        assert tokens[5].endswith(".php")
+        paper_id: str = tokens[5].split("_")[0]
+        # pad paper_id with zeros
+        paper_id: str = paper_id.zfill(5)
+        paper_data.paper_id = paper_id
+        paper_data.abs_url = target
+        # unable to infer pdf_url from abs_url for ECCV 2022 and 2020
+    elif tokens[4] == "papers" and year >= 2024:
+        assert tokens[5].endswith(".pdf")
+        paper_id: str = tokens[5].split(".")[0]
+        paper_data.paper_id = paper_id
+        # remove leading zeros
+        paper_id: str = paper_id.lstrip("0")
+        paper_data.abs_url = f"https://www.ecva.net/papers/eccv_{year}/papers_ECCV/html/{paper_id}_ECCV_{year}_paper.php"
+        paper_data.pdf_url = target
+    elif tokens[4] == "papers" and year <= 2022:
+        paper_data.pdf_url = target
+        print(f"Currently unable to infer abs_url from pdf_url for ECCV {year}")
+    else:
+        raise Exception(f"Unexpected ECVA URL: {target}")
+
+    return paper_data
+
+
 ###############################################################################
 ### NeurIPS Helper Functions
 
diff --git a/arxiv_dl/models.py b/arxiv_dl/models.py
index 86b487b..503655c 100644
--- a/arxiv_dl/models.py
+++ b/arxiv_dl/models.py
@@ -9,6 +9,7 @@ class PaperData(BaseModel):
     abs_url: str = None
     pdf_url: str = None
     supp_url: str = None
+    doi_url: str = None
     src_website: str = None
     download_name: str = None
 
diff --git a/arxiv_dl/scrapers.py b/arxiv_dl/scrapers.py
index 230db58..6ffd089 100644
--- a/arxiv_dl/scrapers.py
+++ b/arxiv_dl/scrapers.py
@@ -142,6 +142,89 @@ def scrape_metadata_cvf(paper_data: PaperData) -> None:
     return None
 
 
+def scrape_metadata_ecva(paper_data: PaperData) -> None:
+    """Populate paper_data with metadata scraped from its ECVA abstract page.
+
+    Sets title, authors, abstract and download_name; pdf_url, doi_url and
+    supp_url are set only when a single link of the expected form is found.
+
+    Raises Exception when the abstract page does not answer with HTTP 200.
+    """
+    # TODO
+    logger.setLevel(logging.DEBUG)
+    logger.debug("[Processing] Retrieving paper metadata...")
+    logger.setLevel(logging.WARNING)
+
+    response = requests.get(paper_data.abs_url, timeout=30)
+    if response.status_code != 200:
+        logger.error(f"Cannot connect to {paper_data.abs_url}")
+        raise Exception(f"Cannot connect to {paper_data.abs_url}")
+    # make soup
+    soup = BeautifulSoup(response.text, "html.parser")
+
+    # get TITLE
+    result = soup.find("div", id="papertitle")
+    tmp = [i.string.strip() for i in result if i.string]
+    paper_title = tmp[0].strip()  # NOTE: hardcoded
+    paper_data.title = paper_title
+    # print(paper_title)
+
+    # get AUTHORS
+    result = soup.find("div", id="authors")
+    tmp = [i.string.strip() for i in result if i.string]
+    authors_str = tmp[0].strip()  # NOTE: hardcoded
+    authors_list = [x.strip(" *") for x in authors_str.split(",") if x]
+    paper_data.authors = authors_list
+    # print(authors_list)
+
+    # get ABSTRACT
+    result = soup.find("div", id="abstract")
+    tmp = [i.string.strip() for i in result if i.string]
+    paper_abstract = "".join(tmp)
+    paper_data.abstract = paper_abstract.strip(' "')
+    # print(paper_abstract)
+
+    # get pdf path
+    result = soup.find_all("a", string="pdf")
+    if len(result) == 1:
+        pdf_url = result[0].get("href")
+        if pdf_url.startswith("../../../../"):
+            pdf_url = pdf_url.replace("../../../../", "https://www.ecva.net/")
+            paper_data.pdf_url = pdf_url
+        else:
+            # TODO: check here
+            # print("Unexpected pdf_url:", pdf_url)
+            pass
+
+    # get doi url
+    result = soup.find_all("a", string="DOI")
+    if len(result) == 1:
+        doi_url = result[0].get("href")
+        if doi_url.startswith("https"):
+            paper_data.doi_url = doi_url
+        else:
+            # TODO: check here
+            # print("Unexpected doi_url:", doi_url)
+            pass
+
+    # get supplementary path
+    result = soup.find_all("a", string="supplementary material")
+    if len(result) == 1:
+        supp_url = result[0].get("href")
+        if supp_url.startswith("../../../../"):
+            supp_url = supp_url.replace("../../../../", "https://www.ecva.net/")
+            paper_data.supp_url = supp_url
+        else:
+            # TODO: check here
+            # print("Unexpected supp_url:", supp_url)
+            pass
+
+    # construct filename
+    paper_data.download_name = f"{paper_data.year}_{paper_data.paper_venue}_{paper_data.paper_id}_{normalize_paper_title(paper_data.title)}.pdf"
+
+    return None
+
+
 def scrape_metadata_nips(paper_data: PaperData) -> None:
     # TODO
     ...
diff --git a/tests/test_process_cvf_target.py b/tests/test_process_cvf_target.py
index b0dfcaf..8a5980e 100644
--- a/tests/test_process_cvf_target.py
+++ b/tests/test_process_cvf_target.py
@@ -11,7 +11,6 @@
 
 
 class TestProcessCVFTarget(unittest.TestCase):
-
     ################################################################
     # 2024
     # content/venue
@@ -450,7 +449,6 @@ def test_CVPR2018(self):
         self.assertEqual(paper_data.year, 2018)
 
     def test_CVPR2018W(self):
-
         abs_url = "https://openaccess.thecvf.com/content_cvpr_2018_workshops/w3/html/Naphade_The_2018_NVIDIA_CVPR_2018_paper.html"
         pdf_url = "https://openaccess.thecvf.com/content_cvpr_2018_workshops/papers/w3/Naphade_The_2018_NVIDIA_CVPR_2018_paper.pdf"
         paper_data = process_cvf_target(abs_url)
diff --git a/tests/test_scrape_ecva_target.py b/tests/test_scrape_ecva_target.py
new file mode 100644
index 0000000..f45284a
--- /dev/null
+++ b/tests/test_scrape_ecva_target.py
@@ -0,0 +1,113 @@
+import sys
+import unittest
+from pathlib import Path
+
+from arxiv_dl.helpers import process_ecva_target
+from arxiv_dl.models import PaperData
+from arxiv_dl.scrapers import scrape_metadata_ecva
+
+root_dir = Path(__file__).resolve().parent.parent
+
+sys.path.insert(0, str(root_dir))
+
+
+class TestScrapeECVA(unittest.TestCase):
+    def test_ECCV2024(self):
+        abs_url = "https://www.ecva.net/papers/eccv_2024/papers_ECCV/html/4_ECCV_2024_paper.php"
+        pdf_url = "https://www.ecva.net/papers/eccv_2024/papers_ECCV/papers/00004.pdf"
+        paper_title = "Is Retain Set All You Need in Machine Unlearning? Restoring Performance of Unlearned Models with Out-Of-Distribution Images"
+        paper_data = process_ecva_target(abs_url)
+        scrape_metadata_ecva(paper_data)
+        self.assertTrue(isinstance(paper_data, PaperData))
+        self.assertEqual(paper_data.abs_url, abs_url)
+        self.assertEqual(paper_data.pdf_url, pdf_url)
+        self.assertEqual(paper_data.src_website, "ECVA")
+        self.assertEqual(paper_data.title, paper_title)
+        self.assertEqual(paper_data.year, 2024)
+        self.assertEqual(paper_data.paper_venue, "ECCV")
+        self.assertEqual(
+            paper_data.authors,
+            [
+                "Jacopo Bonato",
+                "Marco Cotogni",
+                "Luigi Sabetta",
+            ],
+        )
+
+    def test_ECCV2022(self):
+        abs_url = "https://www.ecva.net/papers/eccv_2022/papers_ECCV/html/19_ECCV_2022_paper.php"
+        pdf_url = (
+            "https://www.ecva.net/papers/eccv_2022/papers_ECCV/papers/136610001.pdf"
+        )
+        paper_title = "Learning Depth from Focus in the Wild"
+        paper_data = process_ecva_target(abs_url)
+        scrape_metadata_ecva(paper_data)
+        self.assertTrue(isinstance(paper_data, PaperData))
+        self.assertEqual(paper_data.abs_url, abs_url)
+        self.assertEqual(paper_data.pdf_url, pdf_url)
+        self.assertEqual(paper_data.src_website, "ECVA")
+        self.assertEqual(paper_data.title, paper_title)
+        self.assertEqual(paper_data.year, 2022)
+        self.assertEqual(paper_data.paper_venue, "ECCV")
+        self.assertEqual(
+            paper_data.authors,
+            [
+                "Changyeon Won",
+                "Hae-Gon Jeon",
+            ],
+        )
+
+    def test_ECCV2020(self):
+        abs_url = "https://www.ecva.net/papers/eccv_2020/papers_ECCV/html/267_ECCV_2020_paper.php"
+        pdf_url = (
+            "https://www.ecva.net/papers/eccv_2020/papers_ECCV/papers/123460001.pdf"
+        )
+        paper_title = "Quaternion Equivariant Capsule Networks for 3D Point Clouds"
+        paper_data = process_ecva_target(abs_url)
+        scrape_metadata_ecva(paper_data)
+        self.assertTrue(isinstance(paper_data, PaperData))
+        self.assertEqual(paper_data.abs_url, abs_url)
+        self.assertEqual(paper_data.pdf_url, pdf_url)
+        self.assertEqual(paper_data.src_website, "ECVA")
+        self.assertEqual(paper_data.title, paper_title)
+        self.assertEqual(paper_data.year, 2020)
+        self.assertEqual(paper_data.paper_venue, "ECCV")
+        self.assertEqual(
+            paper_data.authors,
+            [
+                "Yongheng Zhao",
+                "Tolga Birdal",
+                "Jan Eric Lenssen",
+                "Emanuele Menegatti",
+                "Leonidas Guibas",
+                "Federico Tombari",
+            ],
+        )
+
+    def test_ECCV2018(self):
+        abs_url = "https://www.ecva.net/papers/eccv_2018/papers_ECCV/html/Yonggen_Ling_Modeling_Varying_Camera-IMU_ECCV_2018_paper.php"
+        pdf_url = "https://www.ecva.net/papers/eccv_2018/papers_ECCV/papers/Yonggen_Ling_Modeling_Varying_Camera-IMU_ECCV_2018_paper.pdf"
+        paper_title = "Modeling Varying Camera-IMU Time Offset in Optimization-Based Visual-Inertial Odometry"
+        paper_data = process_ecva_target(abs_url)
+        scrape_metadata_ecva(paper_data)
+        self.assertTrue(isinstance(paper_data, PaperData))
+        self.assertEqual(paper_data.abs_url, abs_url)
+        self.assertEqual(paper_data.pdf_url, pdf_url)
+        self.assertEqual(paper_data.src_website, "ECVA")
+        self.assertEqual(paper_data.title, paper_title)
+        self.assertEqual(paper_data.year, 2018)
+        self.assertEqual(paper_data.paper_venue, "ECCV")
+        self.assertEqual(
+            paper_data.authors,
+            [
+                "Yonggen Ling",
+                "Linchao Bao",
+                "Zequn Jie",
+                "Fengming Zhu",
+                "Ziyang Li",
+                "Shanmin Tang",
+                "Yongsheng Liu",
+                "Wei Liu",
+                "Tong Zhang",
+            ],
+        )