From 3b3e78d901a600bb22943202c6a8981ca04a5e48 Mon Sep 17 00:00:00 2001
From: ArztKlein <52363453+ArztKlein@users.noreply.github.com>
Date: Thu, 17 Nov 2022 15:28:46 +1300
Subject: [PATCH] Before and After methods (#175)

* Added before and after functions

* add tests

* formatting
---
 tests/test_cdx_api.py | 36 +++++++++++++++++++
 waybackpy/cdx_api.py  | 82 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 118 insertions(+)

diff --git a/tests/test_cdx_api.py b/tests/test_cdx_api.py
index ba2db5a..9748b9f 100644
--- a/tests/test_cdx_api.py
+++ b/tests/test_cdx_api.py
@@ -176,3 +176,39 @@ def test_near() -> None:
         filters=["statuscode:200"],
     )
     cdx.near(unix_timestamp=1286705410)
+
+
+def test_before() -> None:
+    """before() should yield the google.com capture stamped 20160731233347."""
+    user_agent = (
+        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
+        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    )
+    api = WaybackMachineCDXServerAPI(
+        url="http://www.google.com/",
+        user_agent=user_agent,
+        filters=["statuscode:200"],
+    )
+    snapshot = api.before(wayback_machine_timestamp=20160731235949)
+    assert "20160731233347" in snapshot.timestamp
+    assert "google" in snapshot.urlkey
+    assert "google.com" in snapshot.original
+    assert "google.com" in snapshot.archive_url
+
+
+def test_after() -> None:
+    """after() should yield the google.com capture stamped 20160801000917."""
+    user_agent = (
+        "Mozilla/5.0 (MacBook Air; M1 Mac OS X 11_4) "
+        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/604.1"
+    )
+    api = WaybackMachineCDXServerAPI(
+        url="http://www.google.com/",
+        user_agent=user_agent,
+        filters=["statuscode:200"],
+    )
+    snapshot = api.after(wayback_machine_timestamp=20160731235949)
+    assert "20160801000917" in snapshot.timestamp, snapshot.timestamp
+    assert "google" in snapshot.urlkey
+    assert "google.com" in snapshot.original
+    assert "google.com" in snapshot.archive_url
diff --git a/waybackpy/cdx_api.py b/waybackpy/cdx_api.py
index 6347ad6..4bd8291 100644
--- a/waybackpy/cdx_api.py
+++ b/waybackpy/cdx_api.py
@@ -191,6 +191,88 @@ def add_payload(self, payload: 
Dict[str, str]) -> None:
 
         payload["url"] = self.url
 
+    def before(
+        self,
+        year: Optional[int] = None,
+        month: Optional[int] = None,
+        day: Optional[int] = None,
+        hour: Optional[int] = None,
+        minute: Optional[int] = None,
+        unix_timestamp: Optional[int] = None,
+        wayback_machine_timestamp: Optional[Union[int, str]] = None,
+    ) -> CDXSnapshot:
+        """
+        Gets the nearest archive before the given datetime.
+
+        Raises NoCDXRecordFound when no earlier snapshot is found.
+        """
+        if unix_timestamp:  # takes precedence over all other arguments
+            timestamp = unix_timestamp_to_wayback_timestamp(unix_timestamp)
+        elif wayback_machine_timestamp:
+            timestamp = str(wayback_machine_timestamp)
+        else:  # unspecified date parts fall back to the current UTC time
+            now = datetime.utcnow().timetuple()
+            timestamp = wayback_timestamp(
+                year=now.tm_year if year is None else year,
+                month=now.tm_mon if month is None else month,
+                day=now.tm_mday if day is None else day,
+                hour=now.tm_hour if hour is None else hour,
+                minute=now.tm_min if minute is None else minute,
+            )
+        self.closest = timestamp  # NOTE: overwrites closest/sort/limit on self
+        self.sort = "closest"
+        self.limit = 25000
+        for snapshot in self.snapshots():
+            if snapshot.timestamp < timestamp:  # first earlier snapshot wins
+                return snapshot
+        raise NoCDXRecordFound(
+            "No records were found before the given date for the query. "
+            + "Either there are no archives before the given date,"
+            + " the URL may not have any archives, or the URL may have been"
+            + " recently archived and is still not available on the CDX server."
+        )
+
+    def after(
+        self,
+        year: Optional[int] = None,
+        month: Optional[int] = None,
+        day: Optional[int] = None,
+        hour: Optional[int] = None,
+        minute: Optional[int] = None,
+        unix_timestamp: Optional[int] = None,
+        wayback_machine_timestamp: Optional[Union[int, str]] = None,
+    ) -> CDXSnapshot:
+        """
+        Gets the nearest archive after the given datetime.
+
+        Raises NoCDXRecordFound when no later snapshot is found.
+        """
+        if unix_timestamp:  # takes precedence over all other arguments
+            timestamp = unix_timestamp_to_wayback_timestamp(unix_timestamp)
+        elif wayback_machine_timestamp:
+            timestamp = str(wayback_machine_timestamp)
+        else:  # unspecified date parts fall back to the current UTC time
+            now = datetime.utcnow().timetuple()
+            timestamp = wayback_timestamp(
+                year=now.tm_year if year is None else year,
+                month=now.tm_mon if month is None else month,
+                day=now.tm_mday if day is None else day,
+                hour=now.tm_hour if hour is None else hour,
+                minute=now.tm_min if minute is None else minute,
+            )
+        self.closest = timestamp  # NOTE: overwrites closest/sort/limit on self
+        self.sort = "closest"
+        self.limit = 25000
+        for snapshot in self.snapshots():
+            if snapshot.timestamp > timestamp:  # first later snapshot wins
+                return snapshot
+        raise NoCDXRecordFound(
+            "No records were found after the given date for the query. "
+            + "Either there are no archives after the given date,"
+            + " the URL may not have any archives, or the URL may have been"
+            + " recently archived and is still not available on the CDX server."
+        )
+
     def near(
         self,
         year: Optional[int] = None,