From 32688bbbfb5189be086bb1c1cb3744b0b7a55a97 Mon Sep 17 00:00:00 2001 From: Daniel Simmons-Ritchie <37225902+SimmonsRitchie@users.noreply.github.com> Date: Fri, 10 May 2024 13:36:48 -0500 Subject: [PATCH] Fix spider: atl_boe --- city_scrapers/spiders/atl_boe.py | 194 +- tests/files/atl_boe.html | 11393 ----------------------------- tests/files/atl_boe.json | 1 + tests/files/atl_boe_236419.html | 322 - tests/test_atl_boe.py | 68 +- 5 files changed, 97 insertions(+), 11881 deletions(-) delete mode 100644 tests/files/atl_boe.html create mode 100644 tests/files/atl_boe.json delete mode 100644 tests/files/atl_boe_236419.html diff --git a/city_scrapers/spiders/atl_boe.py b/city_scrapers/spiders/atl_boe.py index 4d6f7f9..4858395 100644 --- a/city_scrapers/spiders/atl_boe.py +++ b/city_scrapers/spiders/atl_boe.py @@ -1,6 +1,7 @@ from datetime import datetime, timedelta -from city_scrapers_core.constants import ADVISORY_COMMITTEE, BOARD, COMMISSION +import scrapy +from city_scrapers_core.constants import BOARD, COMMISSION, COMMITTEE from city_scrapers_core.items import Meeting from city_scrapers_core.spiders import CityScrapersSpider @@ -9,129 +10,88 @@ class AtlBoeSpider(CityScrapersSpider): name = "atl_boe" agency = "Atlanta Board of Education" timezone = "America/New_York" - start_urls = ["https://www.atlantapublicschools.us/apsboard"] - custom_settings = {"COOKIES_ENABLED": True} - weekdays = ( - "monday", - "tuesday", - "wednesday", - "thursday", - "friday", - "saturday", - "sunday", - ) + start_urls = [ + "https://www.atlantapublicschools.us/Generator/TokenGenerator.ashx/ProcessRequest" # noqa + ] + calendar_base_url = "https://awsapieast1-prod23.schoolwires.com/REST/api/v4/CalendarEvents/GetEvents/17299" # noqa + default_location = { + "name": "Atlanta Public Schools", + "address": "130 Trinity Ave SW, Atlanta, GA 30303", + } + default_links = [ + { + "title": "Atlanta BOE Facebook page", + "href": "https://www.facebook.com/apsboard/", + } + ] def parse(self, response): - for item in response.css(".ui-article"): - if not item.css(".sw-calendar-block-date"): - continue + """ + Get Bearer token from server's token generation endpoint + and then make request to calendar API + """ + data = response.json() + token = data.get("Token") + if not token: + self.logger.error("No token found") + return + + # Gen dates from 1 month ago to 3 months from today + start_date = datetime.now() - timedelta(days=30) + start_date_fmtd = start_date.strftime("%Y-%m-%d") + end_date = datetime.now() + timedelta(days=90) + end_date_fmtd = end_date.strftime("%Y-%m-%d") + + yield scrapy.Request( + f"{self.calendar_base_url}?StartDate={start_date_fmtd}&EndDate={end_date_fmtd}&ModuleInstanceFilter=&CategoryFilter=&IsDBStreamAndShowAll=true", # noqa + callback=self.parse_events, + headers={ + "Authorization": f"Bearer {token}", + "Accept": "application/json", + # The headers below are not strictly necessary but provided to + # reduce the likelihood of being blocked by the server + "Referer": "https://www.atlantapublicschools.us/", + "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36", # noqa + }, + ) + + def parse_events(self, response): + """ + Parse events from JSON response + """ + try: + events = response.json() + except ValueError: + self.logger.error("Failed to parse JSON from response") + return + + for event in events: + all_day = event.get("AllDay", False) meeting = Meeting( - title=self._parse_title(item), + title=event["Title"].strip(), description="", - classification=self._parse_classification(item), - start=self._parse_start(item), - end=self._parse_end(item), - all_day=False, - time_notes="", - location=self._parse_location(item), - links=self._parse_links(item), - source=self._parse_source(response), + classification=self._get_classification(event["Title"]), + start=self._parse_datetime(event["Start"]), + end=self._parse_datetime(event["End"]) if not all_day else None, + all_day=event["AllDay"], + time_notes=None, + location=self.default_location, + links=self.default_links, + source=response.url, ) - meeting["status"] = self._get_status(meeting) meeting["id"] = self._get_id(meeting) + yield meeting - yield response.follow( - meeting["links"][0]["href"], - callback=self._follow_meeting_link, - meta={"meeting": meeting}, - dont_filter=True, - ) - - def _follow_meeting_link(self, response): - """we only need to navigate this page in order to get - the necessary cookies to interact with EventDetailWrapper.aspx, - which is processed by _parse_meeting_details""" - event_date_id = response.request.url.split("/")[-1] - follow_link = ( - "https://www.atlantapublicschools.us/site/UserControls/Calendar" - "/EventDetailWrapper.aspx?ModuleInstanceID=17299" - f"&EventDateID={event_date_id}&UserRegID=0&IsEdit=false" - ) - yield response.follow( - follow_link, - callback=self._parse_meeting_details, - meta=response.meta, - dont_filter=True, - ) - - def _parse_meeting_details(self, response): - """parse the meeting details page to scrape - a meeting notice if there is one""" - meeting = response.meta["meeting"] - meeting_notice = response.css( - "#cal-ed-description-body > p > a::attr(href)" - ).get() - if meeting_notice: - meeting_notice = response.urljoin(meeting_notice) - meeting["links"].append({"href": meeting_notice, "title": "Meeting notice"}) - yield meeting - - def _parse_title(self, item): - return item.css(".sw-calendar-block-title > a::text").get() + def _parse_datetime(self, datetime_str): + """Convert ISO formatted date and time string to a datetime object.""" + return datetime.fromisoformat(datetime_str) - def _parse_classification(self, item): - if "committee" in self._parse_title(item).lower(): - return ADVISORY_COMMITTEE - elif "commission" in self._parse_title(item).lower(): - return COMMISSION - else: + def _get_classification(self, title): + """Determine the classification based on the title or other fields.""" + clean_title = title.lower() + if "board" in clean_title: return BOARD - - def _parse_start(self, item): - date = self._parse_date(item) - time = self._parse_time(item, 0) - return datetime(date.year, date.month, date.day, time.hour, time.minute) - - def _parse_end(self, item): - date = self._parse_date(item) - time = self._parse_time(item, 1) - return datetime(date.year, date.month, date.day, time.hour, time.minute) - - def _parse_location(self, item): - return { - "address": "130 Trinity Avenue, Atlanta, GA 30303", - "name": "Center for Learning and Leadership", - } - - def _parse_links(self, item): - """only parses meeting details link. the meeting notice, - if it exists, is scraped from the meeting link""" - href = item.css(".sw-calendar-block-title > a::attr(href)").get().strip() - return [{"href": href, "title": "Meeting details"}] - - def _parse_source(self, response): - """Parse or generate source.""" - return response.url - - def _parse_date(self, item): - date_str = item.css(".sw-calendar-block-date::text").get() - if date_str.lower() in self.weekdays: - return self._get_next_weekday_date(date_str.lower()) - elif date_str.lower() == "today": - return datetime.today() - else: - return datetime.strptime(date_str, "%B %d, %Y") - - def _parse_time(self, item, index): - time_str = ( - item.css(".sw-calendar-block-time::text").get().split("-")[index].strip() - ) - return datetime.strptime(time_str, "%I:%M %p") - - def _get_next_weekday_date(self, weekday): - today = datetime.today() - today_weekday = today.weekday() - weekday_num = self.weekdays.index(weekday) - delta = (weekday_num - today_weekday) % 7 or 7 - return today + timedelta(days=delta) + elif "committee" in clean_title: + return COMMITTEE + return COMMISSION diff --git a/tests/files/atl_boe.html b/tests/files/atl_boe.html deleted file mode 100644 index b5fa88a..0000000 --- a/tests/files/atl_boe.html +++ /dev/null @@ -1,11393 +0,0 @@ - - - - - Atlanta Board of Education / Overview - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - -
- -
- - - - - - - - - - - - - - - - -
-
-
- - -
-
- - -
-
-
-
- -
-
-
-
- -
-
- -
-
-
-
-
-
- -
-
-
-

- Atlanta Board of Education -

-

- Eshé P. Collins, Chair -

-
-
-
-
-
-
- - -
-
-
-
-
-
-
-
- - - -
- - - - -
- - - - - - - -
- -
-
-
-
-
- -
-
-
-
-
-
-
-
- - - -
- - - - -
- - - - - - - - - - - - -
- -
-
-
-
-
-
-
-
-
-
-
- Our Commitment -
-
- The Atlanta Board of Education is committed to two-way communication with the public about the organization and operation of the Atlanta Public Schools. This commitment includes keeping the public regularly informed and providing opportunities for the public to interact with the Board and the APS. The Board encourages the public to inquire, learn about, and express a continuing interest in APS operations and to make suggestions for improvements. -
-
-
-
-
-
-
-
-
-
-
-

Upcoming Board Meetings

-
-
- -View Calendar -
- -
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- -
- - - - - - - - -
- - -
- - -
- - - -
- - -
- - -
- - -
- - - -
- - - -
- - -
- - - -
-
- CLOSE -
-
-
-
-
-
- CLOSE -
-
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/tests/files/atl_boe.json b/tests/files/atl_boe.json new file mode 100644 index 0000000..6d76345 --- /dev/null +++ b/tests/files/atl_boe.json @@ -0,0 +1 @@ +[{"Id": 261399, "EventID": 204285, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "Oglethorpe Renaming Committee Meeting", "Start": "2024-04-11T17:30:00", "End": "2024-04-11T19:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 261405, "EventID": 204291, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "District 6 Community Meeting", "Start": "2024-04-17T18:00:00", "End": "2024-04-17T19:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 260671, "EventID": 203587, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "Board Committee Day ", "Start": "2024-04-18T10:00:00", "End": "2024-04-18T16:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 261403, "EventID": 204289, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "Budget Commission Meeting", "Start": "2024-04-18T12:00:00", "End": "2024-04-18T15:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 261404, "EventID": 204290, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "Policy Review Committee Meeting", "Start": "2024-04-18T16:00:00", "End": "2024-04-18T17:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 261406, "EventID": 204292, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "District 1 Virtual Community Meeting", "Start": "2024-04-19T09:00:00", "End": "2024-04-19T10:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 261407, "EventID": 204293, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "District 2 Virtual Community Meeting ", "Start": "2024-04-23T12:00:00", "End": "2024-04-23T13:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 261865, "EventID": 204731, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "BEST Renaming Committee Meeting", "Start": "2024-04-25T17:00:00", "End": "2024-04-25T19:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 261408, "EventID": 204294, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "District 3 Virtual Community ", "Start": "2024-04-26T09:30:00", "End": "2024-04-26T10:30:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 261974, "EventID": 204824, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "Board Retreat - Whole Board Training ", "Start": "2024-05-01T13:00:00", "End": "2024-05-01T18:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 260672, "EventID": 203588, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "Board Meeting and Budget Hearing #1", "Start": "2024-05-06T14:30:00", "End": "2024-05-06T20:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 262088, "EventID": 204932, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "Oglethorpe Renaming Committee Meeting ", "Start": "2024-05-09T17:00:00", "End": "2024-05-09T18:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 260673, "EventID": 203589, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "Board Student Recognition Meeting ", "Start": "2024-05-13T17:00:00", "End": "2024-05-13T20:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 250349, "EventID": 194223, "ModuleInstanceID": 113402, "RecurringEvent": false, "RegisteredUsers": false, "Title": "Last Day of School", "Start": "2024-05-24T00:00:00", "End": "2024-05-24T23:59:00", "CategoryColor": "#0083a9", "CategoryTitle": "District Event", "EventSource": "Atlanta Public Schools Calendars", "RecurringInfo": "TBD", "AllDay": true, "NoEndTime": 0}, {"Id": 250350, "EventID": 194224, "ModuleInstanceID": 113402, "RecurringEvent": false, "RegisteredUsers": false, "Title": "Memorial Day", "Start": "2024-05-27T00:00:00", "End": "2024-05-27T23:59:00", "CategoryColor": "#da7b22", "CategoryTitle": "Holiday", "EventSource": "Atlanta Public Schools Calendars", "RecurringInfo": "TBD", "AllDay": true, "NoEndTime": 0}] \ No newline at end of file diff --git a/tests/files/atl_boe_236419.html b/tests/files/atl_boe_236419.html deleted file mode 100644 index 78a896e..0000000 --- a/tests/files/atl_boe_236419.html +++ /dev/null @@ -1,322 +0,0 @@ - - - - - -
-
-
-

Accountability Commission Meeting

- - -
-
- - - -

Description

- - - - - -
- -

- - -

- -
- -
-
- - -
-
-
-

- -

- -
- -
- - -
-
-
- - diff --git a/tests/test_atl_boe.py b/tests/test_atl_boe.py index 9d51fb6..94c4c00 100644 --- a/tests/test_atl_boe.py +++ b/tests/test_atl_boe.py @@ -1,98 +1,68 @@ from datetime import datetime from os.path import dirname, join -import pytest -from city_scrapers_core.constants import COMMISSION +import pytest # noqa +from city_scrapers_core.constants import COMMITTEE, PASSED from city_scrapers_core.utils import file_response from freezegun import freeze_time from city_scrapers.spiders.atl_boe import AtlBoeSpider test_response = file_response( - join(dirname(__file__), "files", "atl_boe.html"), - url="https://www.atlantapublicschools.us/apsboard", + join(dirname(__file__), "files", "atl_boe.json"), + url="https://awsapieast1-prod23.schoolwires.com/REST/api/v4/CalendarEvents/GetEvents/17299?StartDate=2024-04-10&EndDate=2024-08-08&ModuleInstanceFilter=&CategoryFilter=&IsDBStreamAndShowAll=true", # noqa ) -meeting_response = file_response( - join(dirname(__file__), "files", "atl_boe_236419.html"), - url="https://www.atlantapublicschools.us/site/UserControls/Calendar//EventDetailWrapper.aspx?ModuleInstanceID=17299&EventDateID=236419&UserRegID=0&IsEdit=false", # noqa -) spider = AtlBoeSpider() -freezer = freeze_time("2022-08-26") +freezer = freeze_time("2024-05-10") freezer.start() -requests = [item for item in spider.parse(test_response)] -parsed_items = [] -for request in requests: - if "236419" in request.url: - meeting_response.request = request - parsed_items += [ - item for item in spider._parse_meeting_details(meeting_response) - ] - else: - parsed_items.append(request.meta["meeting"]) - -parsed_items.sort(key=lambda item: item["start"]) +parsed_items = [item for item in spider.parse_events(test_response)] freezer.stop() -def test_length(): - assert len(parsed_items) == 5 - - def test_title(): - assert parsed_items[0]["title"] == "Accountability Commission Meeting" + assert parsed_items[0]["title"] == "Oglethorpe Renaming Committee Meeting" def test_start(): - assert parsed_items[0]["start"] == datetime(2022, 8, 29, 10, 0) + assert parsed_items[0]["start"] == datetime(2024, 4, 11, 17, 30) def test_end(): - assert parsed_items[0]["end"] == datetime(2022, 8, 29, 11, 30) + assert parsed_items[0]["end"] == datetime(2024, 4, 11, 19, 0) def test_id(): assert ( parsed_items[0]["id"] - == "atl_boe/202208291000/x/accountability_commission_meeting" + == "atl_boe/202404111730/x/oglethorpe_renaming_committee_meeting" ) def test_status(): - assert parsed_items[0]["status"] == "tentative" + assert parsed_items[0]["status"] == PASSED def test_location(): assert parsed_items[0]["location"] == { - "name": "Center for Learning and Leadership", - "address": "130 Trinity Avenue, Atlanta, GA 30303", + "name": "Atlanta Public Schools", + "address": "130 Trinity Ave SW, Atlanta, GA 30303", } def test_source(): - assert parsed_items[0]["source"] == "https://www.atlantapublicschools.us/apsboard" + assert ( + parsed_items[0]["source"] + == "https://awsapieast1-prod23.schoolwires.com/REST/api/v4/CalendarEvents/GetEvents/17299?StartDate=2024-04-10&EndDate=2024-08-08&ModuleInstanceFilter=&CategoryFilter=&IsDBStreamAndShowAll=true" # noqa + ) def test_links(): - assert parsed_items[0]["links"] == [ - { - "href": "https://www.atlantapublicschools.us/site/Default.aspx?PageID=17673&DomainID=3944#calendar17299/20220829/event/236419", # noqa - "title": "Meeting details", - }, - { - "href": "https://www.atlantapublicschools.us/cms/lib/GA01000924/Centricity/Domain/3944/Meeting Notice - Accountability Commission Meeting 08292022.pdf", # noqa - "title": "Meeting notice", - }, - ] + assert parsed_items[0]["links"] == AtlBoeSpider.default_links def test_classification(): - assert parsed_items[0]["classification"] == COMMISSION - - -@pytest.mark.parametrize("item", parsed_items) -def test_all_day(item): - assert item["all_day"] is False + assert parsed_items[0]["classification"] == COMMITTEE