From 32688bbbfb5189be086bb1c1cb3744b0b7a55a97 Mon Sep 17 00:00:00 2001
From: Daniel Simmons-Ritchie
<37225902+SimmonsRitchie@users.noreply.github.com>
Date: Fri, 10 May 2024 13:36:48 -0500
Subject: [PATCH] Fix spider: atl_boe
---
city_scrapers/spiders/atl_boe.py | 194 +-
tests/files/atl_boe.html | 11393 -----------------------------
tests/files/atl_boe.json | 1 +
tests/files/atl_boe_236419.html | 322 -
tests/test_atl_boe.py | 68 +-
5 files changed, 97 insertions(+), 11881 deletions(-)
delete mode 100644 tests/files/atl_boe.html
create mode 100644 tests/files/atl_boe.json
delete mode 100644 tests/files/atl_boe_236419.html
diff --git a/city_scrapers/spiders/atl_boe.py b/city_scrapers/spiders/atl_boe.py
index 4d6f7f9..4858395 100644
--- a/city_scrapers/spiders/atl_boe.py
+++ b/city_scrapers/spiders/atl_boe.py
@@ -1,6 +1,7 @@
from datetime import datetime, timedelta
-from city_scrapers_core.constants import ADVISORY_COMMITTEE, BOARD, COMMISSION
+import scrapy
+from city_scrapers_core.constants import BOARD, COMMISSION, COMMITTEE
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider
@@ -9,129 +10,88 @@ class AtlBoeSpider(CityScrapersSpider):
name = "atl_boe"
agency = "Atlanta Board of Education"
timezone = "America/New_York"
- start_urls = ["https://www.atlantapublicschools.us/apsboard"]
- custom_settings = {"COOKIES_ENABLED": True}
- weekdays = (
- "monday",
- "tuesday",
- "wednesday",
- "thursday",
- "friday",
- "saturday",
- "sunday",
- )
+ start_urls = [
+ "https://www.atlantapublicschools.us/Generator/TokenGenerator.ashx/ProcessRequest" # noqa
+ ]
+ calendar_base_url = "https://awsapieast1-prod23.schoolwires.com/REST/api/v4/CalendarEvents/GetEvents/17299" # noqa
+ default_location = {
+ "name": "Atlanta Public Schools",
+ "address": "130 Trinity Ave SW, Atlanta, GA 30303",
+ }
+ default_links = [
+ {
+ "title": "Atlanta BOE Facebook page",
+ "href": "https://www.facebook.com/apsboard/",
+ }
+ ]
def parse(self, response):
- for item in response.css(".ui-article"):
- if not item.css(".sw-calendar-block-date"):
- continue
+ """
+ Get a Bearer token from the server's token generation endpoint
+ and then request events from the calendar API
+ """
+ data = response.json()
+ token = data.get("Token")
+ if not token:
+ self.logger.error("No token found")
+ return
+
+ # Generate the query window: 30 days ago through 90 days from today
+ start_date = datetime.now() - timedelta(days=30)
+ start_date_fmtd = start_date.strftime("%Y-%m-%d")
+ end_date = datetime.now() + timedelta(days=90)
+ end_date_fmtd = end_date.strftime("%Y-%m-%d")
+
+ yield scrapy.Request(
+ f"{self.calendar_base_url}?StartDate={start_date_fmtd}&EndDate={end_date_fmtd}&ModuleInstanceFilter=&CategoryFilter=&IsDBStreamAndShowAll=true", # noqa
+ callback=self.parse_events,
+ headers={
+ "Authorization": f"Bearer {token}",
+ "Accept": "application/json",
+ # The headers below are not strictly necessary but provided to
+ # reduce the likelihood of being blocked by the server
+ "Referer": "https://www.atlantapublicschools.us/",
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36", # noqa
+ },
+ )
+
+ def parse_events(self, response):
+ """
+ Parse events from the JSON response and yield a Meeting for each one
+ """
+ try:
+ events = response.json()
+ except ValueError:
+ self.logger.error("Failed to parse JSON from response")
+ return
+
+ for event in events:
+ all_day = event.get("AllDay", False)
meeting = Meeting(
- title=self._parse_title(item),
+ title=event["Title"].strip(),
description="",
- classification=self._parse_classification(item),
- start=self._parse_start(item),
- end=self._parse_end(item),
- all_day=False,
- time_notes="",
- location=self._parse_location(item),
- links=self._parse_links(item),
- source=self._parse_source(response),
+ classification=self._get_classification(event["Title"]),
+ start=self._parse_datetime(event["Start"]),
+ end=self._parse_datetime(event["End"]) if not all_day else None,
+ all_day=event["AllDay"],
+ time_notes=None,
+ location=self.default_location,
+ links=self.default_links,
+ source=response.url,
)
-
meeting["status"] = self._get_status(meeting)
meeting["id"] = self._get_id(meeting)
+ yield meeting
- yield response.follow(
- meeting["links"][0]["href"],
- callback=self._follow_meeting_link,
- meta={"meeting": meeting},
- dont_filter=True,
- )
-
- def _follow_meeting_link(self, response):
- """we only need to navigate this page in order to get
- the necessary cookies to interact with EventDetailWrapper.aspx,
- which is processed by _parse_meeting_details"""
- event_date_id = response.request.url.split("/")[-1]
- follow_link = (
- "https://www.atlantapublicschools.us/site/UserControls/Calendar"
- "/EventDetailWrapper.aspx?ModuleInstanceID=17299"
- f"&EventDateID={event_date_id}&UserRegID=0&IsEdit=false"
- )
- yield response.follow(
- follow_link,
- callback=self._parse_meeting_details,
- meta=response.meta,
- dont_filter=True,
- )
-
- def _parse_meeting_details(self, response):
- """parse the meeting details page to scrape
- a meeting notice if there is one"""
- meeting = response.meta["meeting"]
- meeting_notice = response.css(
- "#cal-ed-description-body > p > a::attr(href)"
- ).get()
- if meeting_notice:
- meeting_notice = response.urljoin(meeting_notice)
- meeting["links"].append({"href": meeting_notice, "title": "Meeting notice"})
- yield meeting
-
- def _parse_title(self, item):
- return item.css(".sw-calendar-block-title > a::text").get()
+ def _parse_datetime(self, datetime_str):
+ """Convert an ISO-formatted date and time string to a datetime object."""
+ return datetime.fromisoformat(datetime_str)
- def _parse_classification(self, item):
- if "committee" in self._parse_title(item).lower():
- return ADVISORY_COMMITTEE
- elif "commission" in self._parse_title(item).lower():
- return COMMISSION
- else:
+ def _get_classification(self, title):
+ """Determine the classification based on keywords in the title."""
+ clean_title = title.lower()
+ if "board" in clean_title:
return BOARD
-
- def _parse_start(self, item):
- date = self._parse_date(item)
- time = self._parse_time(item, 0)
- return datetime(date.year, date.month, date.day, time.hour, time.minute)
-
- def _parse_end(self, item):
- date = self._parse_date(item)
- time = self._parse_time(item, 1)
- return datetime(date.year, date.month, date.day, time.hour, time.minute)
-
- def _parse_location(self, item):
- return {
- "address": "130 Trinity Avenue, Atlanta, GA 30303",
- "name": "Center for Learning and Leadership",
- }
-
- def _parse_links(self, item):
- """only parses meeting details link. the meeting notice,
- if it exists, is scraped from the meeting link"""
- href = item.css(".sw-calendar-block-title > a::attr(href)").get().strip()
- return [{"href": href, "title": "Meeting details"}]
-
- def _parse_source(self, response):
- """Parse or generate source."""
- return response.url
-
- def _parse_date(self, item):
- date_str = item.css(".sw-calendar-block-date::text").get()
- if date_str.lower() in self.weekdays:
- return self._get_next_weekday_date(date_str.lower())
- elif date_str.lower() == "today":
- return datetime.today()
- else:
- return datetime.strptime(date_str, "%B %d, %Y")
-
- def _parse_time(self, item, index):
- time_str = (
- item.css(".sw-calendar-block-time::text").get().split("-")[index].strip()
- )
- return datetime.strptime(time_str, "%I:%M %p")
-
- def _get_next_weekday_date(self, weekday):
- today = datetime.today()
- today_weekday = today.weekday()
- weekday_num = self.weekdays.index(weekday)
- delta = (weekday_num - today_weekday) % 7 or 7
- return today + timedelta(days=delta)
+ elif "committee" in clean_title:
+ return COMMITTEE
+ return COMMISSION
diff --git a/tests/files/atl_boe.html b/tests/files/atl_boe.html
deleted file mode 100644
index b5fa88a..0000000
--- a/tests/files/atl_boe.html
+++ /dev/null
@@ -1,11393 +0,0 @@
-
-
-
-
- Atlanta Board of Education / Overview
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
Select a School...
-
-
-
-
-
-
-
-
-
Search Our Site
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/tests/files/atl_boe.json b/tests/files/atl_boe.json
new file mode 100644
index 0000000..6d76345
--- /dev/null
+++ b/tests/files/atl_boe.json
@@ -0,0 +1 @@
+[{"Id": 261399, "EventID": 204285, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "Oglethorpe Renaming Committee Meeting", "Start": "2024-04-11T17:30:00", "End": "2024-04-11T19:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 261405, "EventID": 204291, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "District 6 Community Meeting", "Start": "2024-04-17T18:00:00", "End": "2024-04-17T19:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 260671, "EventID": 203587, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "Board Committee Day ", "Start": "2024-04-18T10:00:00", "End": "2024-04-18T16:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 261403, "EventID": 204289, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "Budget Commission Meeting", "Start": "2024-04-18T12:00:00", "End": "2024-04-18T15:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 261404, "EventID": 204290, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "Policy Review Committee Meeting", "Start": "2024-04-18T16:00:00", "End": "2024-04-18T17:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 261406, "EventID": 204292, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "District 1 Virtual Community Meeting", "Start": "2024-04-19T09:00:00", "End": 
"2024-04-19T10:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 261407, "EventID": 204293, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "District 2 Virtual Community Meeting ", "Start": "2024-04-23T12:00:00", "End": "2024-04-23T13:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 261865, "EventID": 204731, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "BEST Renaming Committee Meeting", "Start": "2024-04-25T17:00:00", "End": "2024-04-25T19:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 261408, "EventID": 204294, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "District 3 Virtual Community ", "Start": "2024-04-26T09:30:00", "End": "2024-04-26T10:30:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 261974, "EventID": 204824, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "Board Retreat - Whole Board Training ", "Start": "2024-05-01T13:00:00", "End": "2024-05-01T18:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 260672, "EventID": 203588, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "Board Meeting and Budget Hearing #1", "Start": "2024-05-06T14:30:00", "End": "2024-05-06T20:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 262088, 
"EventID": 204932, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "Oglethorpe Renaming Committee Meeting ", "Start": "2024-05-09T17:00:00", "End": "2024-05-09T18:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 260673, "EventID": 203589, "ModuleInstanceID": 17299, "RecurringEvent": false, "RegisteredUsers": false, "Title": "Board Student Recognition Meeting ", "Start": "2024-05-13T17:00:00", "End": "2024-05-13T20:00:00", "CategoryColor": "#849CE7", "CategoryTitle": "Community", "EventSource": "Calendar", "RecurringInfo": "TBD", "AllDay": false, "NoEndTime": 0}, {"Id": 250349, "EventID": 194223, "ModuleInstanceID": 113402, "RecurringEvent": false, "RegisteredUsers": false, "Title": "Last Day of School", "Start": "2024-05-24T00:00:00", "End": "2024-05-24T23:59:00", "CategoryColor": "#0083a9", "CategoryTitle": "District Event", "EventSource": "Atlanta Public Schools Calendars", "RecurringInfo": "TBD", "AllDay": true, "NoEndTime": 0}, {"Id": 250350, "EventID": 194224, "ModuleInstanceID": 113402, "RecurringEvent": false, "RegisteredUsers": false, "Title": "Memorial Day", "Start": "2024-05-27T00:00:00", "End": "2024-05-27T23:59:00", "CategoryColor": "#da7b22", "CategoryTitle": "Holiday", "EventSource": "Atlanta Public Schools Calendars", "RecurringInfo": "TBD", "AllDay": true, "NoEndTime": 0}]
\ No newline at end of file
diff --git a/tests/files/atl_boe_236419.html b/tests/files/atl_boe_236419.html
deleted file mode 100644
index 78a896e..0000000
--- a/tests/files/atl_boe_236419.html
+++ /dev/null
@@ -1,322 +0,0 @@
-
-
-
-
-
-
-
-
diff --git a/tests/test_atl_boe.py b/tests/test_atl_boe.py
index 9d51fb6..94c4c00 100644
--- a/tests/test_atl_boe.py
+++ b/tests/test_atl_boe.py
@@ -1,98 +1,68 @@
from datetime import datetime
from os.path import dirname, join
-import pytest
-from city_scrapers_core.constants import COMMISSION
+import pytest # noqa
+from city_scrapers_core.constants import COMMITTEE, PASSED
from city_scrapers_core.utils import file_response
from freezegun import freeze_time
from city_scrapers.spiders.atl_boe import AtlBoeSpider
test_response = file_response(
- join(dirname(__file__), "files", "atl_boe.html"),
- url="https://www.atlantapublicschools.us/apsboard",
+ join(dirname(__file__), "files", "atl_boe.json"),
+ url="https://awsapieast1-prod23.schoolwires.com/REST/api/v4/CalendarEvents/GetEvents/17299?StartDate=2024-04-10&EndDate=2024-08-08&ModuleInstanceFilter=&CategoryFilter=&IsDBStreamAndShowAll=true", # noqa
)
-meeting_response = file_response(
- join(dirname(__file__), "files", "atl_boe_236419.html"),
- url="https://www.atlantapublicschools.us/site/UserControls/Calendar//EventDetailWrapper.aspx?ModuleInstanceID=17299&EventDateID=236419&UserRegID=0&IsEdit=false", # noqa
-)
spider = AtlBoeSpider()
-freezer = freeze_time("2022-08-26")
+freezer = freeze_time("2024-05-10")
freezer.start()
-requests = [item for item in spider.parse(test_response)]
-parsed_items = []
-for request in requests:
- if "236419" in request.url:
- meeting_response.request = request
- parsed_items += [
- item for item in spider._parse_meeting_details(meeting_response)
- ]
- else:
- parsed_items.append(request.meta["meeting"])
-
-parsed_items.sort(key=lambda item: item["start"])
+parsed_items = [item for item in spider.parse_events(test_response)]
freezer.stop()
-def test_length():
- assert len(parsed_items) == 5
-
-
def test_title():
- assert parsed_items[0]["title"] == "Accountability Commission Meeting"
+ assert parsed_items[0]["title"] == "Oglethorpe Renaming Committee Meeting"
def test_start():
- assert parsed_items[0]["start"] == datetime(2022, 8, 29, 10, 0)
+ assert parsed_items[0]["start"] == datetime(2024, 4, 11, 17, 30)
def test_end():
- assert parsed_items[0]["end"] == datetime(2022, 8, 29, 11, 30)
+ assert parsed_items[0]["end"] == datetime(2024, 4, 11, 19, 0)
def test_id():
assert (
parsed_items[0]["id"]
- == "atl_boe/202208291000/x/accountability_commission_meeting"
+ == "atl_boe/202404111730/x/oglethorpe_renaming_committee_meeting"
)
def test_status():
- assert parsed_items[0]["status"] == "tentative"
+ assert parsed_items[0]["status"] == PASSED
def test_location():
assert parsed_items[0]["location"] == {
- "name": "Center for Learning and Leadership",
- "address": "130 Trinity Avenue, Atlanta, GA 30303",
+ "name": "Atlanta Public Schools",
+ "address": "130 Trinity Ave SW, Atlanta, GA 30303",
}
def test_source():
- assert parsed_items[0]["source"] == "https://www.atlantapublicschools.us/apsboard"
+ assert (
+ parsed_items[0]["source"]
+ == "https://awsapieast1-prod23.schoolwires.com/REST/api/v4/CalendarEvents/GetEvents/17299?StartDate=2024-04-10&EndDate=2024-08-08&ModuleInstanceFilter=&CategoryFilter=&IsDBStreamAndShowAll=true" # noqa
+ )
def test_links():
- assert parsed_items[0]["links"] == [
- {
- "href": "https://www.atlantapublicschools.us/site/Default.aspx?PageID=17673&DomainID=3944#calendar17299/20220829/event/236419", # noqa
- "title": "Meeting details",
- },
- {
- "href": "https://www.atlantapublicschools.us/cms/lib/GA01000924/Centricity/Domain/3944/Meeting Notice - Accountability Commission Meeting 08292022.pdf", # noqa
- "title": "Meeting notice",
- },
- ]
+ assert parsed_items[0]["links"] == AtlBoeSpider.default_links
def test_classification():
- assert parsed_items[0]["classification"] == COMMISSION
-
-
-@pytest.mark.parametrize("item", parsed_items)
-def test_all_day(item):
- assert item["all_day"] is False
+ assert parsed_items[0]["classification"] == COMMITTEE