Commit

Fix spider: atl_boe
SimmonsRitchie committed May 10, 2024
1 parent 6a7fe5b commit 32688bb
Showing 5 changed files with 97 additions and 11,881 deletions.
city_scrapers/spiders/atl_boe.py (194 changes: 77 additions & 117 deletions)
@@ -1,6 +1,7 @@
 from datetime import datetime, timedelta
 
-from city_scrapers_core.constants import ADVISORY_COMMITTEE, BOARD, COMMISSION
+import scrapy
+from city_scrapers_core.constants import BOARD, COMMISSION, COMMITTEE
 from city_scrapers_core.items import Meeting
 from city_scrapers_core.spiders import CityScrapersSpider
 
@@ -9,129 +10,88 @@ class AtlBoeSpider(CityScrapersSpider):
     name = "atl_boe"
     agency = "Atlanta Board of Education"
     timezone = "America/New_York"
-    start_urls = ["https://www.atlantapublicschools.us/apsboard"]
-    custom_settings = {"COOKIES_ENABLED": True}
-    weekdays = (
-        "monday",
-        "tuesday",
-        "wednesday",
-        "thursday",
-        "friday",
-        "saturday",
-        "sunday",
-    )
+    start_urls = [
+        "https://www.atlantapublicschools.us/Generator/TokenGenerator.ashx/ProcessRequest"  # noqa
+    ]
+    calendar_base_url = "https://awsapieast1-prod23.schoolwires.com/REST/api/v4/CalendarEvents/GetEvents/17299"  # noqa
+    default_location = {
+        "name": "Atlanta Public Schools",
+        "address": "130 Trinity Ave SW, Atlanta, GA 30303",
+    }
+    default_links = [
+        {
+            "title": "Atlanta BOE Facebook page",
+            "href": "https://www.facebook.com/apsboard/",
+        }
+    ]
 
     def parse(self, response):
-        for item in response.css(".ui-article"):
-            if not item.css(".sw-calendar-block-date"):
-                continue
+        """
+        Get Bearer token from server's token generation endpoint
+        and then make request to calendar API
+        """
+        data = response.json()
+        token = data.get("Token")
+        if not token:
+            self.logger.error("No token found")
+            return
+
+        # Gen dates from 1 month ago to 3 months from today
+        start_date = datetime.now() - timedelta(days=30)
+        start_date_fmtd = start_date.strftime("%Y-%m-%d")
+        end_date = datetime.now() + timedelta(days=90)
+        end_date_fmtd = end_date.strftime("%Y-%m-%d")
+
+        yield scrapy.Request(
+            f"{self.calendar_base_url}?StartDate={start_date_fmtd}&EndDate={end_date_fmtd}&ModuleInstanceFilter=&CategoryFilter=&IsDBStreamAndShowAll=true",  # noqa
+            callback=self.parse_events,
+            headers={
+                "Authorization": f"Bearer {token}",
+                "Accept": "application/json",
+                # The headers below are not strictly necessary but provided to
+                # reduce the likelihood of being blocked by the server
+                "Referer": "https://www.atlantapublicschools.us/",
+                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36",  # noqa
+            },
+        )
+
+    def parse_events(self, response):
+        """
+        Parse events from JSON response
+        """
+        try:
+            events = response.json()
+        except ValueError:
+            self.logger.error("Failed to parse JSON from response")
+            return
+
+        for event in events:
+            all_day = event.get("AllDay", False)
             meeting = Meeting(
-                title=self._parse_title(item),
+                title=event["Title"].strip(),
                 description="",
-                classification=self._parse_classification(item),
-                start=self._parse_start(item),
-                end=self._parse_end(item),
-                all_day=False,
-                time_notes="",
-                location=self._parse_location(item),
-                links=self._parse_links(item),
-                source=self._parse_source(response),
+                classification=self._get_classification(event["Title"]),
+                start=self._parse_datetime(event["Start"]),
+                end=self._parse_datetime(event["End"]) if not all_day else None,
+                all_day=event["AllDay"],
+                time_notes=None,
+                location=self.default_location,
+                links=self.default_links,
+                source=response.url,
             )
-
             meeting["status"] = self._get_status(meeting)
             meeting["id"] = self._get_id(meeting)
+            yield meeting
 
-            yield response.follow(
-                meeting["links"][0]["href"],
-                callback=self._follow_meeting_link,
-                meta={"meeting": meeting},
-                dont_filter=True,
-            )
-
-    def _follow_meeting_link(self, response):
-        """we only need to navigate this page in order to get
-        the necessary cookies to interact with EventDetailWrapper.aspx,
-        which is processed by _parse_meeting_details"""
-        event_date_id = response.request.url.split("/")[-1]
-        follow_link = (
-            "https://www.atlantapublicschools.us/site/UserControls/Calendar"
-            "/EventDetailWrapper.aspx?ModuleInstanceID=17299"
-            f"&EventDateID={event_date_id}&UserRegID=0&IsEdit=false"
-        )
-        yield response.follow(
-            follow_link,
-            callback=self._parse_meeting_details,
-            meta=response.meta,
-            dont_filter=True,
-        )
-
-    def _parse_meeting_details(self, response):
-        """parse the meeting details page to scrape
-        a meeting notice if there is one"""
-        meeting = response.meta["meeting"]
-        meeting_notice = response.css(
-            "#cal-ed-description-body > p > a::attr(href)"
-        ).get()
-        if meeting_notice:
-            meeting_notice = response.urljoin(meeting_notice)
-            meeting["links"].append({"href": meeting_notice, "title": "Meeting notice"})
-        yield meeting
-
-    def _parse_title(self, item):
-        return item.css(".sw-calendar-block-title > a::text").get()
+    def _parse_datetime(self, datetime_str):
+        """Convert ISO formatted date and time string to a datetime object."""
+        return datetime.fromisoformat(datetime_str)
 
-    def _parse_classification(self, item):
-        if "committee" in self._parse_title(item).lower():
-            return ADVISORY_COMMITTEE
-        elif "commission" in self._parse_title(item).lower():
-            return COMMISSION
-        else:
+    def _get_classification(self, title):
+        """Determine the classification based on the title or other fields."""
+        clean_title = title.lower()
+        if "board" in clean_title:
             return BOARD
-
-    def _parse_start(self, item):
-        date = self._parse_date(item)
-        time = self._parse_time(item, 0)
-        return datetime(date.year, date.month, date.day, time.hour, time.minute)
-
-    def _parse_end(self, item):
-        date = self._parse_date(item)
-        time = self._parse_time(item, 1)
-        return datetime(date.year, date.month, date.day, time.hour, time.minute)
-
-    def _parse_location(self, item):
-        return {
-            "address": "130 Trinity Avenue, Atlanta, GA 30303",
-            "name": "Center for Learning and Leadership",
-        }
-
-    def _parse_links(self, item):
-        """only parses meeting details link. the meeting notice,
-        if it exists, is scraped from the meeting link"""
-        href = item.css(".sw-calendar-block-title > a::attr(href)").get().strip()
-        return [{"href": href, "title": "Meeting details"}]
-
-    def _parse_source(self, response):
-        """Parse or generate source."""
-        return response.url
-
-    def _parse_date(self, item):
-        date_str = item.css(".sw-calendar-block-date::text").get()
-        if date_str.lower() in self.weekdays:
-            return self._get_next_weekday_date(date_str.lower())
-        elif date_str.lower() == "today":
-            return datetime.today()
-        else:
-            return datetime.strptime(date_str, "%B %d, %Y")
-
-    def _parse_time(self, item, index):
-        time_str = (
-            item.css(".sw-calendar-block-time::text").get().split("-")[index].strip()
-        )
-        return datetime.strptime(time_str, "%I:%M %p")
-
-    def _get_next_weekday_date(self, weekday):
-        today = datetime.today()
-        today_weekday = today.weekday()
-        weekday_num = self.weekdays.index(weekday)
-        delta = (weekday_num - today_weekday) % 7 or 7
-        return today + timedelta(days=delta)
+        elif "committee" in clean_title:
+            return COMMITTEE
+        return COMMISSION
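
The updated spider's request flow is two steps: fetch a bearer token from TokenGenerator.ashx, then call the Schoolwires CalendarEvents API with that token. Below is a minimal standalone sketch of that flow for anyone who wants to sanity-check the endpoints outside of Scrapy; it is not part of the commit. The URLs, query parameters, and the "Token" field come from the diff above, while the assumptions that the token endpoint answers a plain GET and that GetEvents returns a JSON list of event objects with "Title"/"Start" fields are inferred from how parse and parse_events read the responses.

# Standalone verification sketch (not part of the commit).
import json
from datetime import datetime, timedelta
from urllib.request import Request, urlopen

TOKEN_URL = (
    "https://www.atlantapublicschools.us/Generator/TokenGenerator.ashx/ProcessRequest"
)
CALENDAR_URL = "https://awsapieast1-prod23.schoolwires.com/REST/api/v4/CalendarEvents/GetEvents/17299"


def fetch_events():
    # Step 1: get a bearer token from the token-generation endpoint
    with urlopen(Request(TOKEN_URL, headers={"Accept": "application/json"})) as resp:
        token = json.load(resp).get("Token")
    if not token:
        raise RuntimeError("No token returned by token endpoint")

    # Step 2: query the calendar API for events from 30 days back to 90 days
    # ahead, mirroring the date window the spider builds in parse()
    start = (datetime.now() - timedelta(days=30)).strftime("%Y-%m-%d")
    end = (datetime.now() + timedelta(days=90)).strftime("%Y-%m-%d")
    url = (
        f"{CALENDAR_URL}?StartDate={start}&EndDate={end}"
        "&ModuleInstanceFilter=&CategoryFilter=&IsDBStreamAndShowAll=true"
    )
    req = Request(
        url,
        headers={"Authorization": f"Bearer {token}", "Accept": "application/json"},
    )
    with urlopen(req) as resp:
        return json.load(resp)


if __name__ == "__main__":
    for event in fetch_events():
        print(event.get("Title"), event.get("Start"))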