Skip to content

Commit

Permalink
Fix: dekalb_county_boe
Browse files Browse the repository at this point in the history
  • Loading branch information
SimmonsRitchie committed Jul 15, 2024
1 parent 169bbd3 commit ac6be03
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 119 deletions.
138 changes: 69 additions & 69 deletions city_scrapers/spiders/atl_dekalb_county_boe.py
Original file line number Diff line number Diff line change
@@ -1,84 +1,84 @@
from datetime import datetime
import json

import scrapy
from city_scrapers_core.constants import BOARD
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from dateutil.parser import parse as parse_date


class AtlDekalbCountyBoeSpider(CityScrapersSpider):
name = "atl_dekalb_county_boe"
agency = "DeKalb County Board of Education"
timezone = "America/New_York"
start_urls = ["https://www.dekalbschoolsga.org/board-of-education/board-meetings/"]
link_extractor = LxmlLinkExtractor(restrict_css=".ai1ec-event-details")
source = "https://simbli.eboardsolutions.com/SB_Meetings/SB_MeetingListing.aspx?S=4054" # noqa

def parse(self, response):
    """Request every meeting detail page linked from the listing page.

    Args:
        response: Listing-page response. The ``href`` of each
            ``a.ai1ec-read-more`` anchor is followed.

    Yields:
        The result of ``response.follow`` for each link, with
        ``self._parse_meeting_page`` registered as the callback.
    """
    for href in response.css("a.ai1ec-read-more::attr(href)"):
        yield response.follow(
            href.get(),
            callback=self._parse_meeting_page,
        )

def _parse_meeting_page(self, response):
start, end, all_day = self._parse_time(response.css(".dt-duration"))
meeting = Meeting(
title=self._parse_title(response),
description=self._parse_description(response),
classification=BOARD,
start=start,
end=end,
all_day=all_day,
time_notes="",
location=self._parse_location(response),
links=self._parse_links(response),
source=self._parse_source(response),
def start_requests(self):
url = (
"https://simbli.eboardsolutions.com/Services/api/GetMeetingListing" # noqa
)

meeting["status"] = self._get_status(meeting)
meeting["id"] = self._get_id(meeting)

yield meeting

def _parse_title(self, item):
return item.css(".entry-title::text").get()

def _parse_description(self, item):
return "\n".join(item.css(".post-content > p").getall())

def _parse_time(self, dt_duration):
start_str = dt_duration.css("::text").get().strip()
all_day = True if dt_duration.css(".ai1ec-allday-badge") else False
if all_day:
start = datetime.strptime(start_str, "%B %d, %Y")
end = None
else:
start_str, end_time = start_str.split("–")
start = datetime.strptime(start_str, "%B %d, %Y @ %I:%M %p")
end = datetime.strptime(end_time, "%I:%M %p")
end = start.replace(hour=end.hour, minute=end.minnute)
return start, end, all_day

def _parse_location(self, item):
address = [
a.strip() for a in item.css(".p-location::text").getall() if a.strip()
]
return {
"address": ", ".join(address[1:]),
"name": address[0],
# Return 50 most recent meetings.
# ConnectionString and SecurityToken are required.
body = {
"ListingType": "0",
"TimeZone": "0",
"CustomSort": 0,
"SortColName": "DateTime",
"IsSortDesc": True,
"RecordStart": 0,
"RecordCount": 50,
"FilterExp": "",
"ParentGroup": None,
"IsUserLoggedIn": False,
"UserID": "",
"UserRole": None,
"EncUserId": None,
"Id": 0,
"SchoolID": "4054",
"ConnectionString": "Z6PDprZMNLXHjSBXkCx3nyYUcSP5M6UnadUK7cjlwACaJqjO6BIZp9WiwanwbY4ZVnjRygpzATee7Qu0w1S8HmAR37HwZBl63V1gla1aplusJUjsbp3RPOgYD8rKMge0DRnjghPLCYcGBvWfEYLDJCwhuND0gFm8zDEltMnSkGHH8U=", # noqa
"SecurityToken": "ZekKE44z6voP8TArAiQr1KqQ7APJMDvo3Mr5tEPYHAow2XgYXKhCVFLu2pHhFaTMoGVOGKg8vFV2Yz70u3sDLEVU4nY7qDAdNvoAJgGmnzjBEfmMTseZAXEzpY4u1Boz", # noqa
"CreatedOn": "0001-01-01T00:00:00",
"CreatedBy": None,
"ModifiedOn": "0001-01-01T00:00:00",
"ModifiedBy": None,
"DeletedBy": None,
"DeletedOnUTC": None,
"IsDeleted": False,
}
serialized_body = json.dumps(body)
yield scrapy.Request(
url,
method="POST",
body=serialized_body,
headers={
"Content-Type": "application/json",
"Accept": "application/json",
},
)

def _parse_links(self, item):
links = [
{
"href": "https://www.dekalbschoolsga.org/communications/dstv",
"title": "DSTV (Comcast channel 24)",
def parse(self, response):
data = response.json()
# write to file
with open("dekalb.json", "w") as f:
json.dump(data, f)
for item in data:
start = parse_date(item["MM_DateTime"])
location = {
"name": item["MM_Address1"],
"address": f"{item['MM_Address2']} {item['MM_Address3']}",
}
]
for link in self.link_extractor.extract_links(item):
links.append({"href": link.url, "title": link.text.strip()})
return links

def _parse_source(self, response):
return response.url
meeting = Meeting(
title=item["MM_MeetingTitle"],
description="",
classification=BOARD,
start=start,
end=None,
all_day=False,
time_notes="",
location=location,
links=[],
source=self.source,
)
meeting["status"] = self._get_status(meeting)
meeting["id"] = self._get_id(meeting)
yield meeting
1 change: 1 addition & 0 deletions tests/files/atl_dekalb_county_boe.json

Large diffs are not rendered by default.

75 changes: 25 additions & 50 deletions tests/test_atl_dekalb_county_boe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,104 +5,79 @@
from city_scrapers_core.utils import file_response
from freezegun import freeze_time

from city_scrapers.spiders.atl_dekalb_county_boe import AtlDekalbCountyBoeSpider
from city_scrapers.spiders.atl_dekalb_county_boe import AtlDekalbCountyBoeSpider # noqa

test_response = file_response(
join(dirname(__file__), "files", "atl_dekalb_county_boe.html"),
join(dirname(__file__), "files", "atl_dekalb_county_boe.json"),
url="https://www.dekalbschoolsga.org/board-of-education/board-meetings/",
)

meeting_response = file_response(
join(dirname(__file__), "files", "atl_dekalb_county_boe_927.html"),
url="https://www.dekalbschoolsga.org/event/board-of-education-meeting-21/?instance_id=927", # noqa
)

spider = AtlDekalbCountyBoeSpider()

freezer = freeze_time("2022-09-21")
freezer = freeze_time("2024-07-15")
freezer.start()

requests = [item for item in spider.parse(test_response)]
meeting = [item for item in spider._parse_meeting_page(meeting_response)][0]

parsed_items = [item for item in spider.parse(test_response)]
parsed_item = parsed_items[0]
freezer.stop()


def test_len():
assert len(requests) == 4
assert len(parsed_items) == 49


def test_title():
assert meeting["title"] == "Board of Education Meeting"
assert parsed_item["title"] == "Audit Committee Meeting"


def test_description():
assert (
meeting["description"]
== "<p>All Business Meetings begin at 1:00pm (unless noticed for some other time), and will be held in the J. David Williamson Board Room in the Robert R. Freeman Administrative Complex, 1701 Mountain Industrial Boulevard, Stone Mountain, Georgia.</p>\n" # noqa
"<p>All Work Sessions begin at 1:00pm (unless noticed for some other time), and will be held in the J. David Williamson Board Room, in the Robert R. Freeman Administrative Complex, 1701 Mountain Industrial Boulevard, Stone Mountain, Georgia.</p>\n" # noqa
'<p>All meetings can be viewed on DeKalb Schools TV (DSTV) by going to: <a href="https://www.dekalbschoolsga.org/communications/dstv" target="_blank" rel="noopener">www.dekalbschoolsga.org/communications/dstv</a> or Comcast channel 24 (for Comcast subscribers in DeKalb County).</p>\n' # noqa
'<p>For more info, visit the <a href="https://simbli.eboardsolutions.com/Index.aspx?S=4054">Board of Education homepage</a>.</p>' # noqa
)
assert parsed_item["description"] == ""


def test_start():
assert meeting["start"] == datetime(2022, 11, 14, 0, 0)
assert parsed_item["start"] == datetime(2024, 7, 11, 13, 0)


def test_end():
assert meeting["end"] is None
assert parsed_item["end"] is None


def test_id():
assert (
meeting["id"]
== "atl_dekalb_county_boe/202211140000/x/board_of_education_meeting"
parsed_item["id"]
== "atl_dekalb_county_boe/202407111300/x/audit_committee_meeting"
)


def test_status():
assert meeting["status"] == "tentative"
assert parsed_item["status"] == "passed"


def test_location():
assert meeting["location"] == {
"name": "Robert R. Freeman Administrative & Instructional Complex",
"address": "1701 Mountain Industrial Blvd, Stone Mountain, GA 30083, USA",
assert parsed_item["location"] == {
"name": "Board Office Conference Room, Robert R. Freeman Administrative Complex", # noqa
"address": "1701 Mountain Industrial Boulevard Stone Mountain, Georgia 30083", # noqa
}


def test_source():
assert (
meeting["source"]
== "https://www.dekalbschoolsga.org/event/board-of-education-meeting-21/?instance_id=927" # noqa
parsed_item["source"]
== "https://simbli.eboardsolutions.com/SB_Meetings/SB_MeetingListing.aspx?S=4054" # noqa
)


def test_links():
assert meeting["links"] == [
{
"href": "https://www.dekalbschoolsga.org/communications/dstv",
"title": "DSTV (Comcast channel 24)",
},
{
"href": "https://www.google.com/maps?f=q&hl=&source=embed&q=33.83259%2C-84.194653", # noqa
"title": "",
},
{
"href": "https://simbli.eboardsolutions.com/index.aspx?s=4054",
"title": "Event website",
},
{
"href": "https://www.dekalbschoolsga.org/calendar/cat_ids~7/",
"title": "board of education",
},
]
assert (
parsed_item["links"] == []
) # Assuming no links are provided based on your data


def test_classification():
assert meeting["classification"] == BOARD
assert parsed_item["classification"] == BOARD


def test_all_day():
assert meeting["all_day"] is True
assert (
parsed_item["all_day"] is False
) # Based on the boolean value for all_day from the data row

0 comments on commit ac6be03

Please sign in to comment.