Skip to content

Commit

Permalink
Merge pull request #69 from City-Bureau/fix/oma-examining-engineers
Browse files Browse the repository at this point in the history
🕷️ Fix spider: Omaha Planning Department: The Board of Engineer Examiners
  • Loading branch information
SimmonsRitchie authored Jul 30, 2024
2 parents 724232f + 0db3059 commit f72ab78
Show file tree
Hide file tree
Showing 4 changed files with 717 additions and 10 deletions.
10 changes: 0 additions & 10 deletions city_scrapers/spiders/oma_examining.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,16 +56,6 @@ def parse(self, response):
yield meeting


class OmahaPlanningExaminersEngineers(OmahaExaminingBoardMixin, CityScrapersSpider):
name = "oma_planning_exam_engineers"
agency = "Omaha Planning Department: Board of Examiners (For Engineers)"
start_urls = [
"https://planning.cityofomaha.org/boards/board-of-examiners-for-engineers" # noqa
]
time = "12pm"
address = "Room 1210 - 12th Floor; Omaha-Douglas Civic Center, 1819 Farnam Street"


class OmahaPlanningExaminersPipefitters(OmahaExaminingBoardMixin, CityScrapersSpider):
name = "oma_planning_exam_pipefitters"
agency = "Omaha Planning Department: Board of Examiners (For Engineers)"
Expand Down
92 changes: 92 additions & 0 deletions city_scrapers/spiders/oma_planning_exam_engineers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
import re
from urllib.parse import urljoin

from city_scrapers_core.constants import NOT_CLASSIFIED
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider
from dateutil import parser


class OmahaPlanningExaminersEngineersSpider(CityScrapersSpider):
name = "oma_planning_exam_engineers"
agency = "Omaha Planning Department: The Board of Engineer Examiners"
start_urls = [
"https://planning.cityofomaha.org/boards/board-of-examiners-for-engineers" # noqa
]
timezone = "America/Chicago"
base_url = "https://planning.cityofomaha.org"
start_time = "12 p.m."
location = {
"name": "",
"address": "Room 1210 - 12th Floor; Omaha-Douglas Civic Center, 1819 Farnam Street", # noqa
}

def parse(self, response):
table = response.css("table.tabclr")

# skip the first two rows, which are headers
col_headers = table.css("tr")[1]
for row in table.css("tr")[2:]:
start = self._parse_start(row)
if not start:
# If we can't parse the start time, skip this row
continue
meeting = Meeting(
title="Board of Engineer Examiners meeting",
description="",
classification=NOT_CLASSIFIED,
start=start,
end=None,
all_day=False,
time_notes="",
location=self.location,
links=self._parse_links(row, col_headers),
source=response.url,
)
meeting["status"] = self._get_status(meeting)
meeting["id"] = self._get_id(meeting)
yield meeting

def _parse_start(self, row):
"""
Parse the start time from the second column of the row.
Date is in format "Month Day, Year" and time is always 12 p.m.
Date might be in a link, so we need to check for that.
"""
second_col = row.css("td:nth-child(2)")
date_str = second_col.css("::text").extract_first()

if not date_str:
date_str = second_col.css("a::text").extract_first()
if not date_str:
return
# use regex to capture only the date string in format "Month Day, Year"
clean_date = re.search(r"([A-Z][a-z]+ \d{1,2}, \d{4})", date_str)
if not clean_date:
return
clean_date_str = clean_date.group(0)
full_start_str = f"{clean_date_str} {self.start_time}"
return parser.parse(full_start_str)

def _parse_links(self, row, col_headers):
"""
Third and four columns contain links to meeting minutes and agendas.
"""
links = []
for col_num in [3, 4]:
col = row.css(f"td:nth-child({col_num})")
if col.css("a"):
header_selector = f"td:nth-child({col_num}) strong::text"
title_els = col_headers.css(header_selector)
# loop over all title_els and join
title = " ".join(title_els.extract())
clean_title = re.sub(r"\s+", " ", title)
relative_url = col.css("a::attr(href)").extract_first()
abs_url = urljoin(self.base_url, relative_url)
links.append(
{
"title": clean_title,
"href": abs_url,
}
)
return links
Loading

0 comments on commit f72ab78

Please sign in to comment.