Skip to content

Commit

Permalink
Merge pull request #65 from City-Bureau/fix-oma-mud
Browse files Browse the repository at this point in the history
🕷️ Fix spider: Omaha Metropolitan Utilities District
  • Loading branch information
SimmonsRitchie authored Apr 22, 2024
2 parents 46035a5 + 82eb7ec commit d842785
Show file tree
Hide file tree
Showing 5 changed files with 868 additions and 2,405 deletions.
76 changes: 25 additions & 51 deletions city_scrapers/spiders/oma_mud.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,35 +10,34 @@ class OmahaMudSpider(CityScrapersSpider):
# Class-level spider settings (the class header is only visible inside the
# diff hunk header above this line).
name = "oma_mud"
agency = "Omaha Metropolitan Utilities District"
timezone = "America/Chicago"
# NOTE(review): start_urls is assigned twice in this view. This is a flattened
# diff without +/- markers; presumably the multi-line value is the removed one
# and the single-line value is its replacement — confirm against the commit.
start_urls = [
"https://www.mudomaha.com/our-company/board-of-directors/board-meetings"
]
start_urls = ["https://www.mudomaha.com/about-us/board-meetings/"]
# Shared location dict; parse() references it as `location=self.location`.
location = {
"address": "7350 World Communications Drive", # No specific address given
"name": "Metropolitan Utilities District",
}
# Shared title string; parse() references it as `title=self.title`.
title = "Committee and Board meetings"

def parse(self, response):
"""
`parse` should always `yield` Meeting items.
Change the `_parse_title`, `_parse_start`, etc methods to fit your scraping
needs.
"""
# NOTE(review): this body is a flattened diff: indentation and +/- markers
# are missing, so two versions of the loop are interleaved below.
# Presumably the `//table[1]/tbody/tr` / `date_td` / `details_td` lines are
# the removed version and the `meetings-list` / `date_text` / `details`
# lines are the added one — confirm against the commit.
for item in response.xpath("//table[1]/tbody/tr"):
date_td = item.css(".views-field-field-date::text").get()
if not date_td:
for item in response.xpath("//ul[@class='meetings-list']/li"):
# Date text is read from the item's <p class="date">; items without it
# are skipped by the `continue` below.
date_text = item.xpath(".//p[@class='date']/text()").get()
if not date_text:
continue
date_text = date_text.strip()
# Text of every <p> under <article>; joined with spaces so the time
# search runs over all paragraphs at once.
details = item.xpath(".//article//p/text()").getall()
# Matches times written like "7:30 a.m."; items with no match are skipped.
# NOTE(review): `[a|p]` is a character class, so it matches "a", "p" or a
# literal "|"; `[ap]` is probably what was intended.
time = re.findall(r"\d{1,2}:\d{2} [a|p]\.m\.", " ".join(details))
if not time:
continue
date_td = date_td.strip()
# NOTE(review): on the next two lines `.get().strip()` raises AttributeError
# when nothing is selected, and `[0]` raises IndexError when no time matches.
details_td = item.css(".views-field-field-details::text").get().strip()
time = re.findall(r" \d{1,2}:\d{2} [a|p]\.m\.", details_td)[0]
# date is first part of date_td + time extracted from details
start = dateutil.parser.parse(date_td.split(" - ")[0] + time)
# Assuming the first time is always the start of the meeting
start = dateutil.parser.parse(date_text + " " + time[0])
# NOTE(review): `title`, `description`, `end`, `time_notes` and `location`
# each appear twice in the call below (old and new diff lines side by side).
meeting = Meeting(
title=date_td.split(" - ")[1].strip(),
description=details_td,
title=self.title,
description=" ".join(details).replace("\n", "").strip(),
classification=self._parse_classification(item),
start=start,
end=self._parse_end(item),
# NOTE(review): `end=False` differs from the None returned by `_parse_end`,
# whose docstring says the pipeline fills `end` when it is None — confirm
# that False is handled the same way.
end=False,
all_day=self._parse_all_day(item),
time_notes=self._parse_time_notes(item),
location=self._parse_location(item),
time_notes="",
location=self.location,
links=self._parse_links(item),
source=self._parse_source(response),
)
# The next line is a diff hunk header from the page, not code; presumably
# lines between the Meeting(...) call and the yield are collapsed here.
Expand All @@ -49,44 +48,19 @@ def parse(self, response):
yield meeting

def _parse_classification(self, item):
"""Return the meeting classification.

`item` is not inspected; every meeting gets NOT_CLASSIFIED.
"""
return NOT_CLASSIFIED

def _parse_start(self, item):
"""Parse start datetime as a naive datetime object.

Stub: `item` is not inspected and None is always returned. `parse` builds
`start` itself with `dateutil.parser.parse` instead of calling this method.
"""
return None

def _parse_end(self, item):
"""Parse end datetime as a naive datetime object. Added by pipeline if None

Stub: `item` is not inspected and None is always returned.
"""
return None

def _parse_time_notes(self, item):
"""Parse any additional notes on the timing of the meeting.

Stub: `item` is not inspected and an empty string is always returned.
"""
return ""

def _parse_all_day(self, item):
"""Parse or generate all-day status.

`item` is not inspected; False is always returned.
"""
return False

def _parse_location(self, item):
"""Parse or generate location.

Stub: `item` is not inspected; the returned dict has empty "address" and
"name" values.
"""
return {
"address": "",
"name": "",
}

def _parse_links(self, item):
"""Return a list of {"href", "title"} dicts built from anchors found in `item`."""
# NOTE(review): flattened diff — two versions are interleaved below. One takes
# every `.//a/@href` under the item and prefixes BASE_URL; the other only takes
# hrefs inside a div whose class contains "meetings-media" and uses them as-is.
# Presumably the BASE_URL version is the removed code — confirm against the
# commit.
BASE_URL = "https://www.mudomaha.com/"
links = item.xpath(".//a/@href").getall() or []
links = item.xpath(
".//div[contains(@class, 'meetings-media')]//a/@href"
).getall()
# In both versions a link is titled "Video" when its URL contains "youtube",
# otherwise "Documents".
return [
{
"href": BASE_URL + link,
"title": "Video" if "youtube" in link else "Documents",
}
{"href": link, "title": "Video" if "youtube" in link else "Documents"}
for link in links
]

def _parse_source(self, response):
"""Return the source URL for the meeting, taken from `response.url`."""
return response.url
753 changes: 753 additions & 0 deletions tests/files/oma_mud.html

Large diffs are not rendered by default.

Loading

0 comments on commit d842785

Please sign in to comment.