Skip to content

Commit

Permalink
Build: wicks_wampo_tabs mixin & 3 spiders
Browse files Browse the repository at this point in the history
  • Loading branch information
SimmonsRitchie committed Mar 15, 2024
1 parent a3f9911 commit 25f67f7
Show file tree
Hide file tree
Showing 8 changed files with 2,667 additions and 0 deletions.
113 changes: 113 additions & 0 deletions city_scrapers/mixins/wampo_tabs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import re
from datetime import datetime

from city_scrapers_core.constants import COMMITTEE
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider
from dateutil.parser import parse


class WampoMixinTabsMeta(type):
    """
    Metaclass guarding classes that inherit from WampoMixinTabs.

    Every class created with this metaclass must declare a small set of
    static variables directly in its own class body; class creation
    fails with NotImplementedError otherwise.
    """

    def __init__(cls, name, bases, dct):
        required = ("name", "agency", "start_urls", "start_time")
        # A variable only counts if it appears in this class's own body
        # (dct); values merely inherited from a base class do not satisfy
        # the requirement.
        missing = [attr for attr in required if attr not in dct]

        if missing:
            raise NotImplementedError(
                f"{name} must define the following static variable(s): {', '.join(missing)}."  # noqa
            )

        super().__init__(name, bases, dct)


class WampoMixinTabs(CityScrapersSpider, metaclass=WampoMixinTabsMeta):
    """
    This mixin is designed for scraping meeting data from specific pages on the
    website of the Wichita Area Metropolitan Planning Organization (WAMPO) that use
    a tabbed interface. It is not applicable for all WAMPO pages.
    Child classes must define 'name', 'agency', 'start_urls', and 'start_time'
    as static variables (enforced by WampoMixinTabsMeta); 'location' and
    'meeting_title' have defaults below and may be overridden.
    """

    name = None
    agency = None
    start_urls = None
    # Default meeting location shared by the WAMPO bodies scraped via this mixin.
    location = {
        "name": "Wichita Area Metropolitan Planning Organization",
        "address": "271 W. 3rd St. N., Suite 101, Wichita, KS 67202",
    }
    timezone = "America/Chicago"
    # Time of day (a datetime.time) meetings begin; must be set by child classes.
    start_time = None
    # Title applied to every meeting yielded by parse(). Previously hard-coded
    # inside parse(), which mislabeled meetings for non-Executive-Committee
    # child spiders (e.g. ICT Safe, UCTC); those may now override this default.
    meeting_title = "Executive Committee Meeting"

    def parse(self, response):
        """
        Parse the page and yield a Meeting for each row found in the
        tab panels.
        """
        for item in response.css("div[role='tabpanel'] ul li p"):
            start = self._parse_start(item)
            if start is None:
                # logger.warning, not the deprecated logger.warn alias.
                self.logger.warning("Skipping row with no date")
                continue
            meeting = Meeting(
                title=self.meeting_title,
                description="",
                classification=COMMITTEE,
                start=start,
                end=None,
                all_day=False,
                time_notes="",
                location=self.location,
                links=self._parse_links(item),
                source=response.url,
            )
            meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)
            yield meeting

    def _parse_start(self, item):
        """
        Extracts all the text from the row and parses the first
        span that looks formatted like 11/1/2024 or November 12, 2024.

        Returns the parsed date combined with ``self.start_time``, or None
        when no parsable date is found.
        """
        for text in item.css("span::text").extract():
            clean_text = text.strip()
            if re.match(
                r"(?:[a-zA-Z]+\s\d{1,2},\s\d{4})|(?:\d{1,2}\/\d{1,2}\/\d{2,4})",
                clean_text,
            ):
                try:
                    start_date = parse(clean_text)
                    return datetime.combine(start_date, self.start_time)
                except ValueError:
                    self.logger.info(f"Could not parse date from {clean_text}")
                    # Give up after the first date-like span fails rather than
                    # scanning the remaining spans (matches original behavior).
                    return None
        self.logger.info("Could not find date in row")
        return None

    def _parse_links(self, item):
        """
        Parse all anchors in the row into {"href", "title"} dicts.
        """
        links = []
        for link in item.css("a"):
            links.append(
                {
                    "href": link.attrib["href"],
                    "title": link.css("::text").extract_first(),
                }
            )
        return links
73 changes: 73 additions & 0 deletions city_scrapers/spiders/wicks_wampo_ec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import re
from datetime import datetime, time

from city_scrapers_core.constants import COMMITTEE
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider


class WampoEcSpider(CityScrapersSpider):
    """
    Scrapes Executive Committee meetings from WAMPO's tabbed
    executive-committee page.

    NOTE(review): wicks_wampo_tabs.py in this same commit dynamically
    builds a spider that also uses the name "wicks_wampo_ec" — confirm
    whether this standalone spider is meant to coexist with it.
    """

    name = "wicks_wampo_ec"
    agency = "Wichita Area Metropolitan Planning Organization – Executive Committee"
    timezone = "America/Chicago"
    start_urls = ["https://www.wampo.org/executive-committee"]
    # Time of day meetings begin; combined with the parsed date in _parse_start.
    start_time = time(11, 0)
    location = {
        "name": "Wichita Area Metropolitan Planning Organization",
        "address": "271 W. 3rd St. N., Suite 101, Wichita, KS 67202",
    }

    def parse(self, response):
        """
        Parse the page and yield a Meeting for each row found in the
        tab panels.
        """
        for tabpanel in response.css("div[role='tabpanel'] ul"):
            for item in tabpanel.css("li p"):
                start = self._parse_start(item)
                if start is None:
                    # logger.warning, not the deprecated logger.warn alias.
                    self.logger.warning("Skipping row with no date")
                    continue
                meeting = Meeting(
                    title="Executive Committee Meeting",
                    description="",
                    classification=COMMITTEE,
                    start=start,
                    end=None,
                    all_day=False,
                    time_notes="",
                    location=self.location,
                    links=self._parse_links(item),
                    source=response.url,
                )
                meeting["status"] = self._get_status(meeting)
                meeting["id"] = self._get_id(meeting)
                yield meeting

    def _parse_start(self, item):
        """Extracts all the text from the row and parses the first
        column that looks formatted like MM/DD/YYYY.

        Returns the parsed date combined with ``self.start_time``, or
        None when no parsable date is found."""
        for text in item.css("span::text").extract():
            clean_text = text.strip()
            if re.match(r"\d{1,2}/\d{1,2}/\d{4}", clean_text):
                try:
                    start_date = datetime.strptime(clean_text, "%m/%d/%Y")
                    return datetime.combine(start_date, self.start_time)
                except ValueError:
                    self.logger.info(f"Could not parse date from {clean_text}")
                    # Give up after the first date-like span fails rather
                    # than scanning the remaining spans.
                    return None
        self.logger.info("Could not find date in row")
        return None

    def _parse_links(self, item):
        """Parse all anchors in the row into {"href", "title"} dicts."""
        links = []
        for link in item.css("a"):
            links.append(
                {
                    "href": link.attrib["href"],
                    "title": link.css("::text").extract_first(),
                }
            )
        return links
54 changes: 54 additions & 0 deletions city_scrapers/spiders/wicks_wampo_tabs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from datetime import time

from city_scrapers_core.constants import COMMITTEE

from city_scrapers.mixins.wampo_tabs import WampoMixinTabs

# Configuration for each spider.
# Each dict supplies the class attributes for one dynamically created
# spider: "class_name" is consumed by create_spiders() and the remaining
# keys become static attributes on the generated class (which the
# WampoMixinTabsMeta metaclass then validates).
# NOTE(review): the mixin's parse() hard-codes classification=COMMITTEE,
# so the "classification" key here becomes a class attribute but does not
# appear to be read — confirm whether the mixin should consume it.
spider_configs = [
    {
        "class_name": "WampoECSpider",
        # NOTE(review): "wicks_wampo_ec" is also the name of the standalone
        # WampoEcSpider in wicks_wampo_ec.py — confirm the duplicate spider
        # name is intended.
        "name": "wicks_wampo_ec",
        "agency": "Wichita Area Metropolitan Planning Organization – Executive Committee", # noqa
        "start_urls": ["https://www.wampo.org/executive-committee"],
        "start_time": time(11, 0),
        "classification": COMMITTEE,
    },
    {
        "class_name": "WicksWampoICTSSpider",
        "name": "wicks_wampo_icts",
        "agency": "Wichita Area Metropolitan Planning Organization – ICT Safe",
        "start_urls": ["https://www.wampo.org/ict-safe"],
        "start_time": time(9, 30),
        "classification": COMMITTEE,
    },
    {
        "class_name": "WicksWampoUCTCSpider",
        "name": "wicks_wampo_uctc",
        "agency": "Wichita Area Metropolitan Planning Organization – United Community Transit Coalition", # noqa
        "start_urls": ["https://www.wampo.org/uctc"],
        "start_time": time(14, 0),
        "classification": COMMITTEE,
    },
]


def create_spiders():
    """
    Dynamically create spider classes using the spider_configs list
    and then register them in the global namespace. This approach
    is the equivalent of declaring each spider class in the same
    file but it is more concise and centralized.
    """
    for config in spider_configs:
        # Work on a shallow copy so spider_configs is left intact. The
        # original popped "class_name" from the shared dict in place,
        # which would raise KeyError if create_spiders() ran twice.
        attrs = dict(config)
        class_name = attrs.pop("class_name")
        if class_name not in globals():
            spider_class = type(
                class_name,
                (WampoMixinTabs,),  # Base class
                attrs,  # Attributes
            )
            globals()[class_name] = spider_class


create_spiders()
Loading

0 comments on commit 25f67f7

Please sign in to comment.