Skip to content

Commit

Permalink
Build: wicks_wampo_tabs mixin & 3 spiders
Browse files Browse the repository at this point in the history
  • Loading branch information
SimmonsRitchie committed Mar 15, 2024
1 parent a3f9911 commit 25f67f7
Show file tree
Hide file tree
Showing 8 changed files with 2,667 additions and 0 deletions.
113 changes: 113 additions & 0 deletions city_scrapers/mixins/wampo_tabs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import re
from datetime import datetime

from city_scrapers_core.constants import COMMITTEE
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider
from dateutil.parser import parse


class WampoMixinTabsMeta(type):
    """
    Metaclass guarding classes that inherit from WampoMixinTabs.

    Every class created with this metaclass must declare a small set of
    static variables directly in its own class body; class creation
    fails with NotImplementedError otherwise.
    """

    def __init__(cls, name, bases, dct):
        required = ("name", "agency", "start_urls", "start_time")
        # A variable only counts if it appears in this class's own body
        # (dct); values merely inherited from a base class do not satisfy
        # the requirement.
        missing = [attr for attr in required if attr not in dct]

        if missing:
            raise NotImplementedError(
                f"{name} must define the following static variable(s): {', '.join(missing)}."  # noqa
            )

        super().__init__(name, bases, dct)


class WampoMixinTabs(CityScrapersSpider, metaclass=WampoMixinTabsMeta):
    """
    This mixin is designed for scraping meeting data from specific pages on the
    website of the Wichita Area Metropolitan Planning Organization (WAMPO) that use
    a tabbed interface. It is not applicable for all WAMPO pages.
    Child classes must define 'name', 'agency', 'start_urls', and 'start_time'
    as static variables (enforced by WampoMixinTabsMeta); 'location' and
    'meeting_title' have defaults below and may be overridden.
    """

    name = None
    agency = None
    start_urls = None
    # Default meeting location shared by the WAMPO bodies scraped via this mixin.
    location = {
        "name": "Wichita Area Metropolitan Planning Organization",
        "address": "271 W. 3rd St. N., Suite 101, Wichita, KS 67202",
    }
    timezone = "America/Chicago"
    # Time of day (a datetime.time) meetings begin; must be set by child classes.
    start_time = None
    # Title applied to every meeting yielded by parse(). Previously hard-coded
    # inside parse(), which mislabeled meetings for non-Executive-Committee
    # child spiders (e.g. ICT Safe, UCTC); those may now override this default.
    meeting_title = "Executive Committee Meeting"

    def parse(self, response):
        """
        Parse the page and yield a Meeting for each row found in the
        tab panels.
        """
        for item in response.css("div[role='tabpanel'] ul li p"):
            start = self._parse_start(item)
            if start is None:
                # logger.warning, not the deprecated logger.warn alias.
                self.logger.warning("Skipping row with no date")
                continue
            meeting = Meeting(
                title=self.meeting_title,
                description="",
                classification=COMMITTEE,
                start=start,
                end=None,
                all_day=False,
                time_notes="",
                location=self.location,
                links=self._parse_links(item),
                source=response.url,
            )
            meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)
            yield meeting

    def _parse_start(self, item):
        """
        Extracts all the text from the row and parses the first
        span that looks formatted like 11/1/2024 or November 12, 2024.

        Returns the parsed date combined with ``self.start_time``, or None
        when no parsable date is found.
        """
        for text in item.css("span::text").extract():
            clean_text = text.strip()
            if re.match(
                r"(?:[a-zA-Z]+\s\d{1,2},\s\d{4})|(?:\d{1,2}\/\d{1,2}\/\d{2,4})",
                clean_text,
            ):
                try:
                    start_date = parse(clean_text)
                    return datetime.combine(start_date, self.start_time)
                except ValueError:
                    self.logger.info(f"Could not parse date from {clean_text}")
                    # Give up after the first date-like span fails rather than
                    # scanning the remaining spans (matches original behavior).
                    return None
        self.logger.info("Could not find date in row")
        return None

    def _parse_links(self, item):
        """
        Parse all anchors in the row into {"href", "title"} dicts.
        """
        links = []
        for link in item.css("a"):
            links.append(
                {
                    "href": link.attrib["href"],
                    "title": link.css("::text").extract_first(),
                }
            )
        return links
73 changes: 73 additions & 0 deletions city_scrapers/spiders/wicks_wampo_ec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import re
from datetime import datetime, time

from city_scrapers_core.constants import COMMITTEE
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider


class WampoEcSpider(CityScrapersSpider):
    """
    Scrapes Executive Committee meetings from WAMPO's tabbed
    executive-committee page.

    NOTE(review): wicks_wampo_tabs.py in this same commit dynamically
    builds a spider that also uses the name "wicks_wampo_ec" — confirm
    whether this standalone spider is meant to coexist with it.
    """

    name = "wicks_wampo_ec"
    agency = "Wichita Area Metropolitan Planning Organization – Executive Committee"
    timezone = "America/Chicago"
    start_urls = ["https://www.wampo.org/executive-committee"]
    # Time of day meetings begin; combined with the parsed date in _parse_start.
    start_time = time(11, 0)
    location = {
        "name": "Wichita Area Metropolitan Planning Organization",
        "address": "271 W. 3rd St. N., Suite 101, Wichita, KS 67202",
    }

    def parse(self, response):
        """
        Parse the page and yield a Meeting for each row found in the
        tab panels.
        """
        for tabpanel in response.css("div[role='tabpanel'] ul"):
            for item in tabpanel.css("li p"):
                start = self._parse_start(item)
                if start is None:
                    # logger.warning, not the deprecated logger.warn alias.
                    self.logger.warning("Skipping row with no date")
                    continue
                meeting = Meeting(
                    title="Executive Committee Meeting",
                    description="",
                    classification=COMMITTEE,
                    start=start,
                    end=None,
                    all_day=False,
                    time_notes="",
                    location=self.location,
                    links=self._parse_links(item),
                    source=response.url,
                )
                meeting["status"] = self._get_status(meeting)
                meeting["id"] = self._get_id(meeting)
                yield meeting

    def _parse_start(self, item):
        """Extracts all the text from the row and parses the first
        column that looks formatted like MM/DD/YYYY.

        Returns the parsed date combined with ``self.start_time``, or
        None when no parsable date is found."""
        for text in item.css("span::text").extract():
            clean_text = text.strip()
            if re.match(r"\d{1,2}/\d{1,2}/\d{4}", clean_text):
                try:
                    start_date = datetime.strptime(clean_text, "%m/%d/%Y")
                    return datetime.combine(start_date, self.start_time)
                except ValueError:
                    self.logger.info(f"Could not parse date from {clean_text}")
                    # Give up after the first date-like span fails rather
                    # than scanning the remaining spans.
                    return None
        self.logger.info("Could not find date in row")
        return None

    def _parse_links(self, item):
        """Parse all anchors in the row into {"href", "title"} dicts."""
        links = []
        for link in item.css("a"):
            links.append(
                {
                    "href": link.attrib["href"],
                    "title": link.css("::text").extract_first(),
                }
            )
        return links
54 changes: 54 additions & 0 deletions city_scrapers/spiders/wicks_wampo_tabs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from datetime import time

from city_scrapers_core.constants import COMMITTEE

from city_scrapers.mixins.wampo_tabs import WampoMixinTabs

# Configuration for each spider.
# Each dict supplies the class attributes for one dynamically created
# spider: "class_name" is consumed by create_spiders() and the remaining
# keys become static attributes on the generated class (which the
# WampoMixinTabsMeta metaclass then validates).
# NOTE(review): the mixin's parse() hard-codes classification=COMMITTEE,
# so the "classification" key here becomes a class attribute but does not
# appear to be read — confirm whether the mixin should consume it.
spider_configs = [
    {
        "class_name": "WampoECSpider",
        # NOTE(review): "wicks_wampo_ec" is also the name of the standalone
        # WampoEcSpider in wicks_wampo_ec.py — confirm the duplicate spider
        # name is intended.
        "name": "wicks_wampo_ec",
        "agency": "Wichita Area Metropolitan Planning Organization – Executive Committee", # noqa
        "start_urls": ["https://www.wampo.org/executive-committee"],
        "start_time": time(11, 0),
        "classification": COMMITTEE,
    },
    {
        "class_name": "WicksWampoICTSSpider",
        "name": "wicks_wampo_icts",
        "agency": "Wichita Area Metropolitan Planning Organization – ICT Safe",
        "start_urls": ["https://www.wampo.org/ict-safe"],
        "start_time": time(9, 30),
        "classification": COMMITTEE,
    },
    {
        "class_name": "WicksWampoUCTCSpider",
        "name": "wicks_wampo_uctc",
        "agency": "Wichita Area Metropolitan Planning Organization – United Community Transit Coalition", # noqa
        "start_urls": ["https://www.wampo.org/uctc"],
        "start_time": time(14, 0),
        "classification": COMMITTEE,
    },
]


def create_spiders():
    """
    Dynamically create spider classes using the spider_configs list
    and then register them in the global namespace. This approach
    is the equivalent of declaring each spider class in the same
    file but it is more concise and centralized.
    """
    for config in spider_configs:
        # Work on a shallow copy so spider_configs is left intact. The
        # original popped "class_name" from the shared dict in place,
        # which would raise KeyError if create_spiders() ran twice.
        attrs = dict(config)
        class_name = attrs.pop("class_name")
        if class_name not in globals():
            spider_class = type(
                class_name,
                (WampoMixinTabs,),  # Base class
                attrs,  # Attributes
            )
            globals()[class_name] = spider_class


create_spiders()
Loading

0 comments on commit 25f67f7

Please sign in to comment.