Fix: link parsing

The current link parsing doesn't adequately handle the highly nested HTML structure of links on this agency's page.
City-Bureau · Mar 19, 2024 · 110025a · 110025a
1 parent 831d0c9
commit 110025a
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 11 deletions.
diff --git a/city_scrapers/spiders/wicks_wampo_tpb.py b/city_scrapers/spiders/wicks_wampo_tpb.py
@@ -1,4 +1,5 @@
 from datetime import datetime, time
+from unicodedata import normalize
 
 from city_scrapers_core.constants import BOARD
 from city_scrapers_core.items import Meeting
@@ -73,12 +74,29 @@ def _parse_start(self, item, parsed_year):
             return None
 
     def _parse_links(self, item):
-        links = []
+        """Parse links to meeting agendas and minutes. HTML is
+        very messy. Display text is often split across multiple
+        span tags. In some cases, multiple a tags make up the same
+        link (Eg. "Re", "cording")."""
+        link_text = []
+        link_hrefs = []
         for link in item.css("a"):
-            links.append(
-                {
-                    "href": link.attrib["href"],
-                    "title": link.css("::text").extract_first(),
-                }
+            url = link.attrib["href"]
+            # get text from all child spans
+            title = "".join(link.css("::text").extract()).strip()
+            # Strip white space and remove special characters
+            clean_title = (
+                normalize("NFKD", title).encode("ascii", "ignore").decode("utf-8")
             )
+            if url in link_hrefs:
+                # if link already exists, append to corresponding text
+                index = link_hrefs.index(url)
+                link_text[index] += clean_title
+            else:
+                link_hrefs.append(url)
+                link_text.append(clean_title)
+        # zip together
+        links = []
+        for i, title in enumerate(link_text):
+            links.append({"title": title, "href": link_hrefs[i]})
         return links
diff --git a/tests/test_wicks_wampo_tpb.py b/tests/test_wicks_wampo_tpb.py
@@ -14,7 +14,7 @@
 )
 spider = WicksWampoTPBSpider()
 
-freezer = freeze_time(datetime(2024, 3, 14, 14, 1))
+freezer = freeze_time(datetime(2024, 3, 19, 11, 39))
 freezer.start()
 
 parsed_items = [item for item in spider.parse(test_response)]
@@ -67,15 +67,14 @@ def test_source():
 def test_links():
     assert parsed_item["links"] == [
         {
+            "title": "Agenda Packet",
             "href": "https://www.wampo.org/_files/ugd/bbf89d_bc9c575ffcd9480ca7bd7b8c7611857b.pdf",  # noqa
-            "title": "Agenda",
         },
         {
+            "title": "Minutes",
             "href": "https://www.wampo.org/_files/ugd/bbf89d_8cf9fc33872e44d2a7ade0d2db6e5fae.pdf",  # noqa
-            "title": "Min",
         },
-        {"href": "https://youtu.be/LsMI1EClvnI", "title": "Re"},
-        {"href": "https://youtu.be/LsMI1EClvnI", "title": "cording"},
+        {"title": "Recording", "href": "https://youtu.be/LsMI1EClvnI"},
     ]