Skip to content

Commit

Permalink
Fix: link parsing
Browse files Browse the repository at this point in the history
The current link parsing doesn't cope with the highly nested HTML structure of the links on this agency's page.
  • Loading branch information
SimmonsRitchie committed Mar 19, 2024
1 parent 831d0c9 commit 110025a
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 11 deletions.
30 changes: 24 additions & 6 deletions city_scrapers/spiders/wicks_wampo_tpb.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from datetime import datetime, time
from unicodedata import normalize

from city_scrapers_core.constants import BOARD
from city_scrapers_core.items import Meeting
Expand Down Expand Up @@ -73,12 +74,29 @@ def _parse_start(self, item, parsed_year):
return None

def _parse_links(self, item):
    """Parse links to meeting agendas and minutes.

    The HTML is very messy. Display text is often split across multiple
    span tags, and in some cases several ``a`` tags make up what is
    visually a single link (e.g. "Re" + "cording"). Fragments that share
    an href are therefore merged into one entry.

    Args:
        item: Selector-like object exposing ``css()``; every ``a``
            element found under it is inspected.

    Returns:
        A list of ``{"title": ..., "href": ...}`` dicts, one per distinct
        href, ordered by the first appearance of each href.
    """
    # Dicts preserve insertion order, so this keeps first-seen ordering
    # while giving constant-time lookup of hrefs that were already seen.
    titles_by_href = {}
    for link in item.css("a"):
        href = link.attrib.get("href")
        if not href:
            # Anchors without a target cannot be represented as a link.
            continue
        # Gather the text of every descendant node, not only the first.
        raw_title = "".join(link.css("::text").extract())
        # Drop non-ASCII characters first and strip afterwards, so that
        # whitespace sitting next to an invisible character (such as a
        # zero-width space) is removed as well.
        ascii_title = normalize("NFKD", raw_title).encode("ascii", "ignore")
        clean_title = ascii_title.decode("ascii").strip()
        # A repeated href means this tag continues an earlier link's text.
        titles_by_href[href] = titles_by_href.get(href, "") + clean_title
    return [
        {"title": title, "href": href} for href, title in titles_by_href.items()
    ]
9 changes: 4 additions & 5 deletions tests/test_wicks_wampo_tpb.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
)
spider = WicksWampoTPBSpider()

freezer = freeze_time(datetime(2024, 3, 14, 14, 1))
freezer = freeze_time(datetime(2024, 3, 19, 11, 39))
freezer.start()

parsed_items = [item for item in spider.parse(test_response)]
Expand Down Expand Up @@ -67,15 +67,14 @@ def test_source():
def test_links():
    """Check that each distinct URL yields one link with its full title.

    Titles whose text is split across several tags ("Re" + "cording")
    must come back merged into a single entry per href.
    """
    # parsed_item is set up at module level, outside this function.
    assert parsed_item["links"] == [
        {
            "title": "Agenda Packet",
            "href": "https://www.wampo.org/_files/ugd/bbf89d_bc9c575ffcd9480ca7bd7b8c7611857b.pdf",  # noqa
        },
        {
            "title": "Minutes",
            "href": "https://www.wampo.org/_files/ugd/bbf89d_8cf9fc33872e44d2a7ade0d2db6e5fae.pdf",  # noqa
        },
        {"title": "Recording", "href": "https://youtu.be/LsMI1EClvnI"},
    ]


Expand Down

0 comments on commit 110025a

Please sign in to comment.