diff --git a/civic_scraper/platforms/civic_plus/parser.py b/civic_scraper/platforms/civic_plus/parser.py
index a152135e..c63852c7 100644
--- a/civic_scraper/platforms/civic_plus/parser.py
+++ b/civic_scraper/platforms/civic_plus/parser.py
@@ -38,6 +38,9 @@ def file_links_with_no_title(tag):
)
metadata = []
+ # Links often appear twice (once under the meeting title, once in the download menu),
+ # so we track the hrefs we've already seen to avoid duplicate entries
+ bookkeeping = set()
for div in divs:
cmte_name = self._committee_name(div)
# Line-item data for each meeting is inside table rows.
@@ -52,6 +55,9 @@ def file_links_with_no_title(tag):
# Skip links to page listing previous agenda versions
if self._previous_version_link(link):
continue
+ # Skip previously harvested links
+ if link["href"] in bookkeeping:
+ continue
metadata.append(
{
"committee_name": cmte_name,
@@ -63,13 +69,18 @@ def file_links_with_no_title(tag):
"asset_type": self._asset_type(link["href"]),
}
)
+ bookkeeping.add(link["href"])
return metadata
def _committee_name(self, div):
- # Remove span that contains
+ # If present, remove span that contains
# arrow ▼ for toggling meeting list
- div.h2.span.extract()
- return div.h2.text.strip()
+ try:
+ div.h2.span.extract()
+ except AttributeError:
+ pass
+ header_node = div.h2 or div.h3
+ return header_node.text.strip()
def _mtg_title(self, row):
return row.p.text.strip()
diff --git a/tests/fixtures/civplus_alameda_water.html b/tests/fixtures/civplus_alameda_water.html
new file mode 100644
index 00000000..6eba7bdf
--- /dev/null
+++ b/tests/fixtures/civplus_alameda_water.html
@@ -0,0 +1,2322 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Agenda Center • Alameda County Water District, CA • CivicEng
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ - Home
- Agenda Center
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
◄ Back to all agendas
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ []
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/tests/test_civic_plus_parser.py b/tests/test_civic_plus_parser.py
index 2ee9f778..8eb692c2 100644
--- a/tests/test_civic_plus_parser.py
+++ b/tests/test_civic_plus_parser.py
@@ -1,5 +1,7 @@
from datetime import datetime
+from .conftest import read_fixture
+
from civic_scraper.platforms.civic_plus.parser import Parser
@@ -38,3 +40,21 @@ def test_extract_all_asset_types_for_meeting(search_results_html):
"agenda_packet",
]
assert asset_types == expected_types
+
+def test_parse_alameda():
+ "Parser should extract all items on page for Alameda WD, which uses a different page structure"
+ html = read_fixture("civplus_alameda_water.html")
+ parser = Parser(html)
+ data = parser.parse()
+ assert len(data) == 2
+ first = data[0]
+ assert first["committee_name"] == "Engineering and Information Technology Committee"
+ assert first["url_path"] == "/AgendaCenter/ViewFile/Agenda/_09042024-1447"
+ assert first["meeting_date"] == datetime(2024, 9, 4)
+ assert first["meeting_time"] is None
+ assert (
+ first["meeting_title"]
+ == "Agenda"
+ )
+ assert first["meeting_id"] == "_09042024-1447"
+ assert first["asset_type"] == "agenda"
diff --git a/tests/test_civic_plus_site.py b/tests/test_civic_plus_site.py
index aea0dd23..6bb5a2fd 100644
--- a/tests/test_civic_plus_site.py
+++ b/tests/test_civic_plus_site.py
@@ -49,9 +49,7 @@ def test_scrape_defaults():
"agenda",
"minutes",
"agenda",
- "agenda",
"minutes",
- "agenda",
]
actual_asset_types = [asset.asset_type for asset in assets]
assert expected_asset_types == actual_asset_types