
Commit

Fix CivPlus bugs #176 #189 (#190)
* Fix CivPlus bugs #176 #189
* linter fix
* Fix GH build process
zstumgoren authored Sep 6, 2024
1 parent a7775f1 commit 0eead10
Showing 12 changed files with 2,370 additions and 15 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/continuous-deployment.yml
@@ -117,7 +117,9 @@ jobs:
       - id: build
         name: Build release
-        run: make build-release
+        run: |
+          pipenv run pip install setuptools-scm>=8.1.0 --force-reinstall --upgrade
+          make build-release
       - id: check
         name: Check release
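
The workflow change above force-reinstalls setuptools-scm at or above 8.1.0 inside the pipenv environment before running the release build. A minimal sketch of a sanity check one could run in that same environment (the check is not part of this commit, and the simple major/minor parsing is an assumption):

# Hypothetical sanity check: confirm the setuptools-scm floor is met before building.
from importlib.metadata import version

installed = version("setuptools-scm")
major, minor = (int(part) for part in installed.split(".")[:2])
assert (major, minor) >= (8, 1), f"setuptools-scm {installed} is older than 8.1.0"
print(f"setuptools-scm {installed} satisfies >=8.1.0")
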
2 changes: 1 addition & 1 deletion civic_scraper/base/asset.py
@@ -45,7 +45,7 @@ def __init__(
         meeting_id: str = None,
         scraped_by: str = None,
         content_type: str = None,
-        content_length: str = None
+        content_length: str = None,
     ) -> None:
         self.url = url
         self.asset_name = asset_name
6 changes: 3 additions & 3 deletions civic_scraper/platforms/civic_clerk/site.py
@@ -25,9 +25,9 @@ def __init__(self, url, place=None, state_or_province=None, cache=Cache()):
         self.cache = cache

         self.session = Session()
-        self.session.headers[
-            "User-Agent"
-        ] = "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"
+        self.session.headers["User-Agent"] = (
+            "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"
+        )

         # Raise an error if a request gets a failing status code
         self.session.hooks = {
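
The hunk above only reflows the User-Agent assignment into the style the linter expects; behavior is unchanged. For context, a minimal self-contained sketch of the session setup pattern these platform scrapers use (the hook body shown is illustrative, since the file's own hooks dict is truncated above):

# Sketch of the scraper session setup: browser-like User-Agent plus a
# response hook that raises on failing status codes. Hook body is assumed.
from requests import Session

session = Session()
session.headers["User-Agent"] = (
    "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"
)
session.hooks = {
    "response": lambda response, *args, **kwargs: response.raise_for_status()
}
response = session.get("https://example.com")  # raises on 4xx/5xx responses
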
17 changes: 14 additions & 3 deletions civic_scraper/platforms/civic_plus/parser.py
@@ -38,6 +38,9 @@ def file_links_with_no_title(tag):
         )

         metadata = []
+        # Links often appear twice (once under meeting title, once in download menu)
+        # so we track which we've already seen to avoid duplicate entries
+        bookkeeping = set()
         for div in divs:
             cmte_name = self._committee_name(div)
             # Line-item data for each meeting is inside table rows.
@@ -52,6 +55,9 @@ def file_links_with_no_title(tag):
                     # Skip links to page listing previous agenda versions
                     if self._previous_version_link(link):
                         continue
+                    # Skip previously harvested links
+                    if link["href"] in bookkeeping:
+                        continue
                     metadata.append(
                         {
                             "committee_name": cmte_name,
@@ -63,13 +69,18 @@ def file_links_with_no_title(tag):
"asset_type": self._asset_type(link["href"]),
}
)
bookkeeping.add(link["href"])
return metadata

def _committee_name(self, div):
# Remove span that contains
# If present, remove span that contains
# arrow ▼ for toggling meeting list
div.h2.span.extract()
return div.h2.text.strip()
try:
div.h2.span.extract()
except AttributeError:
pass
header_node = div.h2 or div.h3
return header_node.text.strip()

def _mtg_title(self, row):
return row.p.text.strip()
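
The parser changes above carry the substance of the CivPlus fixes: a seen-set (bookkeeping) drops duplicate links to the same document, and _committee_name now tolerates headers without the toggle span or without an h2 at all. A minimal standalone sketch of both behaviors with BeautifulSoup (the HTML snippet is made up, not real CivicPlus markup):

# Standalone illustration of the dedup and header-fallback fixes; hypothetical HTML.
from bs4 import BeautifulSoup

html = """
<div><h3>City Council</h3>
  <a href="/Agenda/_07082024-123">Agenda</a>
  <a href="/Agenda/_07082024-123">Download Agenda</a>
</div>
"""
div = BeautifulSoup(html, "html.parser").div

header = div.h2 or div.h3          # some listings use <h3> instead of <h2>
print(header.text.strip())         # -> City Council

seen = set()
for link in div.find_all("a"):
    if link["href"] in seen:       # same document linked twice on the page
        continue
    seen.add(link["href"])
    print(link["href"])            # printed once: /Agenda/_07082024-123
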
4 changes: 3 additions & 1 deletion civic_scraper/platforms/civic_plus/site.py
@@ -29,7 +29,9 @@ def __init__(self, base_url, cache=Cache(), parser_kls=Parser, place_name=None):

     @property
     def place(self):
-        return self.place_name or self._get_asset_metadata(r"(?<=-)\w+(?=\.)", self.base_url)
+        return self.place_name or self._get_asset_metadata(
+            r"(?<=-)\w+(?=\.)", self.base_url
+        )

     def scrape(
         self,
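
The place property above is only re-wrapped for line length; the regex is unchanged. What the lookaround pattern extracts, shown against a made-up CivicPlus-style URL (assuming _get_asset_metadata applies the pattern with a plain re.search):

# Illustration of the place regex against a hypothetical CivicPlus URL.
import re

base_url = "https://ca-losaltos.civicplus.com/AgendaCenter"
match = re.search(r"(?<=-)\w+(?=\.)", base_url)
print(match.group())  # -> losaltos
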
2 changes: 1 addition & 1 deletion civic_scraper/platforms/legistar/site.py
@@ -105,7 +105,7 @@ def _create_asset(self, event, meeting_meta, asset_type):
         name_bits.append(asset_type)
         kwargs = {
             "url": event[asset_type]["url"],
-            "asset_type": asset_type.lower().replace(' ', '_'),
+            "asset_type": asset_type.lower().replace(" ", "_"),
             "asset_name": " - ".join(name_bits),
             "content_type": None,
             "content_length": None,
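
The Legistar change is purely the linter swapping single quotes for double quotes; the normalization itself is untouched. For reference, what it produces (the asset_type value is a hypothetical example):

# Hypothetical asset_type value, normalized the same way as above.
asset_type = "Agenda Packet"
print(asset_type.lower().replace(" ", "_"))  # -> agenda_packet
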
6 changes: 3 additions & 3 deletions civic_scraper/platforms/primegov/site.py
@@ -27,9 +27,9 @@ def __init__(self, url, place=None, state_or_province=None, cache=Cache()):
         self.cache = cache

         self.session = Session()
-        self.session.headers[
-            "User-Agent"
-        ] = "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"
+        self.session.headers["User-Agent"] = (
+            "Mozilla/5.0 (X11; CrOS x86_64 12871.102.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.141 Safari/537.36"
+        )

         # Raise an error if a request gets a failing status code
         self.session.hooks = {
1 change: 1 addition & 0 deletions scripts/generate_civicplus_sites.py
@@ -20,6 +20,7 @@
"/Users/amydipierro/GitHub/test.csv"
"""

import csv
import re

1 change: 1 addition & 0 deletions scripts/run_scraper.py
@@ -23,6 +23,7 @@
 path/to/target.csv \
 --scraper_args '{"start_date": "2015-09-09", "end_date": "2015-10-14"}'
 """
+
 from civic_scraper.scrapers import SUPPORTED_SITES


