Legistar refactor (#154)
Closes #152 #153

- Standardize Site class name for Legistar
- Add date-filtering support
- Add download support
- Add filters for file type and size (see the usage sketch below)
- Download Agendas and Minutes by default
- Namespace vcrpy fixtures by test module name
- Add date parsing utility function
- Add test coverage for Legistar
- Fix broken CivicPlus test and namespace all tests under test mod name #149
- Fix typing-extensions build bug
zstumgoren authored Jun 22, 2022
1 parent 2907292 commit 9d2fac2
Showing 50 changed files with 198,823 additions and 27,774 deletions.
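For orientation, a minimal usage sketch of the refactored Legistar scraper follows. The class alias, scrape() arguments, and their defaults come from the civic_scraper/platforms/legistar/site.py diff below; the portal URL, timezone, and event_info_keys labels are illustrative assumptions that vary by Legistar site.

# Hypothetical usage of the refactored Legistar scraper; all values are examples only.
from civic_scraper.platforms import LegistarSite

site = LegistarSite(
    "https://nashville.legistar.com/Calendar.aspx",  # example portal URL
    timezone="America/Chicago",                      # example timezone
    event_info_keys={                                # column labels vary by site
        "meeting_details_info": "Meeting Details",
        "meeting_date_info": "Meeting Date",
        "meeting_time_info": "Meeting Time",
        "meeting_location_info": "Meeting Location",
    },
)

# Metadata only, limited to a date window (both dates default to the current day).
assets = site.scrape(start_date="2022-06-01", end_date="2022-06-22")

# Also download Agendas and Minutes, skipping files larger than 20 MB.
assets = site.scrape(
    start_date="2022-06-01",
    end_date="2022-06-22",
    download=True,
    file_size=20,
)
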
4 changes: 4 additions & 0 deletions .github/workflows/continuous-deployment.yml
@@ -48,6 +48,10 @@ jobs:
name: Install Python dependencies
run: pipenv install --dev --python `which python`

- id: typing-ext-install
name: Install Python dependencies
run: pipenv install --dev --python `which python` typing-extensions

- id: build
name: Build
run: make test-docs
1 change: 1 addition & 0 deletions Pipfile
@@ -32,6 +32,7 @@ sphinx-autobuild = "*"
black = "*"
setuptools-scm = "*"
sphinxcontrib-napoleon = "*"
pytz = "*"

[requires]
python_version = "3.7"
886 changes: 398 additions & 488 deletions Pipfile.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion civic_scraper/platforms/__init__.py
@@ -1,5 +1,5 @@
from .civic_clerk.site import CivicClerkSite
from .civic_plus.site import Site as CivicPlusSite
from .granicus.site import GranicusSite
from .legistar.site import LegistarSite
from .legistar.site import Site as LegistarSite
from .primegov.site import PrimeGovSite
171 changes: 118 additions & 53 deletions civic_scraper/platforms/legistar/site.py
@@ -2,16 +2,17 @@
from pathlib import Path
from urllib.parse import parse_qs, urlparse

import requests
from legistar.events import LegistarEventsScraper

import civic_scraper
from civic_scraper import base
from civic_scraper.base.asset import Asset, AssetCollection
from civic_scraper.base.cache import Cache
from civic_scraper.utils import parse_date, dtz_to_dt, mb_to_bytes, today_local_str


# Scrape today's agendas and minutes from a Legistar site
class LegistarSite(base.Site):
class Site(base.Site):
def __init__(
self,
base_url,
@@ -30,14 +31,92 @@ def __init__(
self.timezone = timezone
self.event_info_keys = event_info_keys

def create_asset(self, event, scraper):
def scrape(
self,
start_date=None,
end_date=None,
download=False,
file_size=None,
asset_list=["Agenda", "Minutes"],
):
"""Scrape a government website for metadata and/or docs.
Args:
start_date (str): YYYY-MM-DD (default: current day)
end_date (str): YYYY-MM-DD (default: current day)
download (bool): Download file assets such as PDFs (default: False)
file_size (float): Max size in Megabytes of file assets to download
asset_list (list): Optional list of SUPPORTED_ASSET_TYPES used to
limit the items scraped, e.g. Agenda, Minutes (default: ["Agenda", "Minutes"])
Returns:
AssetCollection: A sequence of Asset instances
"""
# Use current day as default
today = today_local_str()
start_date = start_date or today
end_date = end_date or today
webscraper = LegistarEventsScraper(
event_info_key=self.event_info_keys["meeting_details_info"],
retry_attempts=3,
)

# required to instantiate webscraper
webscraper.BASE_URL = urlparse(self.url).netloc
webscraper.EVENTSPAGE = self.url
webscraper.TIMEZONE = self.timezone
webscraper.date_format = "%m/%d/%Y %I:%M %p"

ac = AssetCollection()
start_year = int(start_date[:4])
events = [event[0] for event in webscraper.events(since=start_year)]
for event in events:
meeting_meta = self._extract_meeting_meta(event, webscraper)
for asset_type in asset_list:
# Skip if the event lacks a dict with a 'url' key for the given asset type
try:
asset = self._create_asset(event, meeting_meta, asset_type)
except TypeError:
continue
# Apply date and other filters
if self._skippable(
asset, start_date, end_date, file_size=file_size, download=download
):
continue
ac.append(asset)
if download:
asset_dir = Path(self.cache.path, "assets")
asset_dir.mkdir(parents=True, exist_ok=True)
for asset in ac:
if asset.url:
dir_str = str(asset_dir)
asset.download(target_dir=dir_str, session=webscraper)
return ac

def _add_file_meta(self, asset):
headers = requests.head(asset.url, allow_redirects=True).headers
asset.content_type = headers["content-type"]
asset.content_length = headers["content-length"]

def _create_asset(self, event, meeting_meta, asset_type):
name_bits = [self._event_name(event)]
meeting_id = meeting_meta["meeting_id"]
if meeting_id:
clean_id = meeting_id.split("_")[-1]
name_bits.append(clean_id)
name_bits.append(asset_type)
kwargs = {
"url": event[asset_type]["url"],
"asset_type": asset_type.lower().replace(' ', '_'),
"asset_name": " - ".join(name_bits),
"content_type": None,
"content_length": None,
}
kwargs.update(meeting_meta)
return Asset(**kwargs)

def _extract_meeting_meta(self, event, scraper):
detail_info = event[self.event_info_keys["meeting_details_info"]]
date_info = event[self.event_info_keys["meeting_date_info"]]
time_info = event[self.event_info_keys["meeting_time_info"]] or None
location_info = None
if self.event_info_keys["meeting_location_info"] in event.keys():
location_info = event[self.event_info_keys["meeting_location_info"]]

time_format = None
if time_info:
time_format = re.match(r"\d*?:\d{2} \w{2}", time_info)
@@ -47,7 +126,6 @@ def create_asset(self, event, scraper):
else:
meeting_datetime = " ".join((date_info, "12:00 AM"))

meeting_date = scraper.toDate(meeting_datetime)
meeting_time = scraper.toTime(meeting_datetime)

# use regex to match pattern #/#/#; raise warning if no match
@@ -65,55 +143,42 @@ def create_asset(self, event, scraper):
url = None
meeting_id = None

# get event name
if type(event["Name"]) == dict:
asset_name = event["Name"]["label"]
committee_name = event["Name"]["label"]
else:
asset_name = event["Name"]
committee_name = event["Name"]

e = {
"url": url,
"asset_name": asset_name,
"committee_name": committee_name,
"place": location_info,
return {
"committee_name": self._event_name(event),
"place": None,
"state_or_province": None,
"asset_type": "Agenda",
"meeting_date": meeting_date.strip(),
"meeting_date": dtz_to_dt(meeting_time),
"meeting_time": meeting_time,
"meeting_id": meeting_id,
"scraped_by": f"civic-scraper_{civic_scraper.__version__}",
"content_type": "txt",
"content_length": None,
}
return Asset(**e)

def scrape(self, download=True):
webscraper = LegistarEventsScraper(
event_info_key=self.event_info_keys["meeting_details_info"],
retry_attempts=3,
)

# required to instantiate webscraper
webscraper.BASE_URL = urlparse(self.url).netloc
webscraper.EVENTSPAGE = self.url
webscraper.TIMEZONE = self.timezone
webscraper.date_format = "%m/%d/%Y %I:%M %p"

ac = AssetCollection()
assets = [
self.create_asset(event[0], webscraper)
for event in webscraper.events(since=2021)
]
for a in assets:
ac.append(a)

def _event_name(self, event):
try:
return event["Name"]["label"]
except (KeyError, TypeError):
return event["Name"]

def _skippable(self, asset, start_date, end_date, file_size=None, download=False):
start = parse_date(start_date)
end = parse_date(end_date)
# Use a generic (non-timezone aware) date for filtering
meeting_date = dtz_to_dt(asset.meeting_date)
# Skip if document URL is not available
try:
if not asset.url.startswith("http"):
return True
except AttributeError:
return True
# Skip if meeting date isn't between/equal to start and end dates
if not start <= meeting_date <= end:
return True
# Add Content Type and Length when download specified
if download:
asset_dir = Path(self.cache.path, "assets")
asset_dir.mkdir(parents=True, exist_ok=True)
for asset in ac:
if asset.url:
dir_str = str(asset_dir)
asset.download(target_dir=dir_str, session=webscraper)
return ac
self._add_file_meta(asset)
# if file_size and download are given, then check byte count
if file_size and download:
max_bytes = mb_to_bytes(file_size)
if float(asset.content_length) > max_bytes:
return True
return False
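The download path above combines a HEAD request (_add_file_meta) with the mb_to_bytes helper added to civic_scraper/utils.py. A standalone sketch of that size check, assuming the server reports a Content-Length header and using an illustrative URL:

# Sketch of the file-size filter applied in Site._skippable when download and file_size are set.
import requests

def exceeds_max_size(url, max_mb):
    """Return True when the remote file reports a Content-Length above max_mb megabytes."""
    headers = requests.head(url, allow_redirects=True).headers
    content_length = headers.get("content-length")
    if content_length is None:
        return False  # no size reported, so nothing to filter on
    max_bytes = float(max_mb) * 1048576  # same conversion as utils.mb_to_bytes
    return float(content_length) > max_bytes

# e.g. exceeds_max_size("https://example.legistar.com/View.ashx?M=A&ID=123&GUID=abc", 20)
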
44 changes: 24 additions & 20 deletions civic_scraper/platforms/primegov/site.py
@@ -2,19 +2,19 @@
from datetime import datetime, timedelta
from urllib.parse import urlparse
from requests import Session
from collections import deque

import civic_scraper
from civic_scraper import base
from civic_scraper.base.asset import Asset, AssetCollection
from civic_scraper.base.cache import Cache


class PrimeGovSite(base.Site):
"""For each Primegov site, there seems to be multiple API endpoints that can be queried:
1. (GET) https://[city].primegov.com/api/meeting/search?from=[m/d/y]&to=[m/d/y]
2. (GET) https://[city].primegov.com/v2/PublicPortal/ListUpcomingMeetings
2. (GET) https://[city].primegov.com/v2/PublicPortal/ListArchivedMeetings?year=[year]
3. (POST) https://[city].primegov.com/api/search?
1. (GET) https://[city].primegov.com/api/meeting/search?from=[m/d/y]&to=[m/d/y]
2. (GET) https://[city].primegov.com/v2/PublicPortal/ListUpcomingMeetings
2. (GET) https://[city].primegov.com/v2/PublicPortal/ListArchivedMeetings?year=[year]
3. (POST) https://[city].primegov.com/api/search?
"""

def __init__(self, url, place=None, state_or_province=None, cache=Cache()):
@@ -38,21 +38,21 @@ def __init__(self, url, place=None, state_or_province=None, cache=Cache()):

def create_asset(self, entry, document):

url = self._get_agenda_url(entry['id'])
meeting_datetime = datetime.fromisoformat(entry['dateTime'])
meeting_id = self._get_meeting_id(document['id'])
url = self._get_agenda_url(entry["id"])
meeting_datetime = datetime.fromisoformat(entry["dateTime"])
meeting_id = self._get_meeting_id(document["id"])

e = {
"url": url,
"asset_name": entry['title'],
"asset_name": entry["title"],
"committee_name": None,
"place": self.place,
"state_or_province": self.state_or_province,
"asset_type": "Meeting",
"meeting_date": meeting_datetime.date(),
"meeting_time": meeting_datetime.time(),
"meeting_id": meeting_id,
"scraped_by": f'civic-scraper_{civic_scraper.__version__}',
"scraped_by": f"civic-scraper_{civic_scraper.__version__}",
"content_type": "html",
"content_length": None,
}
@@ -61,31 +61,35 @@ def create_asset(self, entry, document):

def _get_agenda_url(self, id):

return f'{self.base_url}/Portal/MeetingPreview?compiledMeetingDocumentFileId={id}'
return (
f"{self.base_url}/Portal/MeetingPreview?compiledMeetingDocumentFileId={id}"
)

def _get_meeting_id(self, object_id):

pattern = r'http[s]?:\/\/[www.]?(\S*).primegov.com\/[\S]*'
pattern = r"http[s]?:\/\/[www.]?(\S*).primegov.com\/[\S]*"
match = re.match(pattern, self.url)
return f'primegov_{match.group(1)}_{object_id}'
return f"primegov_{match.group(1)}_{object_id}"

def scrape(self, start_date=None, end_date=None):

# API requires both start and end dates
if not start_date or not end_date:
start_date = (datetime.today() - timedelta(days=30)).strftime('%m/%d/%Y')
end_date = datetime.today().strftime('%m/%d/%Y')
start_date = (datetime.today() - timedelta(days=30)).strftime("%m/%d/%Y")
end_date = datetime.today().strftime("%m/%d/%Y")

response = self.session.get(f'{self.base_url}/api/meeting/search?from={start_date}&to={end_date}')
response = self.session.get(
f"{self.base_url}/api/meeting/search?from={start_date}&to={end_date}"
)

ac = AssetCollection()

for meeting in response.json():
for entry in meeting['templates']:
if 'Agenda' in entry['title']:
for doc in entry['compiledMeetingDocumentFiles']:
for entry in meeting["templates"]:
if "Agenda" in entry["title"]:
for doc in entry["compiledMeetingDocumentFiles"]:
# HTML files have a compileOutputType of 3
if doc['compileOutputType'] == 3:
if doc["compileOutputType"] == 3:
ac.append(self.create_asset(meeting, doc))

return ac
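For comparison, a usage sketch of the PrimeGov scraper shown above; the city portal URL is illustrative, and the dates use the m/d/Y format the search endpoint expects:

# Hypothetical PrimeGov usage; the portal URL is an example only.
from civic_scraper.platforms import PrimeGovSite

site = PrimeGovSite("https://lacity.primegov.com/")

# The API needs both bounds; scrape() falls back to the last 30 days if either is omitted.
assets = site.scrape(start_date="06/01/2022", end_date="06/22/2022")
for asset in assets:
    print(asset.asset_name, asset.url)
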
14 changes: 14 additions & 0 deletions civic_scraper/utils.py
@@ -6,5 +6,19 @@ def today_local_str():
return datetime.now().strftime("%Y-%m-%d")


def parse_date(date_str, format="%Y-%m-%d"):
return datetime.strptime(date_str, format)


def dtz_to_dt(dtz):
return datetime.fromordinal(dtz.toordinal())


def default_user_home():
return join(expanduser("~"), ".civic-scraper")


def mb_to_bytes(size_mb):
if size_mb is None:
return None
return float(size_mb) * 1048576
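A quick sketch of the new helpers added above; the values in the trailing comments are what these functions return for the sample inputs:

from datetime import datetime
from civic_scraper.utils import parse_date, dtz_to_dt, mb_to_bytes

parse_date("2022-06-22")                   # datetime(2022, 6, 22, 0, 0)
mb_to_bytes(20)                            # 20971520.0
mb_to_bytes(None)                          # None

# dtz_to_dt strips the time (and any tzinfo) by round-tripping through the ordinal date.
dtz_to_dt(datetime(2022, 6, 22, 14, 30))   # datetime(2022, 6, 22, 0, 0)
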
1 change: 1 addition & 0 deletions docs/requirements.txt
@@ -1,3 +1,4 @@
sphinx
myst_parser
sphinxcontrib-napoleon
typing-extensions
2 changes: 2 additions & 0 deletions requirements-dev.txt
@@ -9,7 +9,9 @@ docutils
Sphinx
pytest
pytest-vcr
pytz
vcrpy
argh
twine
typing-extensions
sphinxcontrib-napoleon
1 change: 1 addition & 0 deletions setup.cfg
@@ -13,3 +13,4 @@ per-file-ignores =
civic_scraper/base/__init__.py:F401
civic_scraper/platforms/__init__.py:F401
tests/test_civic_plus_site.py: E501
tests/legistar_site.py: E402
2 changes: 2 additions & 0 deletions setup.py
@@ -80,6 +80,8 @@ def local_version(version):
"flake8",
"pytest",
"pytest-vcr",
"pytz",
"typing-extensions",
"vcrpy",
],
setup_requires=["setuptools_scm"],