Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🏗️ Build spider: Morton County Commission #10

Merged
merged 3 commits into from
Mar 19, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -133,4 +133,5 @@ src/
logs/*.log
travis/*.json

# output files: local gitignore added to city_scrapers/local_outputs/
# output files
*.csv
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ scrapy = "*"
scrapy-sentry-errors = "1.0.0"
city-scrapers-core = {ref = "main", git = "https://github.com/City-Bureau/city-scrapers-core.git", extras = ["azure"]}
scrapy-wayback-middleware = "*"
python-dateutil = "*"

[dev-packages]
freezegun = "*"
Expand Down
492 changes: 251 additions & 241 deletions Pipfile.lock

Large diffs are not rendered by default.

100 changes: 100 additions & 0 deletions city_scrapers/spiders/bisnd_mcc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
from datetime import datetime, time

from city_scrapers_core.constants import COMMISSION
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider
from dateutil.parser import parse


class BisndMccSpider(CityScrapersSpider):
name = "bisnd_mcc"
agency = "Morton County Commission"
timezone = "America/Chicago"
start_urls = [
"https://www.mortonnd.org/?Type=B_BASIC&SEC={8182F8B8-3783-4C56-B690-F78FEE7CAC95}" # noqa
]
title = "Morton County Commission Meeting"
meeting_time = time(17, 30)
location = {
"address": "Morton County Commission Room",
"name": "Morton County Courthouse, 210 2nd Ave NW, Mandan ND",
}
custom_settings = {
"ROBOTSTXT_OBEY": False,
}

def parse(self, response):
"""
Parse HTML table and extract meeting information. Due to a lack of info,
we hardcode certain fields, like start time and title
"""
for item in response.css("main[role='main'] table tbody tr:not(:first-child)"):
start, description = self._first_col(item)
if start is None:
self.logger.warning("Invalid date format - skipping")
continue
# ignore meetings that are more than a year ago
if start < datetime.now().replace(year=datetime.now().year - 1):
self.logger.info(
f"Skipping meeting from {start} as it is more than a year ago"
)
continue
meeting = Meeting(
title=self.title,
description=description,
classification=COMMISSION,
start=start,
end=None,
all_day=False,
time_notes="",
location=self.location,
links=self._parse_links(item),
source=response.url,
)

meeting["status"] = self._get_status(meeting)
meeting["id"] = self._get_id(meeting)

yield meeting

def _first_col(self, item):
"""Extract date of the meeting and any additional text from
the first column of the table row. Date of the meeting is in
"Month Day, Year" format."""
col_str = item.css("td span::text").extract_first()
description = ""

# Parse text
if col_str is None:
self.logger.info("No text found in date column")
return None, description
col_words = col_str.split()
if len(col_words) < 3:
# date should always be first three words
self.logger.info(f"Invalid date format: {col_words}")
return None, description
date_str = " ".join(col_words[:3])
if len(col_words) > 3:
# if there's any other text, use that as the description
description = " ".join(col_words[3:])

# parse date
try:
parsed_date = parse(date_str)
# combine date and time
parsed_date = datetime.combine(parsed_date.date(), self.meeting_time)
return parsed_date, description
except ValueError:
self.logger.info(f"Invalid date format: {date_str}")
return None, description

def _parse_links(self, item):
"""Parse or generate links."""
links = []
for row in item.css("td:not(:first-child) a"):
link = {
"title": row.css("::text").extract_first(),
"href": row.css("::attr(href)").extract_first(),
}
links.append(link)
return links
Loading
Loading