MITLibraries · ehanson8 · Sep 20, 2024 · Sep 10, 2024 · Sep 11, 2024 · Sep 11, 2024
diff --git a/.github/pull-request-template.md b/.github/pull-request-template.md
@@ -1,39 +1,30 @@
-### What does this PR do?
-
-Describe the overall purpose of the PR changes. Doesn't need to be as specific as the
-individual commits.
-
-### Helpful background context
-
-Describe any additional context beyond what the PR accomplishes if it is likely to be
-useful to a reviewer.
-
-Delete this section if it isn't applicable to the PR.
+### Purpose and background context
+Describe the overall purpose of the PR changes and any useful background context.
 
 ### How can a reviewer manually see the effects of these changes?
-
 Explain how to see the proposed changes in the application if possible.
 
 Delete this section if it isn't applicable to the PR.
 
 ### Includes new or updated dependencies?
+YES | NO
 
+### Changes expectations for external applications?
 YES | NO
 
 ### What are the relevant tickets?
-
-Include links to Jira Software and/or Jira Service Management tickets here.
+- Include links to Jira Software and/or Jira Service Management tickets here.
 
 ### Developer
-
-- [ ] All new ENV is documented in README (or there is none)
+- [ ] All new ENV is documented in README
+- [ ] All new ENV has been added to staging and production environments
+- [ ] All related Jira tickets are linked in commit message(s)
 - [ ] Stakeholder approval has been confirmed (or is not needed)
 
-### Code Reviewer
-
-- [ ] The commit message is clear and follows our guidelines
-      (not just this pull request message)
+### Code Reviewer(s)
+- [ ] The commit message is clear and follows our guidelines (not just this PR message)
 - [ ] There are appropriate tests covering any new functionality
-- [ ] The documentation has been updated or is unnecessary
-- [ ] The changes have been verified
+- [ ] The provided documentation is sufficient for understanding any new functionality introduced
+- [ ] Any manual tests have been performed **or** provided examples verified
 - [ ] New dependencies are appropriate or there were no changes
+
diff --git a/.python-version b/.python-version
@@ -1 +1 @@
-3.11
+3.12
diff --git a/Dockerfile b/Dockerfile
@@ -14,10 +14,10 @@ RUN apt-get update \
     && add-apt-repository ppa:deadsnakes/ppa \
     && apt-get update
 
-# Install Python 3.11
-RUN apt-get install -y python3.11 python3.11-venv python3.11-dev
+# Install Python
+RUN apt-get install -y python3.12 python3.12-venv python3.12-dev
 
-# Install pip for Python 3.11
+# Install pip for Python
 RUN apt-get install -y python3-pip
 
 # Upgrade pip and install pipenv
@@ -28,7 +28,7 @@ RUN pip3 install --upgrade pip \
 # Setup python virtual environment
 WORKDIR /browsertrix-harvester
 COPY Pipfile /browsertrix-harvester/Pipfile
-RUN pipenv install --python 3.11
+RUN pipenv install --python 3.12
 
 # Copy full browsertrix-harvester app
 COPY pyproject.toml /browsertrix-harvester/

diff --git a/Makefile b/Makefile
@@ -7,6 +7,10 @@ ECR_URL_DEV:=222053980223.dkr.ecr.us-east-1.amazonaws.com/browsertrix-harvester-
 SHELL=/bin/bash
 DATETIME:=$(shell date -u +%Y%m%dT%H%M%SZ)
 
+help: ## Print this message
+	@awk 'BEGIN { FS = ":.*##"; print "Usage:  make <target>\n\nTargets:" } \
+/^[-_[:alpha:]]+:.?*##/ { printf "  %-15s%s\n", $$1, $$2 }' $(MAKEFILE_LIST)
+
 ### Dependency commands ###
 install: # install python dependencies
 	pipenv install --dev

diff --git a/Pipfile b/Pipfile
@@ -4,30 +4,30 @@ verify_ssl = true
 name = "pypi"
 
 [packages]
+bs4 = "*"
 click = "*"
-sentry-sdk = "*"
-warcio = "*"
-requests = "*"
 pandas = "*"
-bs4 = "*"
+requests = "*"
+sentry-sdk = "*"
 smart-open = {version = "*", extras = ["s3"]}
+warcio = "*"
 yake = "*"
 
 [dev-packages]
 black = "*"
 coverage = "*"
 coveralls = "*"
+ipython = "*"
 mypy = "*"
+pandas-stubs = "*"
+pre-commit = "*"
 pytest = "*"
 ruff = "*"
 safety= "*"
-pre-commit = "*"
-ipython = "*"
 types-beautifulsoup4 = "*"
-pandas-stubs = "*"
 
 [requires]
-python_version = "3.11"
+python_version = "3.12"
 
 [scripts]
 harvester = "python -c \"from harvester.cli import main; main()\""

diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/harvester/cli.py b/harvester/cli.py
@@ -1,4 +1,5 @@
 """harvester.cli"""
+
 # ruff: noqa: FBT001
 
 import logging

diff --git a/harvester/exceptions.py b/harvester/exceptions.py
@@ -1,4 +1,5 @@
 """harvester.exceptions"""
+
 # ruff: noqa: N818
 
 

diff --git a/harvester/metadata.py b/harvester/metadata.py
@@ -1,9 +1,10 @@
 """harvester.metadata"""
+
 # ruff: noqa: N813
 
 import logging
 import time
-import xml.etree.ElementTree as etree
+import xml.etree.ElementTree as ET
 
 import pandas as pd
 import smart_open  # type: ignore[import]
@@ -118,15 +119,16 @@ def generate_metadata(self, include_fulltext: bool = False) -> "CrawlMetadataRec
 
                 # augment with metadata parsed from the website's HTML content
                 html_content = wacz_client.get_website_content(
-                    row.filename, row.offset, decode=True
+                    str(row.filename), str(row.offset), decode=True
                 )
                 html_metadata = self.get_html_content_metadata(html_content)
                 metadata.update(html_metadata)
 
                 # augment again with data parsed from, and including, HTML fulltext
                 metadata.update(
                     self.parse_fulltext_fields(
-                        row.text, include_fulltext=include_fulltext
+                        row.text,  # type:ignore[arg-type]
+                        include_fulltext=include_fulltext,
                     )
                 )
 
@@ -198,9 +200,11 @@ def parse_fulltext_fields(
         fulltext = self._remove_fulltext_whitespace(raw_fulltext)
         return {
             "fulltext": fulltext if include_fulltext else None,
-            "fulltext_keywords": self._generate_fulltext_keywords(fulltext)
-            if include_fulltext_keywords
-            else None,
+            "fulltext_keywords": (
+                self._generate_fulltext_keywords(fulltext)
+                if include_fulltext_keywords
+                else None
+            ),
         }
 
     @property
@@ -242,15 +246,15 @@ def to_xml(self) -> bytes:
             <record>...</record>, ...
         </records>
         """
-        root = etree.Element("records")
+        root = ET.Element("records")
         for _, row in self.metadata_df.iterrows():
-            item = etree.Element("record")
+            item = ET.Element("record")
             root.append(item)
             for col in self.metadata_df.columns:
-                cell = etree.Element(col)
+                cell = ET.Element(col)
                 cell.text = str(row[col])
                 item.append(cell)
-        return etree.tostring(root, encoding="utf-8", method="xml")
+        return ET.tostring(root, encoding="utf-8", method="xml")
 
     def write(self, filepath: str) -> None:
         """Serialize metadata records in various file formats.

diff --git a/harvester/utils.py b/harvester/utils.py
@@ -1,4 +1,5 @@
 """harvester.utils"""
+
 # ruff: noqa: ANN401
 
 import os

diff --git a/pyproject.toml b/pyproject.toml
@@ -15,7 +15,15 @@ markers = [
 ]
 
 [tool.ruff]
-target-version = "py311"
+target-version = "py312"
+
+# set max line length
+line-length = 90
+
+# enumerate all fixed violations
+show-fixes = true
+
+[tool.ruff.lint]
 select = ["ALL", "PT"]
 
 ignore = [
@@ -35,6 +43,7 @@ ignore = [
     "D103",
     "D104",
     "D415",
+    "G004",
     "PLR0912",
     "PLR0913",
     "PLR0915",
@@ -47,23 +56,21 @@ ignore = [
 # allow autofix behavior for specified rules
 fixable = ["E", "F", "I", "Q"]
 
-# set max line length
-line-length = 90
-
-# enumerate all fixed violations
-show-fixes = true
-
-[tool.ruff.flake8-annotations]
+[tool.ruff.lint.flake8-annotations]
 mypy-init-return = true
 
-[tool.ruff.flake8-pytest-style]
+[tool.ruff.lint.flake8-pytest-style]
 fixture-parentheses = false
 
-[tool.ruff.per-file-ignores]
-"tests/**/*" = ["ANN", "ARG001", "S101"]
+[tool.ruff.lint.per-file-ignores]
+"tests/**/*" = [
+    "ANN",
+    "ARG001",
+    "S101",
+]
 
-[tool.ruff.pycodestyle]
+[tool.ruff.lint.pycodestyle]
 max-doc-length = 90
 
-[tool.ruff.pydocstyle]
+[tool.ruff.lint.pydocstyle]
 convention = "google"
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -1,4 +1,5 @@
 """tests.test_cli"""
+
 # ruff: noqa: S108
 
 from unittest.mock import call, mock_open, patch

diff --git a/tests/test_metadata.py b/tests/test_metadata.py
@@ -1,4 +1,5 @@
 """tests.test_metadata"""
+
 # ruff: noqa: SLF001, PD002, PD901
 
 from unittest.mock import mock_open, patch

diff --git a/tests/test_wacz.py b/tests/test_wacz.py
@@ -1,4 +1,5 @@
 """tests.test_wacz"""
+
 # ruff: noqa: SLF001, PD901
 
 import logging