From adb1a4bb96af528f8e46b3f141390cab4368d0b7 Mon Sep 17 00:00:00 2001
From: John Clary <john.clary@austintexas.gov>
Date: Tue, 27 Aug 2024 08:31:38 -0400
Subject: [PATCH 1/5] handle smaller cr3 pdf page size

---
 atd-etl/cris_import/utils/process_pdfs.py | 47 ++++++++++++++++-------
 atd-etl/cris_import/utils/settings.py     |  5 ++-
 2 files changed, 37 insertions(+), 15 deletions(-)

diff --git a/atd-etl/cris_import/utils/process_pdfs.py b/atd-etl/cris_import/utils/process_pdfs.py
index 383679536..db5fa5a08 100644
--- a/atd-etl/cris_import/utils/process_pdfs.py
+++ b/atd-etl/cris_import/utils/process_pdfs.py
@@ -5,7 +5,7 @@
 
 import time
 
-from pdf2image import convert_from_path
+from pdf2image import convert_from_path, pdfinfo_from_path
 
 from utils.graphql import UPDATE_CRASH_CR3_FIELDS, make_hasura_request
 from utils.logging import get_logger
@@ -19,27 +19,42 @@
 logger = get_logger()
 
 
-def is_new_cr3_form(page):
-    """Determine if the CR3 is following the older or newer format.
-
+def get_cr3_version(page, page_width):
+    """Determine the CR3 from version.
+    
     The check is conducted by sampling if various pixels are black.
 
     Args:
         page (PIL image): the pdf page as an image
+        page_width (int): the width of the PDF in points
 
     Returns:
-        bool: true if the provided page passes the sampling tests
+        str: 'v1', 'v2_large', or 'v2_small'
     """
-    new_cr3_form = True
+    if page_width < 700:
+        # the most recent version is the same layout as v2-large
+        # but it is delivered in a smaller page size
+        # this took effect 27 Aug 2024
+        return "v2_small"
+
     for pixel in NEW_CR3_FORM_TEST_PIXELS:
         rgb_pixel = page.getpixel(pixel)
         if rgb_pixel[0] != 0 or rgb_pixel[1] != 0 or rgb_pixel[2] != 0:
-            new_cr3_form = False
-            break
-    return new_cr3_form
+            # the PDF fails our pixel checks, so assume it's the
+            # earliest version
+            return "v1"
+
+    return "v2_large"
+
+
+def get_pdf_width(pdf_path):
+    """Return the width of the pdf in points, truncated to a whole number"""
+    pdf_info = pdfinfo_from_path(pdf_path)
+    # "Page size" looks like '612 x 792 pts (letter)'; the width may be fractional, e.g. '595.276'
+    return int(float(pdf_info["Page size"].split(" ")[0]))
 
 
-def crop_and_save_diagram(page, cris_crash_id, is_new_cr3_form, extract_dir):
+def crop_and_save_diagram(page, cris_crash_id, bbox, extract_dir):
     """Crop out the crash diagram and save it to the local directory.
 
     The diagram is saved to <extract_dir>/crash_diagrams/<cris_crash_id>.jpeg
@@ -47,14 +62,13 @@ def crop_and_save_diagram(page, cris_crash_id, is_new_cr3_form, extract_dir):
     Args:
         page (PIL image): the CR3 pdf page as an image
         cris_crash_id (int): the CRIS crash ID
-        is_new_cr3_form (bool): if the CR3 is in the 'new' format
+        bbox (tuple[int]): the bounding box pixels to crop
         extract_dir (str): the local directory in which to save the file
 
     Returns:
         str: diagram_full_path - the full path to the saved diagram, including it's name
         str: diagram_filename - the name of diagram file, .e.g 12345678.jpeg
     """
-    bbox = DIAGRAM_BBOX_PIXELS["new"] if is_new_cr3_form else DIAGRAM_BBOX_PIXELS["old"]
     diagram_image = page.crop(bbox)
     diagram_filename = f"{cris_crash_id}.jpeg"
     diagram_full_path = os.path.join(extract_dir, "crash_diagrams", diagram_filename)
@@ -79,7 +93,10 @@ def process_pdf(extract_dir, filename, s3_upload, index):
     logger.info(f"Processing {filename} ({index})")
     cris_crash_id = int(filename.replace(".pdf", ""))
     pdf_path = os.path.join(extract_dir, "crashReports", filename)
+    page_width = get_pdf_width(pdf_path)
+
     logger.debug("Converting PDF to image...")
+
     page = convert_from_path(
         pdf_path,
         fmt="jpeg",  # jpeg is much faster than the default ppm fmt
@@ -88,9 +105,13 @@ def process_pdf(extract_dir, filename, s3_upload, index):
         dpi=150,
     )[0]
 
+    cr3_version = get_cr3_version(page, page_width)
+    bbox = DIAGRAM_BBOX_PIXELS[cr3_version]
+
     logger.debug("Cropping crash diagram...")
+
     diagram_full_path, diagram_filename = crop_and_save_diagram(
-        page, cris_crash_id, is_new_cr3_form(page), extract_dir
+        page, cris_crash_id, bbox, extract_dir
     )
 
     if s3_upload:
diff --git a/atd-etl/cris_import/utils/settings.py b/atd-etl/cris_import/utils/settings.py
index c4fba923f..1a33b5cea 100644
--- a/atd-etl/cris_import/utils/settings.py
+++ b/atd-etl/cris_import/utils/settings.py
@@ -2,8 +2,9 @@
 
 """This is the bbox used to crop the crash diagram"""
 DIAGRAM_BBOX_PIXELS = {
-    "old": (2589, 3531, 5001, 6048),
-    "new": (2496, 3036, 4836, 5464),
+    "v1": (2589, 3531, 5001, 6048),
+    "v2_small": (658, 791, 1270, 1430),
+    "v2_large": (2496, 3036, 4836, 5464),
 }
 
 """If all four of these pixels are black, it is a 'new' CR3 pdf"""

From 7afe88a78e1ef20d1475d7535e1d5f249ac7ebfe Mon Sep 17 00:00:00 2001
From: John Clary <john.clary@austintexas.gov>
Date: Tue, 27 Aug 2024 11:26:42 -0400
Subject: [PATCH 2/5] handle new and legacy cr3 versions in both small and
 large

---
 atd-etl/cris_import/utils/process_pdfs.py | 23 ++++++++++++-----------
 atd-etl/cris_import/utils/settings.py     | 23 ++++++++++++++++-------
 2 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/atd-etl/cris_import/utils/process_pdfs.py b/atd-etl/cris_import/utils/process_pdfs.py
index db5fa5a08..81610b6a4 100644
--- a/atd-etl/cris_import/utils/process_pdfs.py
+++ b/atd-etl/cris_import/utils/process_pdfs.py
@@ -21,30 +21,31 @@
 
 def get_cr3_version(page, page_width):
     """Determine the CR3 from version.
-    
+
     The check is conducted by sampling if various pixels are black.
 
+    On August 27, 2024 CRIS started delivering all CR3s using a
+    smaller page size. This function was adapted to handle the
+    legacy large format page and the new smaller page size.
+
     Args:
         page (PIL image): the pdf page as an image
         page_width (int): the width of the PDF in points
 
     Returns:
-        str: 'v1', 'v2_large', or 'v2_small'
+        str: 'v1_small', 'v1_large', 'v2_large', or 'v2_small'
     """
-    if page_width < 700:
-        # the most recent version is the same layout as v2-large
-        # but it is delivered in a smaller page size
-        # this took effect 27 Aug 2024
-        return "v2_small"
+    page_size = "small" if page_width < 700 else "large"
+    test_pixels = NEW_CR3_FORM_TEST_PIXELS[page_size]
 
-    for pixel in NEW_CR3_FORM_TEST_PIXELS:
+    for pixel in test_pixels:
         rgb_pixel = page.getpixel(pixel)
-        if rgb_pixel[0] != 0 or rgb_pixel[1] != 0 or rgb_pixel[2] != 0:
+        if rgb_pixel[0] > 5 or rgb_pixel[1] > 5 or rgb_pixel[2] > 5:
             # the PDF fails our pixel checks, so assume it's the
             # earliest version
-            return "v1"
+            return f"v1_{page_size}"
 
-    return "v2_large"
+    return f"v2_{page_size}"
 
 
 def get_pdf_width(pdf_path):
diff --git a/atd-etl/cris_import/utils/settings.py b/atd-etl/cris_import/utils/settings.py
index 1a33b5cea..6a3e91f7a 100644
--- a/atd-etl/cris_import/utils/settings.py
+++ b/atd-etl/cris_import/utils/settings.py
@@ -2,18 +2,27 @@
 
 """This is the bbox used to crop the crash diagram"""
 DIAGRAM_BBOX_PIXELS = {
-    "v1": (2589, 3531, 5001, 6048),
+    "v1_small": (2589 * 0.263, 3531 * 0.263, 5001 * 0.263, 6048 * 0.263),
+    "v1_large": (2589, 3531, 5001, 6048),
     "v2_small": (658, 791, 1270, 1430),
     "v2_large": (2496, 3036, 4836, 5464),
 }
 
 """If all four of these pixels are black, it is a 'new' CR3 pdf"""
-NEW_CR3_FORM_TEST_PIXELS = [
-    (215, 2567),
-    (872, 2568),
-    (625, 1806),
-    (4834, 279),
-]
+NEW_CR3_FORM_TEST_PIXELS = {
+    "small": [
+        (115, 670),
+        (300, 670),
+        (165, 224),
+        (545, 224),
+    ],
+    "large": [
+        (215, 2567),
+        (872, 2568),
+        (625, 1806),
+        (4834, 279),
+    ],
+}
 
 CSV_UPLOAD_BATCH_SIZE = 1000
 

From 812aaec922ceff5967a59d4c9358f276063cef43 Mon Sep 17 00:00:00 2001
From: John Clary <john.clary@austintexas.gov>
Date: Wed, 28 Aug 2024 08:15:35 -0400
Subject: [PATCH 3/5] fix typo and clean up bbox values

---
 atd-etl/cris_import/utils/process_pdfs.py | 2 +-
 atd-etl/cris_import/utils/settings.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/atd-etl/cris_import/utils/process_pdfs.py b/atd-etl/cris_import/utils/process_pdfs.py
index 81610b6a4..f91a61e61 100644
--- a/atd-etl/cris_import/utils/process_pdfs.py
+++ b/atd-etl/cris_import/utils/process_pdfs.py
@@ -20,7 +20,7 @@
 
 
 def get_cr3_version(page, page_width):
-    """Determine the CR3 from version.
+    """Determine the CR3 form version.
 
     The check is conducted by sampling if various pixels are black.
 
diff --git a/atd-etl/cris_import/utils/settings.py b/atd-etl/cris_import/utils/settings.py
index 6a3e91f7a..d80ee6e36 100644
--- a/atd-etl/cris_import/utils/settings.py
+++ b/atd-etl/cris_import/utils/settings.py
@@ -2,7 +2,7 @@
 
 """This is the bbox used to crop the crash diagram"""
 DIAGRAM_BBOX_PIXELS = {
-    "v1_small": (2589 * 0.263, 3531 * 0.263, 5001 * 0.263, 6048 * 0.263),
+    "v1_small": (681, 928, 1315, 1590),
     "v1_large": (2589, 3531, 5001, 6048),
     "v2_small": (658, 791, 1270, 1430),
     "v2_large": (2496, 3036, 4836, 5464),

From fc04990ae0a8befb1d7f66c6c708ee3923125bdc Mon Sep 17 00:00:00 2001
From: John Clary <john.clary@austintexas.gov>
Date: Wed, 28 Aug 2024 08:18:56 -0400
Subject: [PATCH 4/5] log instead of raise csv <> pdf count mismatch

---
 atd-etl/cris_import/cris_import.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/atd-etl/cris_import/cris_import.py b/atd-etl/cris_import/cris_import.py
index e94a9f3be..80ecdcfaf 100755
--- a/atd-etl/cris_import/cris_import.py
+++ b/atd-etl/cris_import/cris_import.py
@@ -66,11 +66,13 @@ def main(cli_args):
             # cause an unwanted failure
             logger.info("Skipping PDF processing because no CSV crashes were processed")
 
-        # if processing CSVs and PDFs, make sure the number of crashes matches the number of PDFs
+        # if processing CSVs and PDFs, check the the number of crashes matches the number of PDFs
+        # this used to raise and exception until the CRIS v28 release on August 26, 2024 which 
+        # resulted in some PDFs being excluded from extracts
         if cli_args.pdf and cli_args.csv:
             if records_processed["crashes"] != records_processed["pdfs"]:
-                raise Exception(
-                    "Mismatch between # of crashes processed vs PDFs. This should never happen!"
+                logger.warning(
+                    f"Warning: there was a mismatch between the # of CSV crashes processed ({records_processed['crashes']}) vs the CR3 PDFs processed ({records_processed['pdfs']})."
                 )
 
         if cli_args.s3_download and cli_args.s3_archive and not cli_args.skip_unzip:

From 891400c960ab318a8f87caf80e7fb7340765eaf7 Mon Sep 17 00:00:00 2001
From: John Clary <john.clary@austintexas.gov>
Date: Thu, 29 Aug 2024 08:13:41 -0400
Subject: [PATCH 5/5] update log entry after csvs are done

---
 atd-etl/cris_import/cris_import.py   | 24 ++++++++++++++++++++----
 atd-etl/cris_import/utils/graphql.py | 16 +++++++---------
 2 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/atd-etl/cris_import/cris_import.py b/atd-etl/cris_import/cris_import.py
index 80ecdcfaf..c66f3a782 100755
--- a/atd-etl/cris_import/cris_import.py
+++ b/atd-etl/cris_import/cris_import.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python
+from datetime import datetime, timezone
+
 from utils.cli import get_cli_args
-from utils.graphql import create_log_entry, set_log_entry_complete
+from utils.graphql import create_log_entry, update_log_entry
 from utils.logging import init_logger
 from utils.process_csvs import process_csvs
 from utils.process_pdfs import process_pdfs
@@ -51,6 +53,14 @@ def main(cli_args):
             csv_records_processed_dict = process_csvs(extract_dir)
             records_processed.update(csv_records_processed_dict)
 
+            # update the import log to capture # of csvs processed
+            update_log_entry(
+                log_entry_id=log_entry_id,
+                payload={
+                    "records_processed": records_processed,
+                },
+            )
+
         no_crashes_found = (
             True if cli_args.csv and records_processed["crashes"] == 0 else False
         )
@@ -67,7 +77,7 @@ def main(cli_args):
             logger.info("Skipping PDF processing because no CSV crashes were processed")
 
         # if processing CSVs and PDFs, check the the number of crashes matches the number of PDFs
-        # this used to raise and exception until the CRIS v28 release on August 26, 2024 which 
+        # this used to raise an exception until the CRIS v28 release on August 26, 2024 which
         # resulted in some PDFs being excluded from extracts
         if cli_args.pdf and cli_args.csv:
             if records_processed["crashes"] != records_processed["pdfs"]:
@@ -77,8 +87,14 @@ def main(cli_args):
 
         if cli_args.s3_download and cli_args.s3_archive and not cli_args.skip_unzip:
             archive_extract_zip(extract["s3_file_key"])
-        set_log_entry_complete(
-            log_entry_id=log_entry_id, records_processed=records_processed
+
+        # update the import log entry
+        update_log_entry(
+            log_entry_id=log_entry_id,
+            payload={
+                "records_processed": records_processed,
+                "completed_at": datetime.now(timezone.utc).isoformat(),
+            },
         )
 
 
diff --git a/atd-etl/cris_import/utils/graphql.py b/atd-etl/cris_import/utils/graphql.py
index 70e32c741..c7d9af0eb 100644
--- a/atd-etl/cris_import/utils/graphql.py
+++ b/atd-etl/cris_import/utils/graphql.py
@@ -1,4 +1,3 @@
-from datetime import datetime, timezone
 import os
 
 import requests
@@ -167,19 +166,18 @@ def create_log_entry(
     return data["insert__cris_import_log_one"]["id"]
 
 
-def set_log_entry_complete(*, log_entry_id, records_processed):
-    """Set the completed_at timestamp of a cris_activity_log record
+def update_log_entry(*, log_entry_id, payload):
+    """Update a CRIS import log record
 
     Args:
         log_entry_id (int): the log record ID
-        records_processed (dict): a dict with the number of records processed by table type.
-            E.g.: {"crashes": 0,"units": 0,"persons": 0,"charges": 0,"pdfs": 0}
+        payload (dict): the record values to update. typically a combination of:
+            - completed_at (str): the utc iso timestamp at which the import completed
+            - records_processed (dict): a dict with the number of records processed by table type.
+                E.g.: {"crashes": 0,"units": 0,"persons": 0,"charges": 0,"pdfs": 0}
     """
     variables = {
         "id": log_entry_id,
-        "data": {
-            "completed_at": datetime.now(timezone.utc).isoformat(),
-            "records_processed": records_processed,
-        },
+        "data": payload,
     }
     make_hasura_request(query=CRIS_IMPORT_LOG_UPDATE_MUTATION, variables=variables)