Handle smaller cr3 pdf page size #1529

Merged
merged 5 commits on Aug 30, 2024
30 changes: 24 additions & 6 deletions atd-etl/cris_import/cris_import.py
@@ -1,6 +1,8 @@
#!/usr/bin/env python
from datetime import datetime, timezone

from utils.cli import get_cli_args
from utils.graphql import create_log_entry, set_log_entry_complete
from utils.graphql import create_log_entry, update_log_entry
from utils.logging import init_logger
from utils.process_csvs import process_csvs
from utils.process_pdfs import process_pdfs
@@ -51,6 +53,14 @@ def main(cli_args):
csv_records_processed_dict = process_csvs(extract_dir)
records_processed.update(csv_records_processed_dict)

# update the import log to capture # of csvs processed
update_log_entry(
log_entry_id=log_entry_id,
payload={
"records_processed": records_processed,
},
)
Member Author commented:
I added this so that the number of CSVs processed will be saved in the import log even if the PDF import fails. The records_processed value will look something like this:

{"pdfs": 0, "units": 82, "charges": 16, "crashes": 38, "persons": 85}

and the completed_at value will be null, indicating that the import is not complete.
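
For context, a minimal sketch of the two-stage update this enables, using the update_log_entry helper changed in utils/graphql.py below (the log ID and counts here are illustrative):

from datetime import datetime, timezone

from utils.graphql import update_log_entry

log_entry_id = 123  # hypothetical log record ID
records_processed = {"crashes": 38, "units": 82, "persons": 85, "charges": 16, "pdfs": 0}

# after CSV processing: persist the counts; completed_at stays null
update_log_entry(log_entry_id=log_entry_id, payload={"records_processed": records_processed})

# after PDF processing: persist the final counts and mark the import complete
records_processed["pdfs"] = 38
update_log_entry(
    log_entry_id=log_entry_id,
    payload={
        "records_processed": records_processed,
        "completed_at": datetime.now(timezone.utc).isoformat(),
    },
)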


no_crashes_found = (
True if cli_args.csv and records_processed["crashes"] == 0 else False
)
Expand All @@ -66,17 +76,25 @@ def main(cli_args):
# cause an unwanted failure
logger.info("Skipping PDF processing because no CSV crashes were processed")

# if processing CSVs and PDFs, make sure the number of crashes matches the number of PDFs
# if processing CSVs and PDFs, check that the number of crashes matches the number of PDFs
# this used to raise an exception until the CRIS v28 release on August 26, 2024, which
# resulted in some PDFs being excluded from extracts
if cli_args.pdf and cli_args.csv:
if records_processed["crashes"] != records_processed["pdfs"]:
raise Exception(
"Mismatch between # of crashes processed vs PDFs. This should never happen!"
logger.warning(
f"Warning: there was a mismatch between the # of CSV crashes processed ({records_processed['crashes']}) vs the CR3 PDFs processed ({records_processed['pdfs']})."
)

if cli_args.s3_download and cli_args.s3_archive and not cli_args.skip_unzip:
archive_extract_zip(extract["s3_file_key"])
set_log_entry_complete(
log_entry_id=log_entry_id, records_processed=records_processed

# update the import log entry
update_log_entry(
log_entry_id=log_entry_id,
payload={
"records_processed": records_processed,
"completed_at": datetime.now(timezone.utc).isoformat(),
},
)


16 changes: 7 additions & 9 deletions atd-etl/cris_import/utils/graphql.py
@@ -1,4 +1,3 @@
from datetime import datetime, timezone
import os

import requests
@@ -167,19 +166,18 @@ def create_log_entry(
return data["insert__cris_import_log_one"]["id"]


def set_log_entry_complete(*, log_entry_id, records_processed):
"""Set the completed_at timestamp of a cris_activity_log record
def update_log_entry(*, log_entry_id, payload):
"""Update a cris_activity_log record

Args:
log_entry_id (int): the log record ID
records_processed (dict): a dict with the number of records processed by table type.
E.g.: {"crashes": 0,"units": 0,"persons": 0,"charges": 0,"pdfs": 0}
payload (dict): the record values to update, typically a combination of:
- completed_at (str): the UTC ISO timestamp at which the import completed
- records_processed (dict): a dict with the number of records processed by table type.
E.g.: {"crashes": 0,"units": 0,"persons": 0,"charges": 0,"pdfs": 0}
"""
variables = {
"id": log_entry_id,
"data": {
"completed_at": datetime.now(timezone.utc).isoformat(),
"records_processed": records_processed,
},
"data": payload,
}
make_hasura_request(query=CRIS_IMPORT_LOG_UPDATE_MUTATION, variables=variables)
50 changes: 36 additions & 14 deletions atd-etl/cris_import/utils/process_pdfs.py
@@ -5,7 +5,7 @@

import time

from pdf2image import convert_from_path
from pdf2image import convert_from_path, pdfinfo_from_path

from utils.graphql import UPDATE_CRASH_CR3_FIELDS, make_hasura_request
from utils.logging import get_logger
@@ -19,42 +19,57 @@
logger = get_logger()


def is_new_cr3_form(page):
"""Determine if the CR3 is following the older or newer format.
def get_cr3_version(page, page_width):
"""Determine the CR3 form version.

The check is conducted by sampling whether various pixels are black.

On August 27, 2024, CRIS started delivering all CR3s using a
smaller page size. This function was adapted to handle both the
legacy large-format page and the new smaller page size.

Args:
page (PIL image): the pdf page as an image
page_width (int): the width of the PDF in points

Returns:
bool: true if the provided page passes the sampling tests
str: 'v1_small', 'v1_large', 'v2_large', or 'v2_small'
"""
new_cr3_form = True
for pixel in NEW_CR3_FORM_TEST_PIXELS:
page_size = "small" if page_width < 700 else "large"
test_pixels = NEW_CR3_FORM_TEST_PIXELS[page_size]

for pixel in test_pixels:
rgb_pixel = page.getpixel(pixel)
if rgb_pixel[0] != 0 or rgb_pixel[1] != 0 or rgb_pixel[2] != 0:
new_cr3_form = False
break
return new_cr3_form
if rgb_pixel[0] > 5 or rgb_pixel[1] > 5 or rgb_pixel[2] > 5:
@johnclary (Member Author) commented on Aug 27, 2024:
I was having trouble hitting pure black pixels in the smaller format, so I added this narrow tolerance of 0 to 5.
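
For reference, a standalone sketch of the near-black check described above; the helper name is mine, and the per-channel threshold of 5 mirrors the diff:

def is_near_black(rgb_pixel, tolerance=5):
    """Return True if all three RGB channels are within `tolerance` of pure black."""
    return all(channel <= tolerance for channel in rgb_pixel[:3])

# e.g. is_near_black((3, 1, 4)) -> True; is_near_black((12, 0, 0)) -> False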

# the PDF fails our pixel checks, so assume it's the
# earliest version
return f"v1_{page_size}"

return f"v2_{page_size}"


def get_pdf_width(pdf_path):
"""Return the width of the pdf in points"""
pdf_info = pdfinfo_from_path(pdf_path)
# parse width from a string that looks like '612 x 792 pts (letter)'
return int(pdf_info["Page size"].split(" ")[0])
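
For reference, a quick sketch of the pdfinfo_from_path call this relies on (pdf2image exposes poppler's pdfinfo output as a dict); the file path here is hypothetical:

from pdf2image import pdfinfo_from_path

info = pdfinfo_from_path("crashReports/12345678.pdf")  # hypothetical path
# info["Page size"] is a string like "612 x 792 pts (letter)"
width_pts = int(info["Page size"].split(" ")[0])  # -> 612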


def crop_and_save_diagram(page, cris_crash_id, is_new_cr3_form, extract_dir):
def crop_and_save_diagram(page, cris_crash_id, bbox, extract_dir):
"""Crop out the crash diagram and save it to the local directory.

The diagram is saved to <extract_dir>/crash_diagrams/<cris_crash_id>.jpeg

Args:
page (PIL image): the CR3 pdf page as an image
cris_crash_id (int): the CRIS crash ID
is_new_cr3_form (bool): if the CR3 is in the 'new' format
bbox (tuple[int]): the bounding box pixels to crop
extract_dir (str): the local directory in which to save the file

Returns:
str: diagram_full_path - the full path to the saved diagram, including its name
str: diagram_filename - the name of the diagram file, e.g. 12345678.jpeg
"""
bbox = DIAGRAM_BBOX_PIXELS["new"] if is_new_cr3_form else DIAGRAM_BBOX_PIXELS["old"]
diagram_image = page.crop(bbox)
diagram_filename = f"{cris_crash_id}.jpeg"
diagram_full_path = os.path.join(extract_dir, "crash_diagrams", diagram_filename)
@@ -79,7 +94,10 @@ def process_pdf(extract_dir, filename, s3_upload, index):
logger.info(f"Processing {filename} ({index})")
cris_crash_id = int(filename.replace(".pdf", ""))
pdf_path = os.path.join(extract_dir, "crashReports", filename)
page_width = get_pdf_width(pdf_path)

logger.debug("Converting PDF to image...")

page = convert_from_path(
pdf_path,
fmt="jpeg", # jpeg is much faster than the default ppm fmt
@@ -88,9 +106,13 @@ def process_pdf(extract_dir, filename, s3_upload, index):
dpi=150,
)[0]

cr3_version = get_cr3_version(page, page_width)
bbox = DIAGRAM_BBOX_PIXELS[cr3_version]

logger.debug("Cropping crash diagram...")

diagram_full_path, diagram_filename = crop_and_save_diagram(
page, cris_crash_id, is_new_cr3_form(page), extract_dir
page, cris_crash_id, bbox, extract_dir
)

if s3_upload:
26 changes: 18 additions & 8 deletions atd-etl/cris_import/utils/settings.py
@@ -2,17 +2,27 @@

"""This is the bbox used to crop the crash diagram"""
DIAGRAM_BBOX_PIXELS = {
"old": (2589, 3531, 5001, 6048),
"new": (2496, 3036, 4836, 5464),
"v1_small": (681, 928, 1315, 1590),
"v1_large": (2589, 3531, 5001, 6048),
"v2_small": (658, 791, 1270, 1430),
"v2_large": (2496, 3036, 4836, 5464),
}

"""If all four of these pixels are black, it is a 'new' CR3 pdf"""
NEW_CR3_FORM_TEST_PIXELS = [
(215, 2567),
(872, 2568),
(625, 1806),
(4834, 279),
]
NEW_CR3_FORM_TEST_PIXELS = {
"small": [
(115, 670),
(300, 670),
(165, 224),
(545, 224),
],
"large": [
(215, 2567),
(872, 2568),
(625, 1806),
(4834, 279),
],
}

CSV_UPLOAD_BATCH_SIZE = 1000
