From adb1a4bb96af528f8e46b3f141390cab4368d0b7 Mon Sep 17 00:00:00 2001 From: John Clary Date: Tue, 27 Aug 2024 08:31:38 -0400 Subject: [PATCH 1/5] handle smaller cr3 pdf page size --- atd-etl/cris_import/utils/process_pdfs.py | 47 ++++++++++++++++------- atd-etl/cris_import/utils/settings.py | 5 ++- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/atd-etl/cris_import/utils/process_pdfs.py b/atd-etl/cris_import/utils/process_pdfs.py index 383679536..db5fa5a08 100644 --- a/atd-etl/cris_import/utils/process_pdfs.py +++ b/atd-etl/cris_import/utils/process_pdfs.py @@ -5,7 +5,7 @@ import time -from pdf2image import convert_from_path +from pdf2image import convert_from_path, pdfinfo_from_path from utils.graphql import UPDATE_CRASH_CR3_FIELDS, make_hasura_request from utils.logging import get_logger @@ -19,27 +19,42 @@ logger = get_logger() -def is_new_cr3_form(page): - """Determine if the CR3 is following the older or newer format. - +def get_cr3_version(page, page_width): + """Determine the CR3 from version. + The check is conducted by sampling if various pixels are black. 
Args: page (PIL image): the pdf page as an image + page_width (int): the width of the PDF in points Returns: - bool: true if the provided page passes the sampling tests + str: 'v1', 'v2_large', or 'v2_small' """ - new_cr3_form = True + if page_width < 700: + # the most recent version is the same layout as v2-large + # but it is delivered in a smaller page size + # this took effect 27 Aug 2024 + return "v2_small" + for pixel in NEW_CR3_FORM_TEST_PIXELS: rgb_pixel = page.getpixel(pixel) if rgb_pixel[0] != 0 or rgb_pixel[1] != 0 or rgb_pixel[2] != 0: - new_cr3_form = False - break - return new_cr3_form + # the PDF fails our pixel checks, so assume it's the + # earliest version + return "v1" + + return "v2_large" + + +def get_pdf_width(pdf_path): + """Return the width of the pdf in points""" + pdf_info = pdfinfo_from_path(pdf_path) + # parse width from a string that looks like '612 x 792 pts (letter)' + return int(pdf_info["Page size"].split(" ")[0]) -def crop_and_save_diagram(page, cris_crash_id, is_new_cr3_form, extract_dir): +def crop_and_save_diagram(page, cris_crash_id, bbox, extract_dir): """Crop out the crash diagram and save it to the local directory. 
The diagram is saved to /crash_diagrams/.jpeg @@ -47,14 +62,13 @@ def crop_and_save_diagram(page, cris_crash_id, is_new_cr3_form, extract_dir): Args: page (PIL image): the CR3 pdf page as an image cris_crash_id (int): the CRIS crash ID - is_new_cr3_form (bool): if the CR3 is in the 'new' format + bbox (tuple[int]): the bounding box pixels to crop extract_dir (str): the local directory in which to save the file Returns: str: diagram_full_path - the full path to the saved diagram, including it's name str: diagram_filename - the name of diagram file, .e.g 12345678.jpeg """ - bbox = DIAGRAM_BBOX_PIXELS["new"] if is_new_cr3_form else DIAGRAM_BBOX_PIXELS["old"] diagram_image = page.crop(bbox) diagram_filename = f"{cris_crash_id}.jpeg" diagram_full_path = os.path.join(extract_dir, "crash_diagrams", diagram_filename) @@ -79,7 +93,10 @@ def process_pdf(extract_dir, filename, s3_upload, index): logger.info(f"Processing {filename} ({index})") cris_crash_id = int(filename.replace(".pdf", "")) pdf_path = os.path.join(extract_dir, "crashReports", filename) + page_width = get_pdf_width(pdf_path) + logger.debug("Converting PDF to image...") + page = convert_from_path( pdf_path, fmt="jpeg", # jpeg is much faster than the default ppm fmt @@ -88,9 +105,13 @@ def process_pdf(extract_dir, filename, s3_upload, index): dpi=150, )[0] + cr3_version = get_cr3_version(page, page_width) + bbox = DIAGRAM_BBOX_PIXELS[cr3_version] + logger.debug("Cropping crash diagram...") + diagram_full_path, diagram_filename = crop_and_save_diagram( - page, cris_crash_id, is_new_cr3_form(page), extract_dir + page, cris_crash_id, bbox, extract_dir ) if s3_upload: diff --git a/atd-etl/cris_import/utils/settings.py b/atd-etl/cris_import/utils/settings.py index c4fba923f..1a33b5cea 100644 --- a/atd-etl/cris_import/utils/settings.py +++ b/atd-etl/cris_import/utils/settings.py @@ -2,8 +2,9 @@ """This is the bbox used to crop the crash diagram""" DIAGRAM_BBOX_PIXELS = { - "old": (2589, 3531, 5001, 6048), - "new": 
(2496, 3036, 4836, 5464), + "v1": (2589, 3531, 5001, 6048), + "v2_small": (658, 791, 1270, 1430), + "v2_large": (2496, 3036, 4836, 5464), } """If all four of these pixels are black, it is a 'new' CR3 pdf""" From 7afe88a78e1ef20d1475d7535e1d5f249ac7ebfe Mon Sep 17 00:00:00 2001 From: John Clary Date: Tue, 27 Aug 2024 11:26:42 -0400 Subject: [PATCH 2/5] handle new and legacy cr3 versions in both small and large --- atd-etl/cris_import/utils/process_pdfs.py | 23 ++++++++++++----------- atd-etl/cris_import/utils/settings.py | 23 ++++++++++++++++------- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/atd-etl/cris_import/utils/process_pdfs.py b/atd-etl/cris_import/utils/process_pdfs.py index db5fa5a08..81610b6a4 100644 --- a/atd-etl/cris_import/utils/process_pdfs.py +++ b/atd-etl/cris_import/utils/process_pdfs.py @@ -21,30 +21,31 @@ def get_cr3_version(page, page_width): """Determine the CR3 from version. - + The check is conducted by sampling if various pixels are black. + On August 27, 2024 CRIS started delivering all CR3s using a + smaller page size. This function was adapted to handle the + legacy large format page and the new smaller page size. 
+ Args: page (PIL image): the pdf page as an image page_width (int): the width of the PDF in points Returns: - str: 'v1', 'v2_large', or 'v2_small' + str: 'v1_small', 'v1_large','v2_large', or 'v2_small' """ - if page_width < 700: - # the most recent version is the same layout as v2-large - # but it is delivered in a smaller page size - # this took effect 27 Aug 2024 - return "v2_small" + page_size = "small" if page_width < 700 else "large" + test_pixels = NEW_CR3_FORM_TEST_PIXELS[page_size] - for pixel in NEW_CR3_FORM_TEST_PIXELS: + for pixel in test_pixels: rgb_pixel = page.getpixel(pixel) - if rgb_pixel[0] != 0 or rgb_pixel[1] != 0 or rgb_pixel[2] != 0: + if rgb_pixel[0] > 5 or rgb_pixel[1] > 5 or rgb_pixel[2] > 5: # the PDF fails our pixel checks, so assume it's the # earliest version - return "v1" + return f"v1_{page_size}" - return "v2_large" + return f"v2_{page_size}" def get_pdf_width(pdf_path): diff --git a/atd-etl/cris_import/utils/settings.py b/atd-etl/cris_import/utils/settings.py index 1a33b5cea..6a3e91f7a 100644 --- a/atd-etl/cris_import/utils/settings.py +++ b/atd-etl/cris_import/utils/settings.py @@ -2,18 +2,27 @@ """This is the bbox used to crop the crash diagram""" DIAGRAM_BBOX_PIXELS = { - "v1": (2589, 3531, 5001, 6048), + "v1_small": (2589 * 0.263, 3531 * 0.263, 5001 * 0.263, 6048 * 0.263), + "v1_large": (2589, 3531, 5001, 6048), "v2_small": (658, 791, 1270, 1430), "v2_large": (2496, 3036, 4836, 5464), } """If all four of these pixels are black, it is a 'new' CR3 pdf""" -NEW_CR3_FORM_TEST_PIXELS = [ - (215, 2567), - (872, 2568), - (625, 1806), - (4834, 279), -] +NEW_CR3_FORM_TEST_PIXELS = { + "small": [ + (115, 670), + (300, 670), + (165, 224), + (545, 224), + ], + "large": [ + (215, 2567), + (872, 2568), + (625, 1806), + (4834, 279), + ], +} CSV_UPLOAD_BATCH_SIZE = 1000 From 812aaec922ceff5967a59d4c9358f276063cef43 Mon Sep 17 00:00:00 2001 From: John Clary Date: Wed, 28 Aug 2024 08:15:35 -0400 Subject: [PATCH 3/5] fix typo and clean up bbox 
values --- atd-etl/cris_import/utils/process_pdfs.py | 2 +- atd-etl/cris_import/utils/settings.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/atd-etl/cris_import/utils/process_pdfs.py b/atd-etl/cris_import/utils/process_pdfs.py index 81610b6a4..f91a61e61 100644 --- a/atd-etl/cris_import/utils/process_pdfs.py +++ b/atd-etl/cris_import/utils/process_pdfs.py @@ -20,7 +20,7 @@ def get_cr3_version(page, page_width): - """Determine the CR3 from version. + """Determine the CR3 form version. The check is conducted by sampling if various pixels are black. diff --git a/atd-etl/cris_import/utils/settings.py b/atd-etl/cris_import/utils/settings.py index 6a3e91f7a..d80ee6e36 100644 --- a/atd-etl/cris_import/utils/settings.py +++ b/atd-etl/cris_import/utils/settings.py @@ -2,7 +2,7 @@ """This is the bbox used to crop the crash diagram""" DIAGRAM_BBOX_PIXELS = { - "v1_small": (2589 * 0.263, 3531 * 0.263, 5001 * 0.263, 6048 * 0.263), + "v1_small": (681, 928, 1315, 1590), "v1_large": (2589, 3531, 5001, 6048), "v2_small": (658, 791, 1270, 1430), "v2_large": (2496, 3036, 4836, 5464), From fc04990ae0a8befb1d7f66c6c708ee3923125bdc Mon Sep 17 00:00:00 2001 From: John Clary Date: Wed, 28 Aug 2024 08:18:56 -0400 Subject: [PATCH 4/5] log instead of raise csv <> pdf count mismatch --- atd-etl/cris_import/cris_import.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/atd-etl/cris_import/cris_import.py b/atd-etl/cris_import/cris_import.py index e94a9f3be..80ecdcfaf 100755 --- a/atd-etl/cris_import/cris_import.py +++ b/atd-etl/cris_import/cris_import.py @@ -66,11 +66,13 @@ def main(cli_args): # cause an unwanted failure logger.info("Skipping PDF processing because no CSV crashes were processed") - # if processing CSVs and PDFs, make sure the number of crashes matches the number of PDFs + # if processing CSVs and PDFs, check the the number of crashes matches the number of PDFs + # this used to raise and exception until the CRIS v28 release on 
August 26, 2024 which + # resulted in some PDFs being excluded from extracts if cli_args.pdf and cli_args.csv: if records_processed["crashes"] != records_processed["pdfs"]: - raise Exception( - "Mismatch between # of crashes processed vs PDFs. This should never happen!" + logger.warning( + f"Warning: there was a mismatch between the # of CSV crashes processed ({records_processed['crashes']}) vs the CR3 PDFs processed ({records_processed['pdfs']})." ) if cli_args.s3_download and cli_args.s3_archive and not cli_args.skip_unzip: From 891400c960ab318a8f87caf80e7fb7340765eaf7 Mon Sep 17 00:00:00 2001 From: John Clary Date: Thu, 29 Aug 2024 08:13:41 -0400 Subject: [PATCH 5/5] update log entry after csvs are done --- atd-etl/cris_import/cris_import.py | 24 ++++++++++++++++++++---- atd-etl/cris_import/utils/graphql.py | 16 +++++++--------- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/atd-etl/cris_import/cris_import.py b/atd-etl/cris_import/cris_import.py index 80ecdcfaf..c66f3a782 100755 --- a/atd-etl/cris_import/cris_import.py +++ b/atd-etl/cris_import/cris_import.py @@ -1,6 +1,8 @@ #!/usr/bin/env python +from datetime import datetime, timezone + from utils.cli import get_cli_args -from utils.graphql import create_log_entry, set_log_entry_complete +from utils.graphql import create_log_entry, update_log_entry from utils.logging import init_logger from utils.process_csvs import process_csvs from utils.process_pdfs import process_pdfs @@ -51,6 +53,14 @@ def main(cli_args): csv_records_processed_dict = process_csvs(extract_dir) records_processed.update(csv_records_processed_dict) + # update the import log to capture # of csvs processed + update_log_entry( + log_entry_id=log_entry_id, + payload={ + "records_processed": records_processed, + }, + ) + no_crashes_found = ( True if cli_args.csv and records_processed["crashes"] == 0 else False ) @@ -67,7 +77,7 @@ def main(cli_args): logger.info("Skipping PDF processing because no CSV crashes were processed") # if 
processing CSVs and PDFs, check the the number of crashes matches the number of PDFs - # this used to raise and exception until the CRIS v28 release on August 26, 2024 which + # this used to raise an exception until the CRIS v28 release on August 26, 2024 which # resulted in some PDFs being excluded from extracts if cli_args.pdf and cli_args.csv: if records_processed["crashes"] != records_processed["pdfs"]: @@ -77,8 +87,14 @@ def main(cli_args): if cli_args.s3_download and cli_args.s3_archive and not cli_args.skip_unzip: archive_extract_zip(extract["s3_file_key"]) - set_log_entry_complete( - log_entry_id=log_entry_id, records_processed=records_processed + + # update the import log entry + update_log_entry( + log_entry_id=log_entry_id, + payload={ + "records_processed": records_processed, + "completed_at": datetime.now(timezone.utc).isoformat(), + }, ) diff --git a/atd-etl/cris_import/utils/graphql.py b/atd-etl/cris_import/utils/graphql.py index 70e32c741..c7d9af0eb 100644 --- a/atd-etl/cris_import/utils/graphql.py +++ b/atd-etl/cris_import/utils/graphql.py @@ -1,4 +1,3 @@ -from datetime import datetime, timezone import os import requests @@ -167,19 +166,18 @@ def create_log_entry( return data["insert__cris_import_log_one"]["id"] -def set_log_entry_complete(*, log_entry_id, records_processed): - """Set the completed_at timestamp of a cris_activity_log record +def update_log_entry(*, log_entry_id, payload): + """Update a cris_activity_log record Args: log_entry_id (int): the log record ID - records_processed (dict): a dict with the number of records processed by table type. - E.g.: {"crashes": 0,"units": 0,"persons": 0,"charges": 0,"pdfs": 0} + payload (dict): the record values to update. Typically a combination of: + - completed_at (str): the utc iso timestamp at which the import completed + - records_processed (dict): a dict with the number of records processed by table type.
+ E.g.: {"crashes": 0,"units": 0,"persons": 0,"charges": 0,"pdfs": 0} """ variables = { "id": log_entry_id, - "data": { - "completed_at": datetime.now(timezone.utc).isoformat(), - "records_processed": records_processed, - }, + "data": payload, } make_hasura_request(query=CRIS_IMPORT_LOG_UPDATE_MUTATION, variables=variables)