Handle smaller cr3 pdf page size #1529

Merged
merged 5 commits on Aug 30, 2024
30 changes: 24 additions & 6 deletions atd-etl/cris_import/cris_import.py
@@ -1,6 +1,8 @@
#!/usr/bin/env python
from datetime import datetime, timezone

from utils.cli import get_cli_args
from utils.graphql import create_log_entry, set_log_entry_complete
from utils.graphql import create_log_entry, update_log_entry
from utils.logging import init_logger
from utils.process_csvs import process_csvs
from utils.process_pdfs import process_pdfs
@@ -51,6 +53,14 @@ def main(cli_args):
csv_records_processed_dict = process_csvs(extract_dir)
records_processed.update(csv_records_processed_dict)

# update the import log to capture # of csvs processed
update_log_entry(
log_entry_id=log_entry_id,
payload={
"records_processed": records_processed,
},
)
Member Author commented:
I added this so that the number of CSVs processed will be saved in the import log even if the PDF import fails. The records_processed value will look something like this:

{"pdfs": 0, "units": 82, "charges": 16, "crashes": 38, "persons": 85}

and the completed_at value will be null, indicating that the import is not complete.
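
For context, a minimal sketch of the two-stage update this enables, using the update_log_entry helper changed in utils/graphql.py below (the log ID and counts here are illustrative):

from datetime import datetime, timezone

from utils.graphql import update_log_entry

log_entry_id = 123  # hypothetical log record ID
records_processed = {"crashes": 38, "units": 82, "persons": 85, "charges": 16, "pdfs": 0}

# after CSV processing: persist the counts; completed_at stays null
update_log_entry(log_entry_id=log_entry_id, payload={"records_processed": records_processed})

# after PDF processing: persist the final counts and mark the import complete
records_processed["pdfs"] = 38
update_log_entry(
    log_entry_id=log_entry_id,
    payload={
        "records_processed": records_processed,
        "completed_at": datetime.now(timezone.utc).isoformat(),
    },
)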


no_crashes_found = (
True if cli_args.csv and records_processed["crashes"] == 0 else False
)
Expand All @@ -66,17 +76,25 @@ def main(cli_args):
# cause an unwanted failure
logger.info("Skipping PDF processing because no CSV crashes were processed")

# if processing CSVs and PDFs, make sure the number of crashes matches the number of PDFs
# if processing CSVs and PDFs, check that the number of crashes matches the number of PDFs
# this used to raise an exception until the CRIS v28 release on August 26, 2024, which
# resulted in some PDFs being excluded from extracts
if cli_args.pdf and cli_args.csv:
if records_processed["crashes"] != records_processed["pdfs"]:
raise Exception(
"Mismatch between # of crashes processed vs PDFs. This should never happen!"
logger.warning(
f"Warning: there was a mismatch between the # of CSV crashes processed ({records_processed['crashes']}) vs the CR3 PDFs processed ({records_processed['pdfs']})."
)

if cli_args.s3_download and cli_args.s3_archive and not cli_args.skip_unzip:
archive_extract_zip(extract["s3_file_key"])
set_log_entry_complete(
log_entry_id=log_entry_id, records_processed=records_processed

# update the import log entry
update_log_entry(
log_entry_id=log_entry_id,
payload={
"records_processed": records_processed,
"completed_at": datetime.now(timezone.utc).isoformat(),
},
)


16 changes: 7 additions & 9 deletions atd-etl/cris_import/utils/graphql.py
@@ -1,4 +1,3 @@
from datetime import datetime, timezone
import os

import requests
@@ -167,19 +166,18 @@ def create_log_entry(
return data["insert__cris_import_log_one"]["id"]


def set_log_entry_complete(*, log_entry_id, records_processed):
"""Set the completed_at timestamp of a cris_activity_log record
def update_log_entry(*, log_entry_id, payload):
"""Update a cris_activity_log record

Args:
log_entry_id (int): the log record ID
records_processed (dict): a dict with the number of records processed by table type.
E.g.: {"crashes": 0,"units": 0,"persons": 0,"charges": 0,"pdfs": 0}
payload (dict): the record values to update, typically a combination of:
- completed_at (str): the UTC ISO timestamp at which the import completed
- records_processed (dict): a dict with the number of records processed by table type.
E.g.: {"crashes": 0,"units": 0,"persons": 0,"charges": 0,"pdfs": 0}
"""
variables = {
"id": log_entry_id,
"data": {
"completed_at": datetime.now(timezone.utc).isoformat(),
"records_processed": records_processed,
},
"data": payload,
}
make_hasura_request(query=CRIS_IMPORT_LOG_UPDATE_MUTATION, variables=variables)
50 changes: 36 additions & 14 deletions atd-etl/cris_import/utils/process_pdfs.py
@@ -5,7 +5,7 @@

import time

from pdf2image import convert_from_path
from pdf2image import convert_from_path, pdfinfo_from_path

from utils.graphql import UPDATE_CRASH_CR3_FIELDS, make_hasura_request
from utils.logging import get_logger
@@ -19,42 +19,57 @@
logger = get_logger()


def is_new_cr3_form(page):
"""Determine if the CR3 is following the older or newer format.
def get_cr3_version(page, page_width):
"""Determine the CR3 form version.

The check is conducted by sampling whether various pixels are black.

On August 27, 2024, CRIS started delivering all CR3s using a
smaller page size. This function was adapted to handle both the
legacy large-format page and the new smaller page size.

Args:
page (PIL image): the pdf page as an image
page_width (int): the width of the PDF in points

Returns:
bool: true if the provided page passes the sampling tests
str: 'v1_small', 'v1_large', 'v2_large', or 'v2_small'
"""
new_cr3_form = True
for pixel in NEW_CR3_FORM_TEST_PIXELS:
page_size = "small" if page_width < 700 else "large"
test_pixels = NEW_CR3_FORM_TEST_PIXELS[page_size]

for pixel in test_pixels:
rgb_pixel = page.getpixel(pixel)
if rgb_pixel[0] != 0 or rgb_pixel[1] != 0 or rgb_pixel[2] != 0:
new_cr3_form = False
break
return new_cr3_form
if rgb_pixel[0] > 5 or rgb_pixel[1] > 5 or rgb_pixel[2] > 5:
@johnclary (Member Author) commented on Aug 27, 2024:
I was having trouble hitting pure black pixels in the smaller format, so I added this narrow tolerance of 0 to 5.
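
For reference, a standalone sketch of the near-black check described above; the helper name is mine, and the per-channel threshold of 5 mirrors the diff:

def is_near_black(rgb_pixel, tolerance=5):
    """Return True if all three RGB channels are within `tolerance` of pure black."""
    return all(channel <= tolerance for channel in rgb_pixel[:3])

# e.g. is_near_black((3, 1, 4)) -> True; is_near_black((12, 0, 0)) -> False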

# the PDF fails our pixel checks, so assume it's the
# earliest version
return f"v1_{page_size}"

return f"v2_{page_size}"


def get_pdf_width(pdf_path):
"""Return the width of the pdf in points"""
pdf_info = pdfinfo_from_path(pdf_path)
# parse width from a string that looks like '612 x 792 pts (letter)'
return int(pdf_info["Page size"].split(" ")[0])
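
For reference, a quick sketch of the pdfinfo_from_path call this relies on (pdf2image exposes poppler's pdfinfo output as a dict); the file path here is hypothetical:

from pdf2image import pdfinfo_from_path

info = pdfinfo_from_path("crashReports/12345678.pdf")  # hypothetical path
# info["Page size"] is a string like "612 x 792 pts (letter)"
width_pts = int(info["Page size"].split(" ")[0])  # -> 612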


def crop_and_save_diagram(page, cris_crash_id, is_new_cr3_form, extract_dir):
def crop_and_save_diagram(page, cris_crash_id, bbox, extract_dir):
"""Crop out the crash diagram and save it to the local directory.

The diagram is saved to <extract_dir>/crash_diagrams/<cris_crash_id>.jpeg

Args:
page (PIL image): the CR3 pdf page as an image
cris_crash_id (int): the CRIS crash ID
is_new_cr3_form (bool): if the CR3 is in the 'new' format
bbox (tuple[int]): the bounding box pixels to crop
extract_dir (str): the local directory in which to save the file

Returns:
str: diagram_full_path - the full path to the saved diagram, including its name
str: diagram_filename - the name of the diagram file, e.g. 12345678.jpeg
"""
bbox = DIAGRAM_BBOX_PIXELS["new"] if is_new_cr3_form else DIAGRAM_BBOX_PIXELS["old"]
diagram_image = page.crop(bbox)
diagram_filename = f"{cris_crash_id}.jpeg"
diagram_full_path = os.path.join(extract_dir, "crash_diagrams", diagram_filename)
@@ -79,7 +94,10 @@ def process_pdf(extract_dir, filename, s3_upload, index):
logger.info(f"Processing {filename} ({index})")
cris_crash_id = int(filename.replace(".pdf", ""))
pdf_path = os.path.join(extract_dir, "crashReports", filename)
page_width = get_pdf_width(pdf_path)

logger.debug("Converting PDF to image...")

page = convert_from_path(
pdf_path,
fmt="jpeg", # jpeg is much faster than the default ppm fmt
@@ -88,9 +106,13 @@ def process_pdf(extract_dir, filename, s3_upload, index):
dpi=150,
)[0]

cr3_version = get_cr3_version(page, page_width)
bbox = DIAGRAM_BBOX_PIXELS[cr3_version]

logger.debug("Cropping crash diagram...")

diagram_full_path, diagram_filename = crop_and_save_diagram(
page, cris_crash_id, is_new_cr3_form(page), extract_dir
page, cris_crash_id, bbox, extract_dir
)

if s3_upload:
26 changes: 18 additions & 8 deletions atd-etl/cris_import/utils/settings.py
@@ -2,17 +2,27 @@

"""This is the bbox used to crop the crash diagram"""
DIAGRAM_BBOX_PIXELS = {
"old": (2589, 3531, 5001, 6048),
"new": (2496, 3036, 4836, 5464),
"v1_small": (681, 928, 1315, 1590),
"v1_large": (2589, 3531, 5001, 6048),
"v2_small": (658, 791, 1270, 1430),
"v2_large": (2496, 3036, 4836, 5464),
}

"""If all four of these pixels are black, it is a 'new' CR3 pdf"""
NEW_CR3_FORM_TEST_PIXELS = [
(215, 2567),
(872, 2568),
(625, 1806),
(4834, 279),
]
NEW_CR3_FORM_TEST_PIXELS = {
"small": [
(115, 670),
(300, 670),
(165, 224),
(545, 224),
],
"large": [
(215, 2567),
(872, 2568),
(625, 1806),
(4834, 279),
],
}

CSV_UPLOAD_BATCH_SIZE = 1000
