Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Return of the OCR'd crash narrative #1568

Merged
merged 29 commits into from
Oct 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
2d72e03
restore old files - wip
johnclary Sep 26, 2024
3703029
wip ocr narrative extract within cris_import etl
johnclary Sep 26, 2024
4af186d
remove files that were briefly restored
johnclary Sep 26, 2024
b1abc9c
remove jpeg
johnclary Sep 26, 2024
f035461
create migrations for new col
johnclary Sep 26, 2024
0280a9d
use db view for this
johnclary Sep 26, 2024
5738208
add migration to use old narrative when new is null
johnclary Sep 27, 2024
c9d605d
update update statement
johnclary Sep 27, 2024
18535b0
more logging
johnclary Sep 27, 2024
8df55db
add comment
johnclary Sep 27, 2024
4d10547
thread it
johnclary Sep 27, 2024
514ce8b
refine download file
johnclary Sep 27, 2024
a884e99
rework pixel tests and handle narrative-related tasks
johnclary Sep 27, 2024
4a7ad2a
cleanup unused objects
johnclary Sep 27, 2024
9c4628f
comments
johnclary Sep 27, 2024
8dfc6b7
Merge branch 'jc-download-db-data' into 19204-ocr-narrative
johnclary Sep 30, 2024
a772af1
add docs and rename env to bucket_env
johnclary Sep 30, 2024
7f93f3d
typos
johnclary Sep 30, 2024
a704bc3
interrobang + comment
johnclary Sep 30, 2024
ea5352c
remove cr3 v0 test
johnclary Sep 30, 2024
ee3bff7
lint
johnclary Sep 30, 2024
004b670
comment out bulk narrative update
johnclary Sep 30, 2024
6bfcf11
add ocr script usage example
johnclary Sep 30, 2024
cb550ae
update env var
johnclary Sep 30, 2024
066be37
rename env var and doctring
johnclary Sep 30, 2024
2c3f07c
doc string and var passing
johnclary Sep 30, 2024
5631bac
use a better exception
johnclary Sep 30, 2024
bf27634
add jpeg to .gitignore
johnclary Oct 1, 2024
ff113ec
rename env to bucket_env in template
johnclary Oct 2, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
table:
name: view_crash_narratives_ocr_todo
schema: public
1 change: 1 addition & 0 deletions database/metadata/databases/default/tables/tables.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,4 @@
- "!include public_units.yaml"
- "!include public_units_cris.yaml"
- "!include public_units_edits.yaml"
- "!include public_view_crash_narratives_ocr_todo.yaml"
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
-- down migration: reverts the OCR-narrative schema changes made in up.sql.
-- the view must be dropped first because it filters on the
-- investigator_narrative_ocr_processed_at column removed below.
drop view if exists view_crash_narratives_ocr_todo;

alter table crashes_edits
drop column investigator_narrative_ocr_processed_at;

alter table crashes
drop column investigator_narrative_ocr_processed_at;
39 changes: 39 additions & 0 deletions database/migrations/default/1727375054583_narrative_ocr/up.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
-- up migration: add OCR bookkeeping columns and a "todo" view that drives
-- the cr3_ocr_narrative ETL.
alter table crashes_edits
add column investigator_narrative_ocr_processed_at timestamp with time zone;

alter table crashes
add column investigator_narrative_ocr_processed_at timestamp with time zone;

comment on column crashes_edits.investigator_narrative_ocr_processed_at is 'The most recent
timestamp at which the OCR process attempted to extract the investigator narrative. If null,
indicates that the OCR narrative extract has never been attempted. This value should be set
via ETL process.';

comment on column crashes.investigator_narrative_ocr_processed_at is 'The most recent
timestamp at which the OCR process attempted to extract the investigator narrative. If null,
indicates that the OCR narrative extract has never been attempted. This value should be set
via ETL process on the crashes_edits table.';

-- crashes that have a stored CR3 pdf but no narrative, and which have either
-- never been OCR'd or have received a newer CR3 since the last OCR attempt
create or replace view view_crash_narratives_ocr_todo as (
    select
        id,
        cris_crash_id
    from
        crashes
    where
        cr3_stored_fl = TRUE
        and investigator_narrative is NULL
        and (
            investigator_narrative_ocr_processed_at is NULL
            -- re-OCR when a newer CR3 pdf has been processed since the last attempt
            or cr3_processed_at >= investigator_narrative_ocr_processed_at
        )
        -- this issue started in Sep 2024
        -- we do not OCR very old crashes
        and updated_at > '2024-09-01'
    order by
        cr3_processed_at asc,
        id asc
);

comment on view view_crash_narratives_ocr_todo is 'View which lists crashes which need to
be processed by the OCR narrative extraction ETL';
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
-- down migration: remove the narrative-preservation trigger.
-- the trigger must be dropped before the function it executes.
drop trigger if exists
crashes_cris_preserve_investigator_narrative_on_update on crashes_cris;

drop function if exists
public.crashes_cris_set_old_investigator_narrative;
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
-- trigger function: carry the previous investigator_narrative forward when an
-- incoming CRIS update would null it out (see comment on function below)
create or replace function public.crashes_cris_set_old_investigator_narrative()
returns trigger
language plpgsql
as $function$
begin
    new.investigator_narrative = old.investigator_narrative;
    return new;
end;
$function$;

-- the WHEN clause limits the trigger to the buggy case: a non-null narrative
-- being overwritten with null
create or replace trigger crashes_cris_preserve_investigator_narrative_on_update
before update on public.crashes_cris
for each row
when (
    new.investigator_narrative is null and old.investigator_narrative is not null
)
execute procedure public.crashes_cris_set_old_investigator_narrative();

comment on function public.crashes_cris_set_old_investigator_narrative is 'Sets the
investigator_narrative to its previous value if the updated value is null. This
trigger function addresses a known CRIS bug in which updated crash records are
missing the investigator narrative. It is tracked via DTS issue
https://github.com/cityofaustin/atd-data-tech/issues/18971 and CRIS ticket #854366';

--
-- backfill narratives which have been erased
-- run this manually to prevent migration timeout
--

-- update
--     crashes_cris
-- set
--     investigator_narrative = updates_todo.investigator_narrative_old
-- from (select
--         record_id as crash_pk,
--         crashes.investigator_narrative as investigator_narrative_new,
--         record_json -> 'old' ->> 'investigator_narrative' as investigator_narrative_old
--     from
--         change_log_crashes_cris as changes
--         left join crashes on changes.record_id = crashes.id
--     where
--         record_json -> 'old' ->> 'investigator_narrative' is not null
--         and record_json -> 'new' ->> 'investigator_narrative' is null
--         and operation_type = 'UPDATE'
--         and changes.created_at > '2024-09-09'
--         and changes.created_by = 'cris'
--     order by
--         changes.id asc) as updates_todo
-- where
--     crashes_cris.id = updates_todo.crash_pk;
1 change: 1 addition & 0 deletions etl/cris_import/.gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
extracts/*
!extracts/*.md
*.zip
*.jpeg
2 changes: 1 addition & 1 deletion etl/cris_import/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ COPY . /app
RUN apt-get -y update
# we need p7zip-full to extract CRIS zips
# poppler-utils as a pdf2image dependency to snip crash diagrams out of pdfs
RUN apt-get install -y p7zip-full poppler-utils
RUN apt-get install -y p7zip-full poppler-utils tesseract-ocr
RUN chmod -R 755 /app/*
RUN chmod +x /app/cris_import.py
RUN pip install -r requirements.txt
39 changes: 26 additions & 13 deletions etl/cris_import/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,23 @@

This ETL manages the processing and importing of TxDOT CRIS data into the Vision Zero database.

All data processing is managed by a single script, `cris_import.py` which processes both CSV files and CR3 PDF crash reports. The script supports a number of CLI arguments to handle a number of data processing scenarios.
The following scripts are available:

## Quick start
- **`cris_import.py`**: the primary data import script which processes both CSV files and CR3 PDF crash reports.
- **`cr3_ocr_narrative.py`**: a utility script which extracts crash narrative data from CR3 PDFs using Optical Character Recognition (OCR).
- **`_restore_zips_from_archive.py`**: a development helper script that moves CRIS extract zips from `./archive` to `./inbox` in the S3 bucket.

Set your `ENV` to `dev` in order to safely run the S3 operations locally.
## Quick start - CRIS import

1. Start your local Vision Zero cluster (database + Hasura + editor).

2. Save a copy of the `env_template` file as `.env`, and fill in the details.
2. Save a copy of the `env_template` file as `.env`, and fill in the details. Set the `BUCKET_ENV` variable to `dev` in order to safely run the S3 operations locally.
Copy link
Member Author

@johnclary johnclary Sep 30, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i decided to change the env var ENV to BUCKET_ENV, just to be super clear that only S3 operations are affected by this variable. For example, you could set BUCKET_ENV to prod and still point your hasura endpoint/secret to local, and vice-versa.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this change — what do you think about updating the env_template with the new var name?


3. Build and run the docker image. This will drop you into the docker image's shell:

```shell
$ docker compose build # <- only need to do this once
$ docker compose run cris_import
docker compose build # <- do this once, and when dependencies change
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🙏

docker compose run cris_import
```

4. Run the CRIS import script. This will download any extracts available in S3, load the CSV crash records into the database, crop crash diagrams out of the CR3 PDFs, and upload the CR3 pdfs and crash diagrams to the s3 bucket.
Expand All @@ -30,26 +32,27 @@ $ ./cris_import.py --s3-download --s3-upload --csv --pdf

Create your environment by saving a copy of the `env_template` file as `.env`. The template includes default values for local development. See the password store for more details.

All interactions with AWS S3 occur with against a single bucket which has subdirectores for the `dev` and `prod` environments. If you set your `ENV` to `dev` you can safely run this ETL's S3 operations.
All interactions with AWS S3 occur against a single bucket which has subdirectories for the `dev` and `prod` environments. If you set your `BUCKET_ENV` to `dev` you can safely run this ETL's S3 operations.

```
ENV=dev
HASURA_GRAPHQL_ENDPOINT="http://localhost:8084/v1/graphql"
BUCKET_ENV=dev
HASURA_GRAPHQL_ENDPOINT=http://localhost:8084/v1/graphql
HASURA_GRAPHQL_ADMIN_SECRET=hasurapassword
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
BUCKET_NAME=
EXTRACT_PASSWORD=
```

## Usage examples
## CRIS Import - `cris_import.py`

The only script that should be run directly is `cris_import.py`. It supports the following CLI args:
This is the primary data import script which processes both CSV files and CR3 PDF crash reports. It supports a number of CLI args, with usage examples described below.

```shell
--csv Process CSV files. At least one of --csv or --pdf is required.
--pdf Process CR3 pdfs. At least one of --csv or --pdf is required.
--s3-download Source zip extracts from S3 bucket
--s3-upload Upload cr3 pdfs and digrams to S3 bucket
--s3-upload Upload cr3 pdfs and diagrams to S3 bucket
--s3-archive If using --s3-download, move the processed extracts from ./inbox to ./archive when done
--skip-unzip Only process files that are already unzipped in the local directory
--verbose, -v Sets logging level to DEBUG mode
Expand All @@ -69,7 +72,7 @@ $ ./cris_import.py --s3-download --s3-upload --s3-archive --csv --pdf --workers

### Local import

Process any extract zips in your local `./extracts` directory. CSVs will be loaded ino the db, and crash diagrams will be extracted but not uploaded to S3.
Process any extract zips in your local `./extracts` directory. CSVs will be loaded into the db, and crash diagrams will be extracted but not uploaded to S3.

```shell
$ ./cris_import.py --csv --pdf
Expand All @@ -92,3 +95,13 @@ $ ./cris_import.py --skip-unzip --csv
# process pdfs with more workers and in debug mode
$ ./cris_import.py --pdf --skip-unzip --s3-upload --workers 8 --verbose
```

## CR3 Narrative Extraction - `cr3_ocr_narrative.py`

This utility script extracts crash narrative data from CR3 PDFs using Optical Character Recognition (OCR). Although CRIS provides an `investigator_narrative` column, it is often blank due to an unknown CRIS issue tracked [here](https://github.com/cityofaustin/atd-data-tech/issues/18971).

It supports the `--verbose` flag to enable debug logging, and the number of concurrent workers can be set with the `--workers` flag.

```shell
./cr3_ocr_narrative.py --verbose --workers 4
```
2 changes: 1 addition & 1 deletion etl/cris_import/_restore_zips_from_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
)

BUCKET_NAME = os.environ["BUCKET_NAME"]
ENV = os.environ["ENV"]
ENV = os.environ["BUCKET_ENV"]


def main():
Expand Down
115 changes: 115 additions & 0 deletions etl/cris_import/cr3_ocr_narrative.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#!/usr/bin/env python
from datetime import datetime, timezone
from concurrent.futures import ProcessPoolExecutor

from pdf2image import convert_from_bytes
from pytesseract import image_to_string, image_to_data

from utils.cli import get_cli_args
from utils.files import download_cr3_pdf
from utils.graphql import (
make_hasura_request,
NARRATIVES_TODO_QUERY,
UPDATE_CRASH_NARRATIVE_OCR_MUTATION,
)
from utils.logging import init_logger
from utils.process_pdfs import get_cr3_version
from utils.settings import NARRATIVE_BBOX_PIXELS


def extract_narrative_pdf(cris_crash_id, crash_pk, index):
    """Extract the investigator narrative from one CR3 PDF via OCR and save
    it to the crash record through the Hasura API.

    Args:
        cris_crash_id (int): the CRIS crash ID
        crash_pk (int): the crash ID in the VZ database
        index (int): a unique id which captures the position of this
            item in the list of narratives being processed by the
            script. it enables better error logging during concurrency

    Raises:
        KeyError: if the detected CR3 version has no configured narrative
            bounding box in NARRATIVE_BBOX_PIXELS
    """
    logger.info(f"Processing cris crash ID {cris_crash_id} ({index})")
    pdf = download_cr3_pdf(cris_crash_id)

    logger.debug("Converting PDF to image...")
    # only render page 2, which holds the crash narrative
    page = convert_from_bytes(
        pdf,
        fmt="jpeg",  # jpeg is much faster than the default ppm fmt
        first_page=2,
        last_page=2,
        dpi=150,
    )[0]

    cr3_version = get_cr3_version(page)
    logger.debug(f"CR3 version: {cr3_version}")

    # uncomment to save a copy of the pdf page image
    # page.save(f"temp/{cris_crash_id}_page_{cr3_version}.jpeg")

    # the narrative's pixel coordinates vary by CR3 form version
    bbox = NARRATIVE_BBOX_PIXELS[cr3_version]

    logger.debug("Cropping narrative from PDF...")
    narrative_image = page.crop(bbox)

    # uncomment to save a copy of the cropped narrative image
    # narrative_image.save(f"temp/{cris_crash_id}_narrative_{cr3_version}.jpeg")

    logger.debug("Extracting narrative text...")
    # deliberately not wrapped in try/except: we want OCR failures to surface
    # loudly rather than be silently skipped
    narrative = image_to_string(narrative_image, lang="eng")
    logger.debug(f"Extracted narrative: {narrative}")

    variables = {
        "id": crash_pk,
        "updates": {
            "investigator_narrative": narrative,
            # record the attempt time so the todo view excludes this crash
            # until a newer CR3 arrives
            "investigator_narrative_ocr_processed_at": datetime.now(
                timezone.utc
            ).isoformat(),
            "updated_by": "dts_automation",
        },
    }

    logger.debug("Updating crash record...")
    make_hasura_request(
        query=UPDATE_CRASH_NARRATIVE_OCR_MUTATION, variables=variables
    )

def main(cli_args):
    """Fetch the list of crashes needing narrative OCR and process them
    concurrently.

    Args:
        cli_args (argparse.Namespace): parsed CLI args; `workers` sets the
            process pool size and `verbose` controls debug logging

    Raises:
        Exception: re-raises the first error encountered while processing
            any individual crash, after logging up to 10 of them
    """
    logger.info("Downloading crashes todo...")
    todos = make_hasura_request(query=NARRATIVES_TODO_QUERY)[
        "view_crash_narratives_ocr_todo"
    ]

    logger.info(f"{len(todos)} to process")
    if not todos:
        return

    futures = []
    # fan each crash out to a worker process; OCR is CPU-bound so processes
    # (not threads) are used
    with ProcessPoolExecutor(max_workers=cli_args.workers) as executor:
        for index, crash in enumerate(todos):
            future = executor.submit(
                extract_narrative_pdf, crash["cris_crash_id"], crash["id"], index
            )
            futures.append(future)
        errors = []
        # futures were appended in todos order, so index maps back to the crash
        for index, future in enumerate(futures):
            try:
                future.result()
            except Exception as e:
                crash = todos[index]
                errors.append([crash["cris_crash_id"], e])

    # matches the error-handling behavior of the crash diagram extractor:
    # log a sample of failures, then fail the run with the first one
    if errors:
        logger.error(
            f"Encountered {len(errors)} error(s) extracting narratives. Logging up to 10 errors and raising the first..."
        )
        for cris_crash_id, e in errors[:10]:
            # log at error level — these are failures, not status updates
            logger.error(f"Error processing CRIS crash ID {cris_crash_id}: {e}")
        raise errors[0][1]


if __name__ == "__main__":
    cli_args = get_cli_args()
    logger = init_logger(debug=cli_args.verbose)
    main(cli_args)
4 changes: 2 additions & 2 deletions etl/cris_import/env_template
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
ENV=dev
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
BUCKET_ENV=dev
BUCKET_NAME=
EXTRACT_PASSWORD=
HASURA_GRAPHQL_ENDPOINT="http://localhost:8084/v1/graphql"
HASURA_GRAPHQL_ENDPOINT=http://localhost:8084/v1/graphql
HASURA_GRAPHQL_ADMIN_SECRET=hasurapassword
1 change: 1 addition & 0 deletions etl/cris_import/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
boto3==1.*
pdf2image==1.*
pytesseract==0.*
requests==2.*
Loading