diff --git a/README.md b/README.md
index e1d90af..a09f84b 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,53 @@
-# etl-template
+# CAPE ETL Script Template
+
+This is a template for getting started with ETL scripts for the CAPE system.
+The script provides the boilerplate for retrieving the raw data file passed
+into the transform, as well as for uploading the cleaned data once it has
+been created.
+
+## Usage
+
+1. Click the "Use this template" button at the top of the repository, which
+   will guide you through creating a new repository from this template code.
+2. Update the `README.md` to describe the transformation.
+3. Update the `main.py` file to make the necessary modifications as indicated
+   by the "TODO" comments in the code:
+
+   - Make the necessary transformations to the data made available in the
+     `raw` variable and store the result in the `cleaned` variable (a brief
+     example sketch follows this list)
+   - Update the `clean_key` variable to have the correct final filename for
+     the cleaned file
+
+4. Add any extra dependencies to the `requirements.txt` file.
+
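+As a loose sketch of step 3 above (not part of the template, and assuming the
+raw file is an Excel workbook and that `pandas` and `openpyxl` have been added
+to `requirements.txt`), the transformation block in `main.py` might look like:
+
+```python
+import io
+
+import pandas as pd
+
+# `raw` is the streaming body the boilerplate retrieved from S3
+df = pd.read_excel(io.BytesIO(raw.read()))
+
+# ...drop, rename, or derive columns as needed for your dataset...
+
+# store the result and pick a matching extension for the clean file
+cleaned = df.to_csv(index=False).encode("utf-8")
+clean_key = str(Path(raw_key).with_suffix(".csv"))  # Path is imported by the template
+```
+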
+## Releasing
+
+Once the code is ready to start being used in the CAPE system, releases will
+need to be tagged and maintained. Workflows are built into the repository
+through GitHub Actions to make this as easy as possible.
+
+Navigate to the "Actions" tab at the top of the repository, open the "Release"
+action, and click "Run workflow". The workflow will calculate the appropriate
+version, create a new tag, and create a new release in GitHub.
+
+> [!NOTE]
+> By default this follows a date-based versioning scheme in the format
+> `YYYY.MM.DD`. If multiple versions are tagged on the same day, a revision
+> number is appended to the end, such as `YYYY.MM.DD.1`.
+
+## Best Practices
+
+This repository comes bundled with the recommended CAPE checks, such as type
+checking and format checking. Use the following tools to make sure the checks
+pass:
+
+- Use the `pyright` language server, or the Python language support in VS
+  Code, to check the types and validity of the code
+- Use `black` and `isort` to format the code
+
+> [!TIP]
+> The repository comes with the necessary VS Code configuration built in. When
+> you open the project, VS Code may prompt you to install the recommended
+> extensions. If you install all of them, the editor will configure itself to
+> follow the practices above.
diff --git a/main.py b/main.py
index 18a9045..475833b 100644
--- a/main.py
+++ b/main.py
@@ -1,27 +1,19 @@
 """ETL script Template."""
 
 import sys
+from pathlib import Path
 
 import boto3 as boto3
 from awsglue.context import GlueContext
 from awsglue.utils import getResolvedOptions
 from pyspark.context import SparkContext
 
-# for our purposes here, the spark and glue context are only (currently) needed
-# to get the logger.
+# Initialize logging and context
 spark_ctx = SparkContext()
 glue_ctx = GlueContext(spark_ctx)
 logger = glue_ctx.get_logger()
 
-# TODO:
-#  - add error handling for the format of the document being incorrect
-#  - figure out how we want to name and namespace clean files (e.g. will we
-#    take the object key we're given, strip the extension and replace it with
-#    one for the new format, or will we do something else)
-#  - see what we can extract out of here to be useful for other ETLs. imagine
-#    we'd have a few different things that could be made into a reusable
-#    package
-
+# Resolve the job parameters
 parameters = getResolvedOptions(
     sys.argv,
     [
@@ -30,22 +22,14 @@
         "CLEAN_BUCKET_NAME",
     ],
 )
-
 raw_bucket = parameters["RAW_BUCKET_NAME"]
 raw_key = parameters["ALERT_OBJ_KEY"]
 clean_bucket = parameters["CLEAN_BUCKET_NAME"]
 
-# NOTE: for now we'll take the alert object key and change out the file
-#       extension for the clean data (leaving all namespacing and such). this
-#       will probably need to change
-
-# NOTE: May need some creds here
+# Retrieve the raw file passed into the ETL script.
+# Fail cleanly and log the error if it cannot be fetched.
 s3_client = boto3.client("s3")
-
-# try to get the docx object from S3 and handle any error that would keep us
-# from continuing.
 response = s3_client.get_object(Bucket=raw_bucket, Key=raw_key)
-
 status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
 
 if status != 200:
@@ -53,31 +37,28 @@
         f"ERROR - Could not get object {raw_key} from bucket "
         f"{raw_bucket}. ETL Cannot continue."
     )
-
     logger.error(err)
-
-    # NOTE: need to properly handle exception stuff here, and we probably want
-    #       this going somewhere very visible (e.g. SNS topic or a perpetual log
-    #       as someone will need to be made aware)
     raise Exception(err)
 
 logger.info(f"Obtained object {raw_key} from bucket {raw_bucket}.")
 
+# `raw` holds the contents of the raw file passed into the script
 raw = response.get("Body")
 
-# create the cleaned output as well as the name of the cleaned file
+# TODO: Clean the contents of the `raw` variable here and store the
+#       cleaned content in the `cleaned` variable
 cleaned = None
-clean_key = None
 
+# TODO: Specify the name of the new clean file.
 # We typically just want to replace the file extension with a new one
-# Below is a commented out example of this
-# clean_key = raw_key.replace(".xlsx", ".csv")
+# Below is an example of this; update it with the correct extension
+clean_key = str(Path(raw_key).with_suffix(".csv"))
 
-if cleaned is not None and clean_key is not None:
+# Put the new cleaned object into the clean bucket
+if cleaned is not None:
     response = s3_client.put_object(
         Bucket=clean_bucket, Key=clean_key, Body=cleaned
     )
-
     status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
 
     if status != 200:
@@ -85,12 +66,7 @@
             f"ERROR - Could not write transformed data object {clean_key} "
             f"to bucket {clean_bucket}. ETL Cannot continue."
         )
-
         logger.error(err)
-
-        # NOTE: need to properly handle exception stuff here, and we probably
-        #       want this going somewhere very visible (e.g. SNS topic or a
-        #       perpetual log as someone will need to be made aware)
         raise Exception(err)
 
     logger.info(