doc: add documentation for getting started with the template

cape-ph · Sep 12, 2024 · 62dff36 · 62dff36
1 parent fd0afbf
commit 62dff36
Show file tree

Hide file tree

Showing 2 changed files with 65 additions and 38 deletions.
diff --git a/README.md b/README.md
@@ -1,2 +1,53 @@
-# etl-template
+# CAPE ETL Script Template
 
+This repository provides a template for getting started with ETL scripts for
+the CAPE system. The script provides the boilerplate for retrieving the raw
+data file passed into the transform and for uploading the cleaned data once it
+is created.
+
+## Usage
+
+1. Click the "Use this template" button at the top of the repository which will
+   guide you through creating a new repository from this template code.
+2. Update the `README.md` to describe the transformation
+3. Update the `main.py` file to make the necessary modifications as indicated by
+   the "TODO" comments in the code:
+
+    - Make necessary transformations to the data made available in the `raw`
+      variable and store it into the `cleaned` variable
+    - Update the `clean_key` variable to have the correct final filename for
+      the cleaned file
+
+4. Add any necessary extra dependencies to the `requirements.txt` file
+
+## Releasing
+
+Once the code is at a good point to start being used in the CAPE system we will
+need releases to be tagged and maintained. We have workflows built into the
+repository through GitHub Actions to help make this as easy as possible.
+
+If you navigate to the "Actions" tab at the top of the repository and go to the
+"Release" action, there is a button to "Run workflow". If you click this and run
+the workflow then the script will calculate the appropriate version, create a
+new tag, and create a new release in GitHub.
+
+> [!NOTE]
+> By default this is set up to follow a date-based versioning scheme in the
+> format `YYYY.MM.DD`. If multiple versions are tagged on the same day, a
+> revision number is appended to the end, such as `YYYY.MM.DD.1`.
+
+## Best Practices
+
+This repository comes bundled with the recommended CAPE practices such as type
+checking and format checking. It is recommended to use the following tools to
+make sure the tests pass:
+
+-   Use the `pyright` language server or the Python language support in VS Code
+    to properly check the types and validity of the code
+-   Use `black` and `isort` to format the code
+
+> [!TIP]
+> The repository comes with the necessary configuration for VS Code built in.
+> When you open the project, VS Code may prompt you to install the recommended
+> extensions. If you install all of them, the editor will configure itself to
+> follow the above practices.
diff --git a/main.py b/main.py
@@ -1,27 +1,19 @@
 """ETL script Template."""
 
 import sys
+from pathlib import Path
 
 import boto3 as boto3
 from awsglue.context import GlueContext
 from awsglue.utils import getResolvedOptions
 from pyspark.context import SparkContext
 
-# for our purposes here, the spark and glue context are only (currently) needed
-# to get the logger.
+# Initialize logging and context
 spark_ctx = SparkContext()
 glue_ctx = GlueContext(spark_ctx)
 logger = glue_ctx.get_logger()
 
-# TODO:
-#   - add error handling for the format of the document being incorrect
-#   - figure out how we want to name and namespace clean files (e.g. will we
-#     take the object key we're given, strip the extension and replace it with
-#     one for the new format, or will we do something else)
-#   - see what we can extract out of here to be useful for other ETLs. imagine
-#     we'd have a few different things that could be made into a reusable
-#     package
-
+# Evaluate parameters
 parameters = getResolvedOptions(
     sys.argv,
     [
@@ -30,67 +22,51 @@
         "CLEAN_BUCKET_NAME",
     ],
 )
-
 raw_bucket = parameters["RAW_BUCKET_NAME"]
 raw_key = parameters["ALERT_OBJ_KEY"]
 clean_bucket = parameters["CLEAN_BUCKET_NAME"]
 
-# NOTE: for now we'll take the alert object key and change out the file
-#       extension for the clean data (leaving all namespacing and such). this
-#       will probably need to change
-
-# NOTE: May need some creds here
+# Retrieve the raw file passed into the ETL script
+# Fail nicely if there is an error and log it
 s3_client = boto3.client("s3")
-
-# try to get the docx object from S3 and handle any error that would keep us
-# from continuing.
 response = s3_client.get_object(Bucket=raw_bucket, Key=raw_key)
-
 status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
 
 if status != 200:
     err = (
         f"ERROR - Could not get object {raw_key} from bucket "
         f"{raw_bucket}. ETL Cannot continue."
     )
-
     logger.error(err)
-
-    # NOTE: need to properly handle exception stuff here, and we probably want
-    #       this going somewhere very visible (e.g. SNS topic or a perpetual log
-    #       as someone will need to be made aware)
     raise Exception(err)
 
 logger.info(f"Obtained object {raw_key} from bucket {raw_bucket}.")
 
+# `raw` is the streaming body of the raw file; call `raw.read()` for its bytes
 raw = response.get("Body")
 
-# create the cleaned output as well as the name of the cleaned file
+# TODO: Clean the contents of the `raw` variable here and store the
+# cleaned content in the `cleaned` variable
 cleaned = None
 
-clean_key = None
+# TODO: Specify the name of the new clean file
 # We typically just want to replace the file extension with a new one
-# Below is a commented out example of this
-# clean_key = raw_key.replace(".xlsx", ".csv")
+# Below is an example of this; update it with the correct extension
+clean_key = str(Path(raw_key).with_suffix(".csv"))
 
-if cleaned is not None and clean_key is not None:
+# Put the new cleaned object into the clean bucket
+if cleaned is not None:
     response = s3_client.put_object(
         Bucket=clean_bucket, Key=clean_key, Body=cleaned
     )
-
     status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
 
     if status != 200:
         err = (
             f"ERROR - Could not write transformed data object {clean_key} "
             f"to bucket {clean_bucket}. ETL Cannot continue."
         )
-
         logger.error(err)
-
-        # NOTE: need to properly handle exception stuff here, and we probably
-        #       want this going somewhere very visible (e.g. SNS topic or a
-        #       perpetual log as someone will need to be made aware)
         raise Exception(err)
 
     logger.info(