# The following shell commands were converted from a Jupyter notebook.
# Prerequisites:
# - Install Kaggle CLI
# - Install AWS CLI v2
# - Install Helm
# - Install Kubectl
# - Optionally, install Go
# --- Provision the AWS infrastructure with Terraform ---
# NOTE: each sed command below is a template. Edit PLACEHOLDER in the command
# to the value you want before running it; run unchanged, it writes the literal
# text "PLACEHOLDER" into ./values-secret.tfvars.
# NOTE: "sed -i" with no backup-suffix argument is the GNU sed form; BSD/macOS
# sed expects "sed -i ''" instead.
# change to the local git directory
cd kubernetes-operator-roiergasias
# change to the infra/aws directory
cd infra/aws
# create a new terraform values override file: ./values-secret.tfvars
cp ./values.tfvars ./values-secret.tfvars
# set <PREFIX>: replace PLACEHOLDER in the following command,
# e.g. with "roiergasias" or "workflow"
sed -i 's|<PREFIX>|PLACEHOLDER|g' ./values-secret.tfvars
# set <ENVIRONMENT>: replace PLACEHOLDER in the following command,
# e.g. with "demo", "play", "poc", "dev" or "test"
sed -i 's|<ENVIRONMENT>|PLACEHOLDER|g' ./values-secret.tfvars
# set <REGION>: replace PLACEHOLDER in the following command,
# e.g. with "ap-southeast-2" for Sydney, "ap-southeast-1" for Singapore or "us-east-1" for North Virginia
# to list all available regions, run: aws ec2 describe-regions -o table
sed -i 's|<REGION>|PLACEHOLDER|g' ./values-secret.tfvars
# set <NODE1_COUNT>: replace PLACEHOLDER in the following command
# this is the node count for the primary node, i.e. "node1", of size t2.medium
sed -i 's|<NODE1_COUNT>|PLACEHOLDER|g' ./values-secret.tfvars
# set <NODE2_COUNT>: replace PLACEHOLDER in the following command
# this is the node count for the secondary node, i.e. "node2", of size t2.large
sed -i 's|<NODE2_COUNT>|PLACEHOLDER|g' ./values-secret.tfvars
# verify the ./values-secret.tfvars file by displaying its content
cat ./values-secret.tfvars
# check that every <...> token has been replaced with the value you chose
# if a correction is needed, update the file with the text editor 'nano' and press ctrl+x when you are done editing
nano ./values-secret.tfvars
# initialise terraform providers
terraform init
# execute infrastructure provisioning command
terraform apply -var-file=values-secret.tfvars
# get kubectl credentials
# replace <REGION>, <PREFIX> and <ENVIRONMENT> with the values you set above;
# left unedited, the shell treats the < and > characters as redirections
aws eks update-kubeconfig --region <REGION> --name <PREFIX>-<ENVIRONMENT>-eks01
# for e.g., aws eks update-kubeconfig --region ap-southeast-2 --name roiergasias-demo-eks01
# --- Build the Docker image for AWS and push it to Docker Hub ---
# Run only ONE of the two "cd" commands below, depending on where you start.
# change to the local git directory
cd kubernetes-operator-roiergasias
# or, if coming from previous steps then
cd ../..
# copy kaggle api credentials from ~/.kaggle
cp ~/.kaggle/kaggle.json cmd/
# copy aws cli credentials from ~/.aws
cp -RP ~/.aws cmd/.aws/
# set execute permissions to go binary
chmod +x cmd/linux/roiergasias
# build docker image (the build context is cmd/, which now holds the
# credentials copied above)
docker build -t roiergasias:aws -f cmd/Dockerfile-aws cmd
# re-tag local docker image: docker tag takes SOURCE and TARGET as two
# separate arguments
docker tag roiergasias:aws <REPOSITORY>/roiergasias:aws
# where, <REPOSITORY> is the docker hub repository name or docker hub username, for e.g.,
# (for a docker hub user named "jdoe")
# docker tag roiergasias:aws jdoe/roiergasias:aws
# login to docker hub
docker login
# push the docker image to docker hub
docker push <REPOSITORY>/roiergasias:aws
# where, <REPOSITORY> is the docker hub repository name or docker hub username, for e.g.,
# docker push jdoe/roiergasias:aws
# NOTE: Make sure the Docker Hub repository mentioned above is set to private, because the image contains your Kaggle API key credentials and AWS CLI credentials.
# Create a Kubernetes secret for the Docker Hub credentials (after pushing the Docker image for AWS as described above)
# create docker hub registry credentials (for pulling docker image pushed previously)
# installs or upgrades the "imagepullsecrets" release in the "roiergasias"
# namespace, creating the namespace if needed
# NOTE(review): helm's --repo flag expects a chart repository URL, but no URL is
# visible below (it appears to have been lost); restore it before running.
# NOTE(review): imagePullSecret.registryURL is empty below; presumably it should
# hold the container registry address - confirm.
# NOTE: a password passed via --set ends up in shell history and is visible in
# process listings.
helm upgrade -i --repo imagepullsecrets imagepullsecrets \
--version 3.0.0 \
--create-namespace -n roiergasias \
--set imagePullSecret.registryURL="" \
--set imagePullSecret.secretName="container-registry-secret" \
--set imagePullSecret.username="<USERNAME>" \
--set imagePullSecret.password="<PASSWORD>"
# where, <USERNAME> and <PASSWORD> are the credentials for login to docker hub
# Run the workflow via the Kubernetes operator (after provisioning the AWS infrastructure and creating the Kubernetes secret for the Docker Hub credentials as described above)
# --- Install the roiergasias Kubernetes operator ---
# change to the local git directory
cd kubernetes-operator-roiergasias
# install the Kubernetes operator (release "roiergasias-operator", chart
# "roiergasias-operator", chart version v0.1.2)
# NOTE(review): helm's --repo flag expects a chart repository URL, but none
# follows it below (it appears to have been lost); restore it before running.
helm install --repo \
--version v0.1.2 \
roiergasias-operator roiergasias-operator
# change to the examples/machine-learning/aws directory
cd examples/machine-learning/aws
# upload the workflow yaml and python script files to the s3 bucket
# assumes 'roiergasias' as <PREFIX> and 'demo' as <ENVIRONMENT> values
# otherwise, change to correct <PREFIX> and <ENVIRONMENT> values in "s3://<PREFIX>-<ENVIRONMENT>-s3b01/"
# NOTE(review): "aws s3 cp" takes a source and a destination, but each command
# below shows only the destination bucket; the local file names appear to have
# been lost. Add the file to upload before the s3:// URI in each command.
aws s3 cp s3://roiergasias-demo-s3b01/
aws s3 cp s3://roiergasias-demo-s3b01/
aws s3 cp s3://roiergasias-demo-s3b01/
# create a new helm chart values override file: ./helm/roiergasias-aws/values-secret.yaml
cp ./helm/roiergasias-aws/values.yaml ./helm/roiergasias-aws/values-secret.yaml
# update the values in ./helm/roiergasias-aws/values-secret.yaml using nano or vi editor
nano ./helm/roiergasias-aws/values-secret.yaml
# update "image" to be "<REPOSITORY>/roiergasias:aws"
# where, <REPOSITORY> is the docker hub repository name or docker hub username, for e.g.,
# "jdoe/roiergasias:aws" (for a docker hub user named "jdoe")
# NOTE(review): this comment originally said ":local", but the image built and
# pushed earlier in this file is tagged ":aws" - confirm which tag is intended.
# update "s3URI" to be "s3://<PREFIX>-<ENVIRONMENT>-s3b01/",
# where, <PREFIX> and <ENVIRONMENT> were set in the infra/aws/values-secret.tfvars, for e.g.,
# "s3://roiergasias-demo-s3b01/"
# update "enablePersistentVolume" to be either 0 (default) or 1 to turn OFF or ON the persistent volume from elastic file system (EFS)
# it gives persistence to the data written by steps in the workflow. Regardless of value, each sequential task syncs up to the S3 at the end of each stage.
# if "enablePersistentVolume" is set to 1 then, update "efsId" by running the following command and copying the second value from its output:
# aws --region <REGION> efs describe-file-systems --query 'FileSystems[*].[Name, FileSystemId]' --output text | grep <PREFIX>-<ENVIRONMENT>-efs01, for e.g.,
# aws --region ap-southeast-2 efs describe-file-systems --query 'FileSystems[*].[Name, FileSystemId]' --output text | grep roiergasias-demo-efs01
# render the helm chart template for roiergasias aws into a local manifest file,
# using the values override file prepared above
helm template \
-n roiergasias \
-f ./helm/roiergasias-aws/values-secret.yaml \
roiergasias-aws ./helm/roiergasias-aws >machine-learning-aws-manifest.yaml
# explore the contents of the machine-learning-aws-manifest.yaml
cat machine-learning-aws-manifest.yaml
# apply the manifest
kubectl apply -f machine-learning-aws-manifest.yaml
# browse workflow created by the manifest
kubectl get workflow -n roiergasias
# browse jobs created by the workflow
kubectl get jobs -n roiergasias
# the following jobs should be created one after another:
# roiergasias-aws-1-node1
# roiergasias-aws-2-node2
# roiergasias-aws-3-node2
# this is because of the split workflow (count = 3) spread into 2 nodes (node1 and node2)
# after all jobs are completed, browse pods created by the above jobs
kubectl get pods -n roiergasias
# check pod logs for the output and wait till the last one is completed
# replace <STRING_FROM_PREVIOUS_STEP> with the pod name suffix listed by the
# "kubectl get pods" command above; left unedited, the shell treats the < and >
# characters as redirections
kubectl logs roiergasias-aws-1-node1-<STRING_FROM_PREVIOUS_STEP> -n roiergasias
kubectl logs roiergasias-aws-2-node2-<STRING_FROM_PREVIOUS_STEP> -n roiergasias
kubectl logs roiergasias-aws-3-node2-<STRING_FROM_PREVIOUS_STEP> -n roiergasias
# check the contents of s3 bucket for output files like weatherAUS.csv, processed-weatherAUS.csv and ml-model.joblib
# assumes 'roiergasias' as <PREFIX> and 'demo' as <ENVIRONMENT> values
# otherwise, change to correct <PREFIX> and <ENVIRONMENT> values in "s3://<PREFIX>-<ENVIRONMENT>-s3b01/"
aws s3 ls s3://roiergasias-demo-s3b01
# delete the resources created from the manifest, then remove the local manifest file
kubectl delete -f machine-learning-aws-manifest.yaml
rm machine-learning-aws-manifest.yaml
# delete the roiergasias namespace (optional)
kubectl delete ns roiergasias
# uninstall the operator (optional)
helm uninstall roiergasias-operator
# Notice the sequence of actions performed by the operator for the split workflow:
# 1. Create config map 1 + job 1 for split workflow - "process data" on "node1"
# 2. Wait for job 1 to complete
# 3. Create config map 2 + job 2 for split workflow - "train model" on "node2"
# 4. Wait for job 2 to complete
# 5. Create config map 3 + job 3 for split workflow - "evaluate model" on "node2"
# 6. Wait for job 3 to complete
# --- De-provision the AWS infrastructure ---
# Run only ONE of the two "cd" commands below, depending on where you start.
# change to the local git directory
cd kubernetes-operator-roiergasias
# or, if coming from previous steps then
cd ../../..
# change to the infra/aws directory
cd infra/aws
# make sure your s3 bucket is empty (this deletes every object in the bucket)
# assumes 'roiergasias' as <PREFIX> and 'demo' as <ENVIRONMENT> values
aws s3 rm s3://roiergasias-demo-s3b01 --recursive
# execute infrastructure de-provisioning command
terraform destroy -var-file=values-secret.tfvars
# sometimes it fails the first time. If so, wait 5-10 mins and repeat the above command until it succeeds
# delete terraform related files
# NOTE: the globs match every entry in the current directory whose name starts
# with ".terraform" or "terraform"; make sure you are in infra/aws and that
# nothing you want to keep matches before running this
rm -rf .terraform* terraform*
# return to the local git directory
cd ../..