snapshotEngine: DigitalOcean complete migration (#586)
* add value to skip snap web generation

* add configurable value for s3 bucket

* web build condition on domain name instead

* add secret and configurable s3 bucket override

* switch name and mountpath to match format

* update secret name and use in zip and upload job

* use export instead of temp var

* secret name change

* expect correct names on secret volume mount

* correct path to secret mount

* rework credential override to provide logs and error messages

* use double quotes for early expansion

* remove variable checking since we are feeding in files

* bug: container is gone so we can't delete a volume

* show commands for debug

* wrong default s3 bucket var

* turn off tar output for debug

* undo command verbosity

* Verbose variables

* Enable interactive for alias to work

* More useful alias message and rm debug messages

* Need space after !

* expand aliases instead of interactive

* add public-read and move index.html

* Website redirects stay in AWS

* Set alias only for filesystem artifact upload

* rolling redirects working

* fix volume indexing

* helpful messages

* Useful comments for new indexing format

* Omit alias functionality in favor of variable parameters

* Fix rolling tarball filename

* configmap needs fqdn

* CDN isn't working so we're using bucket URL

* unsilence lz4 logs

* wrong aws bucket name

* get all snapshot metadata from do spaces

* upload metadata to alt s3 bucket

* fix metadata related to website build

* initial commit demo functionality

* put redirects back

* remove merged files

* update zip and upload commands for dual creds

* sleep for debug

* allow override of storage class for scratch volumes

* use storage class as set

* Container-running OS will not resolve localhost

* Remove infinite sleep from debugging

* Empty-Commit to trigger CI test

* bucket name change to do space

* rm fqdn from cm

* increase warmer timeout

* increase timeout after artifact job create

* DO rate limits snapshots per 10m

* sleep between creation for rate limiting

* need different command for site upload

* block snapshot until node ready

* pause scheduler if node not ready

* add sleep for cpu usage reduction

* fix busy waits and document why

* fix busy wait on job and improve comments
orcutt989 authored Aug 15, 2023
1 parent f5f784c commit 7e31daf
Showing 9 changed files with 140 additions and 95 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -15,5 +15,6 @@ build

# Ignore mkchain generated files
*_values.yaml
*-values.yaml

charts/tezos/charts
71 changes: 43 additions & 28 deletions charts/snapshotEngine/scripts/snapshot-warmer.sh
@@ -27,6 +27,7 @@ delete_old_volumesnapshots() {
local max_snapshots="${2##max_snapshots=}"

while [ "$(getNumberOfSnapshots readyToUse=true --selector="$selector")" -gt "$max_snapshots" ]; do
sleep 5
NUMBER_OF_SNAPSHOTS=$(getNumberOfSnapshots readyToUse=true --selector="$selector")
printf "%s Number of snapshots with selector '$selector' is too high at $NUMBER_OF_SNAPSHOTS. Deleting 1.\n" "$(timestamp)"
SNAPSHOTS=$(getSnapshotNames readyToUse=true --selector="$selector")
@@ -37,31 +38,31 @@ delete_old_volumesnapshots() {
done
}

delete_stuck_volumesnapshots() {
snapshot_list=$(kubectl get volumesnapshots -o jsonpath="{.items[*].metadata.name}")
arr=(`echo ${snapshot_list}`);
for snapshot_name in "${arr[@]}"; do
snapshot_creation_time_iso8601=$(kubectl get volumesnapshots $snapshot_name -o jsonpath='{.metadata.creationTimestamp}')
snapshot_creation_time_without_offset=${snapshot_creation_time_iso8601::-1}
snapshot_creation_time_unix=$(date -ud "$(echo $snapshot_creation_time_without_offset | sed 's/T/ /')" +%s)
current_date_unix=$(date -u +%s)
snapshot_age_minutes=$(( (current_date_unix - snapshot_creation_time_unix) / 60 ))
# Snapshots should never be older than 6 minutes
# If they are then there's a problem on AWS' end and the snapshot needs to be deleted.
if [ $snapshot_age_minutes -ge 6 ]; then
printf "%s Snasphot %s is %s minutes old. It must be stuck. Attempting to delete...\n" "$(timestamp)" "$snapshot_name" "$snapshot_age_minutes"
err=$(kubectl delete volumesnapshots $snapshot_name 2>&1 > /dev/null)
if [ $? -ne 0 ]; then
printf "%s ERROR##### Unable to delete stuck snapshot %s .\n" "$(timestamp)" "$snapshot_name"
printf "%s Error was: \"%s\"\n" "$(timestamp)" "$err"
sleep 10
exit 1
else
printf "%s Successfully deleted stuck snapshot %s! \n" "$(timestamp)" "$snapshot_name"
fi
fi
done
}
# delete_stuck_volumesnapshots() {
# snapshot_list=$(kubectl get volumesnapshots -o jsonpath="{.items[*].metadata.name}")
# arr=(`echo ${snapshot_list}`);
# for snapshot_name in "${arr[@]}"; do
# snapshot_creation_time_iso8601=$(kubectl get volumesnapshots $snapshot_name -o jsonpath='{.metadata.creationTimestamp}')
# snapshot_creation_time_without_offset=${snapshot_creation_time_iso8601::-1}
# snapshot_creation_time_unix=$(date -ud "$(echo $snapshot_creation_time_without_offset | sed 's/T/ /')" +%s)
# current_date_unix=$(date -u +%s)
# snapshot_age_minutes=$(( (current_date_unix - snapshot_creation_time_unix) / 60 ))
# # Snapshots should never be older than 6 minutes
# # If they are then there's a problem on AWS' end and the snapshot needs to be deleted.
# if [ $snapshot_age_minutes -ge 6 ]; then
# printf "%s Snasphot %s is %s minutes old. It must be stuck. Attempting to delete...\n" "$(timestamp)" "$snapshot_name" "$snapshot_age_minutes"
# err=$(kubectl delete volumesnapshots $snapshot_name 2>&1 > /dev/null)
# if [ $? -ne 0 ]; then
# printf "%s ERROR##### Unable to delete stuck snapshot %s .\n" "$(timestamp)" "$snapshot_name"
# printf "%s Error was: \"%s\"\n" "$(timestamp)" "$err"
# sleep 10
# exit 1
# else
# printf "%s Successfully deleted stuck snapshot %s! \n" "$(timestamp)" "$snapshot_name"
# fi
# fi
# done
# }

HISTORY_MODE="$(echo "$NODE_CONFIG" | jq -r ".history_mode")"
TARGET_VOLUME="$(echo "$NODE_CONFIG" | jq ".target_volume")"
@@ -83,12 +84,23 @@ yq e -i '.spec.volumeSnapshotClassName=strenv(VOLUME_SNAPSHOT_CLASS)' createVolu

while true; do

# Pause if nodes are not ready
until [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; do
printf "%s Tezos node is not ready for snapshot. Check node pod logs. \n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
until [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; do
sleep 1m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for node
if [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; then
break
fi
done
done

# Remove unlabeled snapshots
delete_old_volumesnapshots selector='!history_mode' max_snapshots=0
# Maintain 4 snapshots of a certain history mode
delete_old_volumesnapshots selector="history_mode=$HISTORY_MODE" max_snapshots=4
# Check for and delete old stuck snapshots
delete_stuck_volumesnapshots
# delete_stuck_volumesnapshots

if ! [ "$(getSnapshotNames readyToUse=false -l history_mode="${HISTORY_MODE}")" ]; then
# EBS Snapshot name based on current time and date
@@ -113,7 +125,7 @@ while true; do
while [ "$(getSnapshotNames readyToUse=false -l history_mode="${HISTORY_MODE}")" ]; do
printf "%s Snapshot is still creating...\n" "$(timestamp)"
sleep 10
delete_stuck_volumesnapshots
# delete_stuck_volumesnapshots
done
end_time=$(date +%s)
elapsed=$((end_time - start_time))
@@ -122,6 +134,9 @@ while true; do
else
printf "%s Snapshot already in progress...\n" "$(timestamp)"
sleep 10
delete_stuck_volumesnapshots
# delete_stuck_volumesnapshots
fi

printf "%s Sleeping for 10m due to Digital Ocean rate limit.\n" "$(timestamp)"
sleep 10m
done
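
Deduplicated from the interleaved old/new lines above, the readiness gate the updated warmer relies on is essentially the following sketch (assuming the same `NAMESPACE`, `HISTORY_MODE`, and `timestamp` helpers already defined in the script):

```bash
# Wait until the octez-node pod for the target history mode reports Ready.
# The sleep keeps this a cheap poll rather than a busy wait.
until [ "$(kubectl get pods -n "${NAMESPACE}" \
      -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' \
      -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; do
  printf "%s Tezos node is not ready for snapshot. Check node pod logs.\n" "$(timestamp)"
  sleep 1m
done
```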
2 changes: 1 addition & 1 deletion charts/snapshotEngine/templates/configmap.yaml
@@ -15,7 +15,7 @@ data:
SCHEMA_URL: {{ $.Values.schemaUrl }}
S3_BUCKET: {{ $.Values.s3BucketOverride }}
CLOUD_PROVIDER: {{ $.Values.cloudProvider }}
FQDN: {{ $.Values.fqdn }}
STORAGE_CLASS: {{$.Values.volumeSnapClass }}
kind: ConfigMap
metadata:
name: snapshot-configmap
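
For reference, the values this template consumes could be supplied at install time roughly as below. Only the value keys (`cloudProvider`, `s3BucketOverride`, `volumeSnapClass`, `schemaUrl`) come from the template; the release name, namespace, and bucket/URL values are placeholders, not taken from the diff.

```bash
# Hypothetical install command assuming the chart is installed directly
# from charts/snapshotEngine; all values shown are placeholders.
helm upgrade --install snapshot-engine ./charts/snapshotEngine \
  --namespace my-namespace \
  --set cloudProvider=digitalocean \
  --set s3BucketOverride=my-do-space \
  --set volumeSnapClass=do-block-storage \
  --set schemaUrl=https://example.com/tezos-snapshot-schema.json
```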
33 changes: 20 additions & 13 deletions snapshotEngine/mainJob.yaml
@@ -53,17 +53,18 @@ spec:
# These loops wait on the RPC to come online and prevent log from printing same line
# over and over and over again. This prints one line and waits for the RPC to come online for a clean log.
until wget -qO- http://localhost:8732/chains/main/blocks/head/header >/dev/null 2>&1; do
until wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header >/dev/null 2>&1; do
printf "%s Waiting for node RPC to come online.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
until wget -qO- http://localhost:8732/chains/main/blocks/head/header >/dev/null 2>&1; do
if wget -qO- http://localhost:8732/chains/main/blocks/head/header >/dev/null 2>&1; then
until wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header >/dev/null 2>&1; do
sleep 1m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for rpc
if wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header >/dev/null 2>&1; then
break
fi
done
done
# If somehow we skip the above waiting loop, this kills the job if the RPC is not online.
if ! wget -qO- http://localhost:8732/chains/main/blocks/head/header >/dev/null 2>&1; then
if ! wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header >/dev/null 2>&1; then
printf "%s RPC is not online! Exiting...\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
exit 1
@@ -76,15 +77,15 @@ spec:
# Tezos devs have advised us that it is safer to target HEAD~2 for rolling artifacts.
else
HEAD_BLOCK=$(wget -qO- http://localhost:8732/chains/main/blocks/head/header | sed -E 's/.*"hash":"?([^,"]*)"?.*/\1/')
HEAD_BLOCK=$(wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header | sed -E 's/.*"hash":"?([^,"]*)"?.*/\1/')
TARGET="${HEAD_BLOCK}~2"
fi
# Get BLOCK_HASH from RPC
wget -qO- http://localhost:8732/chains/main/blocks/"${TARGET}"/header | sed -E 's/.*"hash":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HASH
wget -qO- http://127.0.0.1:8732/chains/main/blocks/"${TARGET}"/header | sed -E 's/.*"hash":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HASH
# Get BLOCK_HEIGHT from RPC
wget -qO- http://localhost:8732/chains/main/blocks/"${TARGET}"/header | sed -E 's/.*"level":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HEIGHT
wget -qO- http://127.0.0.1:8732/chains/main/blocks/"${TARGET}"/header | sed -E 's/.*"level":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HEIGHT
# We need to check if the block is finalized for archive nodes since we aren't getting
# validation by a Tezos snapshot like our rolling tarball. We are just zipping up the data dir from an archive node.
@@ -117,13 +118,13 @@ spec:
fi
# Get BLOCK_TIMESTAMP from RPC
wget -qO- http://localhost:8732/chains/main/blocks/head/header | sed -E 's/.*"timestamp":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_TIMESTAMP
wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header | sed -E 's/.*"timestamp":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_TIMESTAMP
# Old version string
/usr/local/bin/octez-node --version > /"${HISTORY_MODE}"-snapshot-cache-volume/TEZOS_VERSION
# Get new version object from RPC
wget -qO- http://localhost:8732/version > /"${HISTORY_MODE}"-snapshot-cache-volume/TEZOS_RPC_VERSION_INFO
wget -qO- http://127.0.0.1:8732/version > /"${HISTORY_MODE}"-snapshot-cache-volume/TEZOS_RPC_VERSION_INFO
# Print variables for debug
printf "%s BLOCK_HASH is...$(cat /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HASH))\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
@@ -225,8 +226,10 @@ spec:
name: snapshot-cache-volume
- mountPath: /rolling-tarball-restore
name: rolling-tarball-restore
- mountPath: /cloud-provider
name: cloud-provider
- mountPath: /aws-secrets
name: aws-secrets
- mountPath: /do-secrets
name: do-secrets
env:
- name: HISTORY_MODE
value: ""
@@ -244,8 +247,12 @@ spec:
- name: rolling-tarball-restore
persistentVolumeClaim:
claimName: rolling-tarball-restore
- name: cloud-provider
- name: aws-secrets
secret:
secretName: cloud-provider
secretName: aws-secrets
optional: true
- name: do-secrets
secret:
secretName: do-secrets
optional: true
backoffLimit: 0
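
The job now mounts two optional secrets, `aws-secrets` and `do-secrets`, instead of a single `cloud-provider` secret. A sketch of how they might be created follows; the key/file names inside each secret are hypothetical, since the zip-and-upload script that reads them is not shown in this diff.

```bash
# Hypothetical key names; only the secret names (aws-secrets, do-secrets)
# come from mainJob.yaml above.
kubectl create secret generic aws-secrets --namespace "${NAMESPACE}" \
  --from-file=aws_access_key_id=./aws_access_key_id \
  --from-file=aws_secret_access_key=./aws_secret_access_key

kubectl create secret generic do-secrets --namespace "${NAMESPACE}" \
  --from-file=do_spaces_key=./do_spaces_key \
  --from-file=do_spaces_secret=./do_spaces_secret

# Both volumes are declared optional: true, so the job can still start
# when only one provider's credentials are present.
```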
2 changes: 1 addition & 1 deletion snapshotEngine/scratchVolume.yaml
@@ -4,7 +4,7 @@ metadata:
name: snapshot-cache-volume
namespace: ""
spec:
storageClassName: ebs-sc
storageClassName: do-block-storage
accessModes:
- ReadWriteOnce
resources:
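
The hard-coded `ebs-sc` storage class becomes `do-block-storage` by default, and snapshot-maker.sh (next file in this diff) additionally patches it at runtime from the `STORAGE_CLASS` configmap value, roughly:

```bash
# STORAGE_CLASS is injected via the snapshot-configmap (see configmap.yaml above);
# the do-block-storage fallback here is only illustrative.
STORAGE_CLASS="${STORAGE_CLASS:-do-block-storage}" \
  yq e -i '.spec.storageClassName=strenv(STORAGE_CLASS)' scratchVolume.yaml
kubectl apply -f scratchVolume.yaml
```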
65 changes: 38 additions & 27 deletions snapshotEngine/snapshot-maker.sh
@@ -4,12 +4,6 @@ cd /

ZIP_AND_UPLOAD_JOB_NAME=zip-and-upload-"${HISTORY_MODE}"

# Pause if nodes are not ready
while [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "False" ]; do
printf "%s Tezos node is not ready for snapshot. Check node pod logs. \n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
sleep 30
done

# Delete zip-and-upload job
if kubectl get job "${ZIP_AND_UPLOAD_JOB_NAME}"; then
printf "%s Old zip-and-upload job exits. Attempting to delete.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
@@ -26,27 +20,30 @@ fi
if [ "${HISTORY_MODE}" = rolling ]; then
if [ "$(kubectl get pvc rolling-tarball-restore)" ]; then
printf "%s PVC Exists.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
sleep 5
kubectl delete pvc rolling-tarball-restore
sleep 5
fi
fi

if [ "$(kubectl get pvc "${HISTORY_MODE}"-snapshot-cache-volume)" ]; then
printf "%s PVC Exists.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
sleep 5
kubectl delete pvc "${HISTORY_MODE}"-snapshot-cache-volume
sleep 5
fi

if [ "$(kubectl get pvc "${HISTORY_MODE}"-snap-volume)" ]; then
printf "%s PVC Exists.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
sleep 5
kubectl delete pvc "${HISTORY_MODE}"-snap-volume
sleep 5
fi

while [ "$(kubectl get volumesnapshots -o jsonpath='{.items[?(.status.readyToUse==false)].metadata.name}' --namespace "${NAMESPACE}" -l history_mode="${HISTORY_MODE}")" ]; do
printf "%s Snapshot already in progress...\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
sleep 10
done
# while [ "$(kubectl get volumesnapshots -o jsonpath='{.items[?(.status.readyToUse==false)].metadata.name}' --namespace "${NAMESPACE}" -l history_mode="${HISTORY_MODE}")" ]; do
# printf "%s Snapshot already in progress...\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
# sleep 10
# done

printf "%s EBS Snapshot finished!\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"

@@ -60,6 +57,11 @@ printf "%s Creating scratch volume for artifact processing...\n" "$(date "+%Y-%m
# Set namespace for both "${HISTORY_MODE}"-snapshot-cache-volume
NAMESPACE="${NAMESPACE}" yq e -i '.metadata.namespace=strenv(NAMESPACE)' scratchVolume.yaml

# Set storage class for sratch volume yaml
STORAGE_CLASS="${STORAGE_CLASS}" yq e -i '.spec.storageClassName=strenv(STORAGE_CLASS)' scratchVolume.yaml

sleep 5

# Create "${HISTORY_MODE}"-snapshot-cache-volume
printf "%s Creating PVC ${HISTORY_MODE}-snapshot-cache-volume.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
NAME="${HISTORY_MODE}-snapshot-cache-volume" yq e -i '.metadata.name=strenv(NAME)' scratchVolume.yaml
@@ -73,6 +75,7 @@ printf "%s PVC %s created.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" "${HISTORY_MOD


if [ "${HISTORY_MODE}" = rolling ]; then
sleep 5
# Create rolling-tarball-restore
printf "%s Creating PVC rolling-tarball-restore..\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
NAME="rolling-tarball-restore" yq e -i '.metadata.name=strenv(NAME)' scratchVolume.yaml
@@ -87,6 +90,9 @@ fi
## Snapshot volume namespace
NAMESPACE="${NAMESPACE}" yq e -i '.metadata.namespace=strenv(NAMESPACE)' volumeFromSnap.yaml

# Set storageclass for restored volume
STORAGE_CLASS="${STORAGE_CLASS}" yq e -i '.spec.storageClassName=strenv(STORAGE_CLASS)' volumeFromSnap.yaml

## Snapshot volume name
VOLUME_NAME="${HISTORY_MODE}-snap-volume"
VOLUME_NAME="${VOLUME_NAME}" yq e -i '.metadata.name=strenv(VOLUME_NAME)' volumeFromSnap.yaml
@@ -111,6 +117,8 @@ printf "%s We're rounding up and adding 20%% , volume size will be %sGB.\n" "$(d

RESTORE_VOLUME_SIZE="${RESTORE_VOLUME_SIZE}Gi" yq e -i '.spec.resources.requests.storage=strenv(RESTORE_VOLUME_SIZE)' volumeFromSnap.yaml

sleep 5

printf "%s Creating volume from snapshot ${NEWEST_SNAPSHOT}.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
if ! kubectl apply -f volumeFromSnap.yaml
then
Expand Down Expand Up @@ -175,22 +183,22 @@ if [ "${HISTORY_MODE}" = archive ]; then
yq eval -i "del(.spec.template.spec.containers[0].volumeMounts[2])" mainJob.yaml
fi

# Switch alternate cloud provider secret name based on actual cloud provider
if [[ -n "${CLOUD_PROVIDER}" ]]; then
# Need to account for dynamic volumes removed above. For example if not rolling node then rolling volume is deleted.
SECRET_NAME="${NAMESPACE}-secret"
# Index of zip-and-upload container changes depending on if rolling job or archive job
NUM_CONTAINERS=$(yq e '.spec.template.spec.containers | length' mainJob.yaml)
# Index of mounts also changes depending on history mode
NUM_CONTAINER_MOUNTS=$(yq e ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts | length" mainJob.yaml )
# Secret volume mount is last item in list of volumeMounts for the zip and upload container
SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts[$(( NUM_CONTAINER_MOUNTS - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml
# Index of job volumes change depending on history mode
NUM_JOB_VOLUMES=$(yq e '.spec.template.spec.volumes | length' mainJob.yaml )
# Setting job secret volume to value set by workflow
SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml
SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].secret.secretName=strenv(SECRET_NAME)" mainJob.yaml
fi
# # Switch alternate cloud provider secret name based on actual cloud provider
# if [[ -n "${CLOUD_PROVIDER}" ]]; then
# # Need to account for dynamic volumes removed above. For example if not rolling node then rolling volume is deleted.
# SECRET_NAME="${NAMESPACE}-secret"
# # Index of zip-and-upload container changes depending on if rolling job or archive job
# NUM_CONTAINERS=$(yq e '.spec.template.spec.containers | length' mainJob.yaml)
# # Index of mounts also changes depending on history mode
# NUM_CONTAINER_MOUNTS=$(yq e ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts | length" mainJob.yaml )
# # Secret volume mount is last item in list of volumeMounts for the zip and upload container
# SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts[$(( NUM_CONTAINER_MOUNTS - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml
# # Index of job volumes change depending on history mode
# NUM_JOB_VOLUMES=$(yq e '.spec.template.spec.volumes | length' mainJob.yaml )
# # Setting job secret volume to value set by workflow
# SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml
# SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].secret.secretName=strenv(SECRET_NAME)" mainJob.yaml
# fi

# Service account to be used by entire zip-and-upload job.
SERVICE_ACCOUNT="${SERVICE_ACCOUNT}" yq e -i '.spec.template.spec.serviceAccountName=strenv(SERVICE_ACCOUNT)' mainJob.yaml
@@ -204,12 +212,13 @@ then
exit 1
fi

sleep 5
sleep 20

# Wait for snapshotting job to complete
while [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}')" != "True" ]; do
printf "%s Waiting for zip-and-upload job to complete.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
while [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}')" != "True" ]; do
sleep 1m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for job
if [ "$(kubectl get pod -l job-name=zip-and-upload-"${HISTORY_MODE}" --namespace="${NAMESPACE}"| grep -i -e error -e evicted -e pending)" ] || \
[ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace="${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Failed")].type}')" ] ; then
printf "%s Zip-and-upload job failed. This job will end and a new snapshot will be taken.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
@@ -226,5 +235,7 @@ if ! [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMES
fi

printf "%s Deleting temporary snapshot volume.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
sleep 5
kubectl delete -f volumeFromSnap.yaml | while IFS= read -r line; do printf '%s %s\n' "$(date "+%Y-%m-%d %H:%M:%S" "$@")" "$line"; done
sleep 5
kubectl delete job snapshot-maker --namespace "${NAMESPACE}"
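
Deduplicated from the diff above, the job-completion wait that replaces the earlier busy loop takes roughly this shape (same variables as the script; a sketch, not the verbatim file):

```bash
# Poll the zip-and-upload job until it completes; bail out early on failure so a
# fresh snapshot attempt can start. The sleep keeps this from busy-waiting.
while [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMESPACE}" \
      -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}')" != "True" ]; do
  sleep 1m
  if [ "$(kubectl get pod -l job-name=zip-and-upload-"${HISTORY_MODE}" --namespace="${NAMESPACE}" \
        | grep -i -e error -e evicted -e pending)" ] || \
     [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace="${NAMESPACE}" \
        -o jsonpath='{.status.conditions[?(@.type=="Failed")].type}')" ]; then
    printf "%s Zip-and-upload job failed; a new snapshot will be taken.\n" "$(date "+%Y-%m-%d %H:%M:%S")"
    break
  fi
done
```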