snapshotEngine: DigitalOcean complete migration (#586)
* add value to skip snap web generation

* add configurable value for s3 bucket

* web build condition on domain name instead

* add secret and configurable s3 bucket override

* switch name and mountpath to match format

* update secret name and use in zip and upload job

* use export instead of temp var

* secret name change

* expect correct names on secret volume mount

* correct path to secret mount

* rework credential override to provide logs and error messages

* use double quotes for early expansion

* remove variable checking since we are feeding in files

* bug: container is gone so we can't delete a volume

* show commands for debug

* wrong default s3 bucket var

* turn off tar output for debug

* undo command verbosity

* Verbose variables

* Enable interactive for alias to work

* More useful alias message and rm debug messages

* Need space after !

* expand aliases instead of interactive

* add public-read and move index.html

* Website redirects stay in AWS

* Set alias only for filesystem artifact upload

* rolling redirects working

* fix volume indexing

* helpful messages

* Useful comments for new indexing format

* Omit alias functionality in favor of variable parameters

* Fix rolling tarball filename

* configmap needs fqdn

* CDN isn't working so we're using bucket URL

* unsilence lz4 logs

* wrong aws bucket name

* get all snapshot metadata from do spaces

* upload metadata to alt s3 bucket

* fix metadata related to website build

* initial commit demo functionality

* put redirects back

* remove merged files

* update zip and upload commands for dual creds

* sleep for debug

* allow override of storage class for scratch volumes

* use storage class as set

* Container-running OS will not resolve localhost

* Remove infinite sleep from debugging

* Empty-Commit to trigger CI test

* bucket name change to do space

* rm fqdn from cm

* increase warmer timeout

* increase timeout after artifact job create

* DO rate limits snapshots per 10m

* sleep between creation for rate limiting

* need different command for site upload

* block snapshot until node ready

* pause scheduler if node not ready

* add sleep for cpu usage reduction

* fix busy waits and document why

* fix busy wait on job and improve comments
orcutt989 authored Aug 15, 2023
1 parent f5f784c commit 7e31daf
Showing 9 changed files with 140 additions and 95 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -15,5 +15,6 @@ build

# Ignore mkchain generated files
*_values.yaml
*-values.yaml

charts/tezos/charts
71 changes: 43 additions & 28 deletions charts/snapshotEngine/scripts/snapshot-warmer.sh
@@ -27,6 +27,7 @@ delete_old_volumesnapshots() {
local max_snapshots="${2##max_snapshots=}"

while [ "$(getNumberOfSnapshots readyToUse=true --selector="$selector")" -gt "$max_snapshots" ]; do
sleep 5
NUMBER_OF_SNAPSHOTS=$(getNumberOfSnapshots readyToUse=true --selector="$selector")
printf "%s Number of snapshots with selector '$selector' is too high at $NUMBER_OF_SNAPSHOTS. Deleting 1.\n" "$(timestamp)"
SNAPSHOTS=$(getSnapshotNames readyToUse=true --selector="$selector")
@@ -37,31 +38,31 @@ delete_old_volumesnapshots() {
done
}

delete_stuck_volumesnapshots() {
snapshot_list=$(kubectl get volumesnapshots -o jsonpath="{.items[*].metadata.name}")
arr=(`echo ${snapshot_list}`);
for snapshot_name in "${arr[@]}"; do
snapshot_creation_time_iso8601=$(kubectl get volumesnapshots $snapshot_name -o jsonpath='{.metadata.creationTimestamp}')
snapshot_creation_time_without_offset=${snapshot_creation_time_iso8601::-1}
snapshot_creation_time_unix=$(date -ud "$(echo $snapshot_creation_time_without_offset | sed 's/T/ /')" +%s)
current_date_unix=$(date -u +%s)
snapshot_age_minutes=$(( (current_date_unix - snapshot_creation_time_unix) / 60 ))
# Snapshots should never be older than 6 minutes
# If they are then there's a problem on AWS' end and the snapshot needs to be deleted.
if [ $snapshot_age_minutes -ge 6 ]; then
printf "%s Snasphot %s is %s minutes old. It must be stuck. Attempting to delete...\n" "$(timestamp)" "$snapshot_name" "$snapshot_age_minutes"
err=$(kubectl delete volumesnapshots $snapshot_name 2>&1 > /dev/null)
if [ $? -ne 0 ]; then
printf "%s ERROR##### Unable to delete stuck snapshot %s .\n" "$(timestamp)" "$snapshot_name"
printf "%s Error was: \"%s\"\n" "$(timestamp)" "$err"
sleep 10
exit 1
else
printf "%s Successfully deleted stuck snapshot %s! \n" "$(timestamp)" "$snapshot_name"
fi
fi
done
}
# delete_stuck_volumesnapshots() {
# snapshot_list=$(kubectl get volumesnapshots -o jsonpath="{.items[*].metadata.name}")
# arr=(`echo ${snapshot_list}`);
# for snapshot_name in "${arr[@]}"; do
# snapshot_creation_time_iso8601=$(kubectl get volumesnapshots $snapshot_name -o jsonpath='{.metadata.creationTimestamp}')
# snapshot_creation_time_without_offset=${snapshot_creation_time_iso8601::-1}
# snapshot_creation_time_unix=$(date -ud "$(echo $snapshot_creation_time_without_offset | sed 's/T/ /')" +%s)
# current_date_unix=$(date -u +%s)
# snapshot_age_minutes=$(( (current_date_unix - snapshot_creation_time_unix) / 60 ))
# # Snapshots should never be older than 6 minutes
# # If they are then there's a problem on AWS' end and the snapshot needs to be deleted.
# if [ $snapshot_age_minutes -ge 6 ]; then
# printf "%s Snasphot %s is %s minutes old. It must be stuck. Attempting to delete...\n" "$(timestamp)" "$snapshot_name" "$snapshot_age_minutes"
# err=$(kubectl delete volumesnapshots $snapshot_name 2>&1 > /dev/null)
# if [ $? -ne 0 ]; then
# printf "%s ERROR##### Unable to delete stuck snapshot %s .\n" "$(timestamp)" "$snapshot_name"
# printf "%s Error was: \"%s\"\n" "$(timestamp)" "$err"
# sleep 10
# exit 1
# else
# printf "%s Successfully deleted stuck snapshot %s! \n" "$(timestamp)" "$snapshot_name"
# fi
# fi
# done
# }

HISTORY_MODE="$(echo "$NODE_CONFIG" | jq -r ".history_mode")"
TARGET_VOLUME="$(echo "$NODE_CONFIG" | jq ".target_volume")"
@@ -83,12 +84,23 @@ yq e -i '.spec.volumeSnapshotClassName=strenv(VOLUME_SNAPSHOT_CLASS)' createVolu

while true; do

# Pause if nodes are not ready
until [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; do
printf "%s Tezos node is not ready for snapshot. Check node pod logs. \n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
until [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; do
sleep 1m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for node
if [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; then
break
fi
done
done

# Remove unlabeled snapshots
delete_old_volumesnapshots selector='!history_mode' max_snapshots=0
# Maintain 4 snapshots of a certain history mode
delete_old_volumesnapshots selector="history_mode=$HISTORY_MODE" max_snapshots=4
# Check for and delete old stuck snapshots
delete_stuck_volumesnapshots
# delete_stuck_volumesnapshots

if ! [ "$(getSnapshotNames readyToUse=false -l history_mode="${HISTORY_MODE}")" ]; then
# EBS Snapshot name based on current time and date
@@ -113,7 +125,7 @@ while true; do
while [ "$(getSnapshotNames readyToUse=false -l history_mode="${HISTORY_MODE}")" ]; do
printf "%s Snapshot is still creating...\n" "$(timestamp)"
sleep 10
delete_stuck_volumesnapshots
# delete_stuck_volumesnapshots
done
end_time=$(date +%s)
elapsed=$((end_time - start_time))
@@ -122,6 +134,9 @@ while true; do
else
printf "%s Snapshot already in progress...\n" "$(timestamp)"
sleep 10
delete_stuck_volumesnapshots
# delete_stuck_volumesnapshots
fi

printf "%s Sleeping for 10m due to Digital Ocean rate limit.\n" "$(timestamp)"
sleep 10m
done
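
Deduplicated from the interleaved old/new lines above, the readiness gate the updated warmer relies on is essentially the following sketch (assuming the same `NAMESPACE`, `HISTORY_MODE`, and `timestamp` helpers already defined in the script):

```bash
# Wait until the octez-node pod for the target history mode reports Ready.
# The sleep keeps this a cheap poll rather than a busy wait.
until [ "$(kubectl get pods -n "${NAMESPACE}" \
      -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' \
      -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "True" ]; do
  printf "%s Tezos node is not ready for snapshot. Check node pod logs.\n" "$(timestamp)"
  sleep 1m
done
```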
2 changes: 1 addition & 1 deletion charts/snapshotEngine/templates/configmap.yaml
@@ -15,7 +15,7 @@ data:
SCHEMA_URL: {{ $.Values.schemaUrl }}
S3_BUCKET: {{ $.Values.s3BucketOverride }}
CLOUD_PROVIDER: {{ $.Values.cloudProvider }}
FQDN: {{ $.Values.fqdn }}
STORAGE_CLASS: {{$.Values.volumeSnapClass }}
kind: ConfigMap
metadata:
name: snapshot-configmap
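
For reference, the values this template consumes could be supplied at install time roughly as below. Only the value keys (`cloudProvider`, `s3BucketOverride`, `volumeSnapClass`, `schemaUrl`) come from the template; the release name, namespace, and bucket/URL values are placeholders, not taken from the diff.

```bash
# Hypothetical install command assuming the chart is installed directly
# from charts/snapshotEngine; all values shown are placeholders.
helm upgrade --install snapshot-engine ./charts/snapshotEngine \
  --namespace my-namespace \
  --set cloudProvider=digitalocean \
  --set s3BucketOverride=my-do-space \
  --set volumeSnapClass=do-block-storage \
  --set schemaUrl=https://example.com/tezos-snapshot-schema.json
```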
33 changes: 20 additions & 13 deletions snapshotEngine/mainJob.yaml
@@ -53,17 +53,18 @@ spec:
# These loops wait on the RPC to come online and prevent log from printing same line
# over and over and over again. This prints one line and waits for the RPC to come online for a clean log.
until wget -qO- http://localhost:8732/chains/main/blocks/head/header >/dev/null 2>&1; do
until wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header >/dev/null 2>&1; do
printf "%s Waiting for node RPC to come online.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
until wget -qO- http://localhost:8732/chains/main/blocks/head/header >/dev/null 2>&1; do
if wget -qO- http://localhost:8732/chains/main/blocks/head/header >/dev/null 2>&1; then
until wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header >/dev/null 2>&1; do
sleep 1m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for rpc
if wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header >/dev/null 2>&1; then
break
fi
done
done
# If somehow we skip the above waiting loop, this kills the job if the RPC is not online.
if ! wget -qO- http://localhost:8732/chains/main/blocks/head/header >/dev/null 2>&1; then
if ! wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header >/dev/null 2>&1; then
printf "%s RPC is not online! Exiting...\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
exit 1
@@ -76,15 +77,15 @@ spec:
# Tezos devs have advised us that it is safer to target HEAD~2 for rolling artifacts.
else
HEAD_BLOCK=$(wget -qO- http://localhost:8732/chains/main/blocks/head/header | sed -E 's/.*"hash":"?([^,"]*)"?.*/\1/')
HEAD_BLOCK=$(wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header | sed -E 's/.*"hash":"?([^,"]*)"?.*/\1/')
TARGET="${HEAD_BLOCK}~2"
fi
# Get BLOCK_HASH from RPC
wget -qO- http://localhost:8732/chains/main/blocks/"${TARGET}"/header | sed -E 's/.*"hash":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HASH
wget -qO- http://127.0.0.1:8732/chains/main/blocks/"${TARGET}"/header | sed -E 's/.*"hash":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HASH
# Get BLOCK_HEIGHT from RPC
wget -qO- http://localhost:8732/chains/main/blocks/"${TARGET}"/header | sed -E 's/.*"level":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HEIGHT
wget -qO- http://127.0.0.1:8732/chains/main/blocks/"${TARGET}"/header | sed -E 's/.*"level":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HEIGHT
# We need to check if the block is finalized for archive nodes since we aren't getting
# validation by a Tezos snapshot like our rolling tarball. We are just zipping up the data dir from an archive node.
@@ -117,13 +118,13 @@ spec:
fi
# Get BLOCK_TIMESTAMP from RPC
wget -qO- http://localhost:8732/chains/main/blocks/head/header | sed -E 's/.*"timestamp":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_TIMESTAMP
wget -qO- http://127.0.0.1:8732/chains/main/blocks/head/header | sed -E 's/.*"timestamp":"?([^,"]*)"?.*/\1/' > /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_TIMESTAMP
# Old version string
/usr/local/bin/octez-node --version > /"${HISTORY_MODE}"-snapshot-cache-volume/TEZOS_VERSION
# Get new version object from RPC
wget -qO- http://localhost:8732/version > /"${HISTORY_MODE}"-snapshot-cache-volume/TEZOS_RPC_VERSION_INFO
wget -qO- http://127.0.0.1:8732/version > /"${HISTORY_MODE}"-snapshot-cache-volume/TEZOS_RPC_VERSION_INFO
# Print variables for debug
printf "%s BLOCK_HASH is...$(cat /"${HISTORY_MODE}"-snapshot-cache-volume/BLOCK_HASH))\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
@@ -225,8 +226,10 @@ spec:
name: snapshot-cache-volume
- mountPath: /rolling-tarball-restore
name: rolling-tarball-restore
- mountPath: /cloud-provider
name: cloud-provider
- mountPath: /aws-secrets
name: aws-secrets
- mountPath: /do-secrets
name: do-secrets
env:
- name: HISTORY_MODE
value: ""
@@ -244,8 +247,12 @@ spec:
- name: rolling-tarball-restore
persistentVolumeClaim:
claimName: rolling-tarball-restore
- name: cloud-provider
- name: aws-secrets
secret:
secretName: cloud-provider
secretName: aws-secrets
optional: true
- name: do-secrets
secret:
secretName: do-secrets
optional: true
backoffLimit: 0
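
The job now mounts two optional secrets, `aws-secrets` and `do-secrets`, instead of a single `cloud-provider` secret. A sketch of how they might be created follows; the key/file names inside each secret are hypothetical, since the zip-and-upload script that reads them is not shown in this diff.

```bash
# Hypothetical key names; only the secret names (aws-secrets, do-secrets)
# come from mainJob.yaml above.
kubectl create secret generic aws-secrets --namespace "${NAMESPACE}" \
  --from-file=aws_access_key_id=./aws_access_key_id \
  --from-file=aws_secret_access_key=./aws_secret_access_key

kubectl create secret generic do-secrets --namespace "${NAMESPACE}" \
  --from-file=do_spaces_key=./do_spaces_key \
  --from-file=do_spaces_secret=./do_spaces_secret

# Both volumes are declared optional: true, so the job can still start
# when only one provider's credentials are present.
```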
2 changes: 1 addition & 1 deletion snapshotEngine/scratchVolume.yaml
@@ -4,7 +4,7 @@ metadata:
name: snapshot-cache-volume
namespace: ""
spec:
storageClassName: ebs-sc
storageClassName: do-block-storage
accessModes:
- ReadWriteOnce
resources:
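
The hard-coded `ebs-sc` storage class becomes `do-block-storage` by default, and snapshot-maker.sh (next file in this diff) additionally patches it at runtime from the `STORAGE_CLASS` configmap value, roughly:

```bash
# STORAGE_CLASS is injected via the snapshot-configmap (see configmap.yaml above);
# the do-block-storage fallback here is only illustrative.
STORAGE_CLASS="${STORAGE_CLASS:-do-block-storage}" \
  yq e -i '.spec.storageClassName=strenv(STORAGE_CLASS)' scratchVolume.yaml
kubectl apply -f scratchVolume.yaml
```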
65 changes: 38 additions & 27 deletions snapshotEngine/snapshot-maker.sh
@@ -4,12 +4,6 @@ cd /

ZIP_AND_UPLOAD_JOB_NAME=zip-and-upload-"${HISTORY_MODE}"

# Pause if nodes are not ready
while [ "$(kubectl get pods -n "${NAMESPACE}" -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' -l appType=octez-node -l node_class_history_mode="${HISTORY_MODE}")" = "False" ]; do
printf "%s Tezos node is not ready for snapshot. Check node pod logs. \n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
sleep 30
done

# Delete zip-and-upload job
if kubectl get job "${ZIP_AND_UPLOAD_JOB_NAME}"; then
printf "%s Old zip-and-upload job exits. Attempting to delete.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
@@ -26,27 +20,30 @@ fi
if [ "${HISTORY_MODE}" = rolling ]; then
if [ "$(kubectl get pvc rolling-tarball-restore)" ]; then
printf "%s PVC Exists.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
sleep 5
kubectl delete pvc rolling-tarball-restore
sleep 5
fi
fi

if [ "$(kubectl get pvc "${HISTORY_MODE}"-snapshot-cache-volume)" ]; then
printf "%s PVC Exists.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
sleep 5
kubectl delete pvc "${HISTORY_MODE}"-snapshot-cache-volume
sleep 5
fi

if [ "$(kubectl get pvc "${HISTORY_MODE}"-snap-volume)" ]; then
printf "%s PVC Exists.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
sleep 5
kubectl delete pvc "${HISTORY_MODE}"-snap-volume
sleep 5
fi

while [ "$(kubectl get volumesnapshots -o jsonpath='{.items[?(.status.readyToUse==false)].metadata.name}' --namespace "${NAMESPACE}" -l history_mode="${HISTORY_MODE}")" ]; do
printf "%s Snapshot already in progress...\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
sleep 10
done
# while [ "$(kubectl get volumesnapshots -o jsonpath='{.items[?(.status.readyToUse==false)].metadata.name}' --namespace "${NAMESPACE}" -l history_mode="${HISTORY_MODE}")" ]; do
# printf "%s Snapshot already in progress...\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
# sleep 10
# done

printf "%s EBS Snapshot finished!\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"

@@ -60,6 +57,11 @@ printf "%s Creating scratch volume for artifact processing...\n" "$(date "+%Y-%m
# Set namespace for both "${HISTORY_MODE}"-snapshot-cache-volume
NAMESPACE="${NAMESPACE}" yq e -i '.metadata.namespace=strenv(NAMESPACE)' scratchVolume.yaml

# Set storage class for sratch volume yaml
STORAGE_CLASS="${STORAGE_CLASS}" yq e -i '.spec.storageClassName=strenv(STORAGE_CLASS)' scratchVolume.yaml

sleep 5

# Create "${HISTORY_MODE}"-snapshot-cache-volume
printf "%s Creating PVC ${HISTORY_MODE}-snapshot-cache-volume.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
NAME="${HISTORY_MODE}-snapshot-cache-volume" yq e -i '.metadata.name=strenv(NAME)' scratchVolume.yaml
@@ -73,6 +75,7 @@ printf "%s PVC %s created.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")" "${HISTORY_MOD


if [ "${HISTORY_MODE}" = rolling ]; then
sleep 5
# Create rolling-tarball-restore
printf "%s Creating PVC rolling-tarball-restore..\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
NAME="rolling-tarball-restore" yq e -i '.metadata.name=strenv(NAME)' scratchVolume.yaml
@@ -87,6 +90,9 @@ fi
## Snapshot volume namespace
NAMESPACE="${NAMESPACE}" yq e -i '.metadata.namespace=strenv(NAMESPACE)' volumeFromSnap.yaml

# Set storageclass for restored volume
STORAGE_CLASS="${STORAGE_CLASS}" yq e -i '.spec.storageClassName=strenv(STORAGE_CLASS)' volumeFromSnap.yaml

## Snapshot volume name
VOLUME_NAME="${HISTORY_MODE}-snap-volume"
VOLUME_NAME="${VOLUME_NAME}" yq e -i '.metadata.name=strenv(VOLUME_NAME)' volumeFromSnap.yaml
@@ -111,6 +117,8 @@ printf "%s We're rounding up and adding 20%% , volume size will be %sGB.\n" "$(d

RESTORE_VOLUME_SIZE="${RESTORE_VOLUME_SIZE}Gi" yq e -i '.spec.resources.requests.storage=strenv(RESTORE_VOLUME_SIZE)' volumeFromSnap.yaml

sleep 5

printf "%s Creating volume from snapshot ${NEWEST_SNAPSHOT}.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
if ! kubectl apply -f volumeFromSnap.yaml
then
Expand Down Expand Up @@ -175,22 +183,22 @@ if [ "${HISTORY_MODE}" = archive ]; then
yq eval -i "del(.spec.template.spec.containers[0].volumeMounts[2])" mainJob.yaml
fi

# Switch alternate cloud provider secret name based on actual cloud provider
if [[ -n "${CLOUD_PROVIDER}" ]]; then
# Need to account for dynamic volumes removed above. For example if not rolling node then rolling volume is deleted.
SECRET_NAME="${NAMESPACE}-secret"
# Index of zip-and-upload container changes depending on if rolling job or archive job
NUM_CONTAINERS=$(yq e '.spec.template.spec.containers | length' mainJob.yaml)
# Index of mounts also changes depending on history mode
NUM_CONTAINER_MOUNTS=$(yq e ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts | length" mainJob.yaml )
# Secret volume mount is last item in list of volumeMounts for the zip and upload container
SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts[$(( NUM_CONTAINER_MOUNTS - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml
# Index of job volumes change depending on history mode
NUM_JOB_VOLUMES=$(yq e '.spec.template.spec.volumes | length' mainJob.yaml )
# Setting job secret volume to value set by workflow
SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml
SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].secret.secretName=strenv(SECRET_NAME)" mainJob.yaml
fi
# # Switch alternate cloud provider secret name based on actual cloud provider
# if [[ -n "${CLOUD_PROVIDER}" ]]; then
# # Need to account for dynamic volumes removed above. For example if not rolling node then rolling volume is deleted.
# SECRET_NAME="${NAMESPACE}-secret"
# # Index of zip-and-upload container changes depending on if rolling job or archive job
# NUM_CONTAINERS=$(yq e '.spec.template.spec.containers | length' mainJob.yaml)
# # Index of mounts also changes depending on history mode
# NUM_CONTAINER_MOUNTS=$(yq e ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts | length" mainJob.yaml )
# # Secret volume mount is last item in list of volumeMounts for the zip and upload container
# SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts[$(( NUM_CONTAINER_MOUNTS - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml
# # Index of job volumes change depending on history mode
# NUM_JOB_VOLUMES=$(yq e '.spec.template.spec.volumes | length' mainJob.yaml )
# # Setting job secret volume to value set by workflow
# SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml
# SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].secret.secretName=strenv(SECRET_NAME)" mainJob.yaml
# fi

# Service account to be used by entire zip-and-upload job.
SERVICE_ACCOUNT="${SERVICE_ACCOUNT}" yq e -i '.spec.template.spec.serviceAccountName=strenv(SERVICE_ACCOUNT)' mainJob.yaml
@@ -204,12 +212,13 @@ then
exit 1
fi

sleep 5
sleep 20

# Wait for snapshotting job to complete
while [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}')" != "True" ]; do
printf "%s Waiting for zip-and-upload job to complete.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
while [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}')" != "True" ]; do
sleep 1m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for job
if [ "$(kubectl get pod -l job-name=zip-and-upload-"${HISTORY_MODE}" --namespace="${NAMESPACE}"| grep -i -e error -e evicted -e pending)" ] || \
[ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace="${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Failed")].type}')" ] ; then
printf "%s Zip-and-upload job failed. This job will end and a new snapshot will be taken.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
@@ -226,5 +235,7 @@ if ! [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMES
fi

printf "%s Deleting temporary snapshot volume.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
sleep 5
kubectl delete -f volumeFromSnap.yaml | while IFS= read -r line; do printf '%s %s\n' "$(date "+%Y-%m-%d %H:%M:%S" "$@")" "$line"; done
sleep 5
kubectl delete job snapshot-maker --namespace "${NAMESPACE}"
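
Deduplicated from the diff above, the job-completion wait that replaces the earlier busy loop takes roughly this shape (same variables as the script; a sketch, not the verbatim file):

```bash
# Poll the zip-and-upload job until it completes; bail out early on failure so a
# fresh snapshot attempt can start. The sleep keeps this from busy-waiting.
while [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMESPACE}" \
      -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}')" != "True" ]; do
  sleep 1m
  if [ "$(kubectl get pod -l job-name=zip-and-upload-"${HISTORY_MODE}" --namespace="${NAMESPACE}" \
        | grep -i -e error -e evicted -e pending)" ] || \
     [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace="${NAMESPACE}" \
        -o jsonpath='{.status.conditions[?(@.type=="Failed")].type}')" ]; then
    printf "%s Zip-and-upload job failed; a new snapshot will be taken.\n" "$(date "+%Y-%m-%d %H:%M:%S")"
    break
  fi
done
```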