snapshotEngine: retire snapshot warmer #602

Merged
merged 10 commits into from Oct 3, 2023

Changes from all commits
142 changes: 0 additions & 142 deletions charts/snapshotEngine/scripts/snapshot-warmer.sh

This file was deleted.

3 changes: 2 additions & 1 deletion charts/snapshotEngine/templates/configmap.yaml
@@ -15,7 +15,8 @@ data:
SCHEMA_URL: {{ $.Values.schemaUrl }}
S3_BUCKET: {{ $.Values.s3BucketOverride }}
CLOUD_PROVIDER: {{ $.Values.cloudProvider }}
- STORAGE_CLASS: {{$.Values.volumeSnapClass }}
+ STORAGE_CLASS: {{ $.Values.volumeSnapClass }}
+ NODES: {{ $.Values.nodes }}
kind: ConfigMap
metadata:
name: snapshot-configmap
47 changes: 0 additions & 47 deletions charts/snapshotEngine/templates/snapshot-warmer.yaml

This file was deleted.

112 changes: 87 additions & 25 deletions snapshotEngine/snapshot-maker.sh
@@ -1,10 +1,58 @@
#!/bin/bash

# Delete all volumesnapshots so they aren't sitting around accruing charges
kubectl delete vs -l history_mode=$HISTORY_MODE

PERSISTENT_VOLUME_CLAIM="var-volume-snapshot-${HISTORY_MODE}-node-0"
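# e.g. var-volume-snapshot-rolling-node-0 when HISTORY_MODE=rolling (illustrative)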

# For yq to work, the values resulting from the above cmds need to be exported.
# We don't export them inline because of
# https://github.com/koalaman/shellcheck/wiki/SC2155
export HISTORY_MODE
export PERSISTENT_VOLUME_CLAIM

yq e -i '.metadata.namespace=strenv(NAMESPACE)' createVolumeSnapshot.yaml
yq e -i '.metadata.labels.history_mode=strenv(HISTORY_MODE)' createVolumeSnapshot.yaml
yq e -i '.spec.source.persistentVolumeClaimName=strenv(PERSISTENT_VOLUME_CLAIM)' createVolumeSnapshot.yaml
yq e -i '.spec.volumeSnapshotClassName=strenv(STORAGE_CLASS)' createVolumeSnapshot.yaml
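
# After these edits, createVolumeSnapshot.yaml should look roughly like the
# sketch below (the manifest itself is not shown in this diff, so the exact
# layout is an assumption based on the standard VolumeSnapshot API):
#   apiVersion: snapshot.storage.k8s.io/v1
#   kind: VolumeSnapshot
#   metadata:
#     name: <set per snapshot further below>
#     namespace: <NAMESPACE>
#     labels:
#       history_mode: <HISTORY_MODE>
#   spec:
#     volumeSnapshotClassName: <STORAGE_CLASS>
#     source:
#       persistentVolumeClaimName: var-volume-snapshot-<HISTORY_MODE>-node-0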

# Returns the names of volumesnapshots with the given readiness status.
# Usage: getSnapshotNames readyToUse=true|false [extra kubectl args]
getSnapshotNames() {
local readyToUse="${1##readyToUse=}"
shift
if [ -z "$readyToUse" ]; then
echo "Error: No jsonpath for volumesnapshots' ready status was provided."
exit 1
fi
kubectl get volumesnapshots -o jsonpath="{.items[?(.status.readyToUse==$readyToUse)].metadata.name}" --namespace "$NAMESPACE" "$@"
}
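
# Example call (illustrative): list snapshots for this history mode that are
# still provisioning; arguments after readyToUse=... pass through to kubectl:
#   getSnapshotNames readyToUse=false -l history_mode="${HISTORY_MODE}"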

# SLEEP_TIME=0m

# if [ "${HISTORY_MODE}" = "archive" ]; then
# SLEEP_TIME="${ARCHIVE_SLEEP_DELAY}"
# if [ "${ARCHIVE_SLEEP_DELAY}" != "0m" ]; then
# printf "%s artifactDelay.archive is set to %s sleeping...\n" "$(date "+%Y-%m-%d %H:%M:%S")" "${ARCHIVE_SLEEP_DELAY}"
# fi
# elif [ "${HISTORY_MODE}" = "rolling" ]; then
# SLEEP_TIME="${ROLLING_SLEEP_DELAY}"
# if [ "${ROLLING_SLEEP_DELAY}" != "0m" ]; then
# printf "%s artifactDelay.rolling is set to %s sleeping...\n" "$(date "+%Y-%m-%d %H:%M:%S")" "${ROLLING_SLEEP_DELAY}"
# fi
# fi

# if [ "${SLEEP_TIME}" = "0m" ]; then
# printf "%s artifactDelay.HISTORY_MODE was not set! No delay...\n" "$(date "+%Y-%m-%d %H:%M:%S")"
# fi

# sleep "${SLEEP_TIME}"

cd /

ZIP_AND_UPLOAD_JOB_NAME=zip-and-upload-"${HISTORY_MODE}"

- # Delete zip-and-upload job
+ # Delete zip-and-upload job if still around
if kubectl get job "${ZIP_AND_UPLOAD_JOB_NAME}"; then
printf "%s Old zip-and-upload job exits. Attempting to delete.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
if ! kubectl delete jobs "${ZIP_AND_UPLOAD_JOB_NAME}"; then
@@ -16,7 +64,7 @@ else
printf "%s No old zip-and-upload job detected for cleanup.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
fi

- # Delete old PVCs
+ # Delete old PVCs if still around
if [ "${HISTORY_MODE}" = rolling ]; then
if [ "$(kubectl get pvc rolling-tarball-restore)" ]; then
printf "%s PVC Exists.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
@@ -40,12 +88,34 @@ if [ "$(kubectl get pvc "${HISTORY_MODE}"-snap-volume)" ]; then
sleep 5
fi

# while [ "$(kubectl get volumesnapshots -o jsonpath='{.items[?(.status.readyToUse==false)].metadata.name}' --namespace "${NAMESPACE}" -l history_mode="${HISTORY_MODE}")" ]; do
# printf "%s Snapshot already in progress...\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
# sleep 10
# done
# Take volume snapshot
current_date=$(date "+%Y-%m-%d-%H-%M-%S" "$@")
export SNAPSHOT_NAME="$current_date-$HISTORY_MODE-node-snapshot"
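# e.g. SNAPSHOT_NAME=2023-10-03-14-05-09-rolling-node-snapshot (illustrative value)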
# Update volume snapshot name
yq e -i '.metadata.name=strenv(SNAPSHOT_NAME)' createVolumeSnapshot.yaml

printf "%s EBS Snapshot finished!\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
printf "%s Creating snapshot ${SNAPSHOT_NAME} in ${NAMESPACE}.\n" "$(timestamp)"

# Create snapshot
if ! kubectl apply -f createVolumeSnapshot.yaml; then
printf "%s ERROR creating volumeSnapshot ${SNAPSHOT_NAME} in ${NAMESPACE} .\n" "$(timestamp)"
exit 1
fi

sleep 5

# Wait for snapshot to finish
until [ "$(getSnapshotNames readyToUse=true -l history_mode="${HISTORY_MODE}")" ]; do
printf "%s Snapshot in progress. \n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
until [ "$(getSnapshotNames readyToUse=true -l history_mode="${HISTORY_MODE}")" ]; do
sleep 1m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for node
if [ "$(getSnapshotNames readyToUse=true -l history_mode="${HISTORY_MODE}")" ]; then
break
fi
done
done
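
# Note: assuming kubectl >= 1.23 (which added --for=jsonpath), the wait above
# could roughly be expressed as a single command; shown only as a sketch:
#   kubectl wait volumesnapshot -l history_mode="${HISTORY_MODE}" --namespace "${NAMESPACE}" \
#     --for=jsonpath='{.status.readyToUse}'=true --timeout=60m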

printf "%s Snapshot finished!\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"

SNAPSHOTS=$(kubectl get volumesnapshots -o jsonpath='{.items[?(.status.readyToUse==true)].metadata.name}' -l history_mode="${HISTORY_MODE}")
NEWEST_SNAPSHOT=${SNAPSHOTS##* }
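# ${SNAPSHOTS##* } is bash parameter expansion: it strips everything up to the
# last space, keeping the final name in the list. Names begin with a timestamp,
# so the last name in list order should be the newest snapshot.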
@@ -126,6 +196,11 @@ then
exit 1
fi

sleep 5

# Delete all volumesnapshots so they aren't sitting around accruing charges
kubectl delete vs -l history_mode=$HISTORY_MODE

# TODO Check for PVC
printf "%s PersistentVolumeClaim ${HISTORY_MODE}-snap-volume created successfully in namespace ${NAMESPACE}.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"

@@ -183,23 +258,6 @@ if [ "${HISTORY_MODE}" = archive ]; then
yq eval -i "del(.spec.template.spec.containers[0].volumeMounts[2])" mainJob.yaml
fi

# # Switch alternate cloud provider secret name based on actual cloud provider
# if [[ -n "${CLOUD_PROVIDER}" ]]; then
# # Need to account for dynamic volumes removed above. For example if not rolling node then rolling volume is deleted.
# SECRET_NAME="${NAMESPACE}-secret"
# # Index of zip-and-upload container changes depending on if rolling job or archive job
# NUM_CONTAINERS=$(yq e '.spec.template.spec.containers | length' mainJob.yaml)
# # Index of mounts also changes depending on history mode
# NUM_CONTAINER_MOUNTS=$(yq e ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts | length" mainJob.yaml )
# # Secret volume mount is last item in list of volumeMounts for the zip and upload container
# SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.containers[$(( NUM_CONTAINERS - 1 ))].volumeMounts[$(( NUM_CONTAINER_MOUNTS - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml
# # Index of job volumes change depending on history mode
# NUM_JOB_VOLUMES=$(yq e '.spec.template.spec.volumes | length' mainJob.yaml )
# # Setting job secret volume to value set by workflow
# SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].name=strenv(SECRET_NAME)" mainJob.yaml
# SECRET_NAME="${SECRET_NAME}" yq e -i ".spec.template.spec.volumes[$(( NUM_JOB_VOLUMES - 1 ))].secret.secretName=strenv(SECRET_NAME)" mainJob.yaml
# fi

# Service account to be used by entire zip-and-upload job.
SERVICE_ACCOUNT="${SERVICE_ACCOUNT}" yq e -i '.spec.template.spec.serviceAccountName=strenv(SERVICE_ACCOUNT)' mainJob.yaml

@@ -218,7 +276,7 @@ sleep 20
while [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}')" != "True" ]; do
printf "%s Waiting for zip-and-upload job to complete.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
while [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}')" != "True" ]; do
- sleep 1m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for job
+ sleep 2m # without sleep, this loop is a "busy wait". this sleep vastly reduces CPU usage while we wait for job
if [ "$(kubectl get pod -l job-name=zip-and-upload-"${HISTORY_MODE}" --namespace="${NAMESPACE}"| grep -i -e error -e evicted -e pending)" ] || \
[ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace="${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Failed")].type}')" ] ; then
printf "%s Zip-and-upload job failed. This job will end and a new snapshot will be taken.\n" "$(date "+%Y-%m-%d %H:%M:%S" "$@")"
@@ -269,6 +327,9 @@ if ! [ "$(kubectl get jobs "zip-and-upload-${HISTORY_MODE}" --namespace "${NAMES
sleep 5
fi

# Delete all volumesnapshots so they aren't sitting around accruing charges
kubectl delete vs -l history_mode=$HISTORY_MODE

SLEEP_TIME=0m

if [ "${HISTORY_MODE}" = "archive" ]; then
@@ -296,3 +357,4 @@ sleep 5
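# The while-read pipe below prefixes each line of kubectl's output with a timestamp.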
kubectl delete -f volumeFromSnap.yaml | while IFS= read -r line; do printf '%s %s\n' "$(date "+%Y-%m-%d %H:%M:%S" "$@")" "$line"; done
sleep 5
kubectl delete job snapshot-maker --namespace "${NAMESPACE}"
