From 1092ca90978ae6e0adcc02dd217a7cb56d74b0c5 Mon Sep 17 00:00:00 2001 From: Jenkins Date: Wed, 12 Jun 2024 17:51:59 +0000 Subject: [PATCH 01/37] Automated API docs swagger to md conversion (https://jenkins.algol60.net/job/Cray-HPE/job/csm/job/v1.6.0-alpha.52/1/) --- api/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/api/README.md b/api/README.md index aefbb758e306..587b67ef92b5 100644 --- a/api/README.md +++ b/api/README.md @@ -14,4 +14,3 @@ * [Hardware State Manager API v2](./smd.md) * [Cray STS Token Generator v1](./sts.md) * [TAPMS Tenant Status API v1](./tapms-operator.md) - * [User Access Service v1](./uas-mgr.md) From 72af01d682ab5c51a7dfabe2feba5b96d294f8d0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 14 Jun 2024 14:09:15 -0500 Subject: [PATCH 02/37] CASMINST-6879: Add install-goss-tests.sh (#5148) (cherry picked from commit 3a56fb1d344a9a444de6a1405ef91f73ac6dc7a8) Co-authored-by: Mitch Harding (the weird one) --- install/scripts/install-goss-tests.sh | 235 ++++++++++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100755 install/scripts/install-goss-tests.sh diff --git a/install/scripts/install-goss-tests.sh b/install/scripts/install-goss-tests.sh new file mode 100755 index 000000000000..c21eeb2cccad --- /dev/null +++ b/install/scripts/install-goss-tests.sh @@ -0,0 +1,235 @@ +#!/bin/bash +# +# MIT License +# +# (C) Copyright 2021-2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +# This script is a replacement for the script of the same name in the lib +# directory of the CSM tarballs and repository. + +# This script is run twice during CSM installs. It does different things depending +# on when it is run. + +# It is run the first time from the PIT node, after the other NCNs have been deployed. In this +# case, it does the following: +# 1) Finds the latest versions of select RPMs in the expanded CSM tarball +# 2) Copies the RPMs into a prep subdirectory +# 3) Installs the RPMs onto the other NCNs and the PIT node +# +# It is run for the second time from ncn-m001, after PIT redeploy. 
In this case, it does +# the following: +# 1) Installs the RPMs in the prep subdirectory (populated from the first time the script was +# executed) onto ncn-m001 +# 2) Enables and restarts goss-servers on ncn-m001 + +# Globally disable warning about globbing and word splitting +# shellcheck disable=SC2086 + +set -e + +PITFILE="/etc/pit-release" + +function find_latest_rpm { + # $1 - RPM name prefix (e.g. csm-testing, goss-servers, etc) + local name vpattern rpm_regex1 rpm_regex2 filepath + name="$1" + # The first part of the version will be three .-separated numbers + vpattern="[0-9][0-9]*[.][0-9][0-9]*[.][0-9][0-9]*" + + # After the name and version, there are two ways our RPM may be named: + # * It could have a -, followed by characters we do not care about, ending in .rpm + rpm_regex1="${name}-${vpattern}-[^/]*[.]rpm" + # * Or it could just have .rpm after the name and version + rpm_regex2="${name}-${vpattern}[.]rpm" + + # List all RPM files in the rpm directory + filepath=$(find "$RPMDIR" -type f -name \*.rpm \ + | + # Select only names fitting one of our patterns + grep -E "/(${rpm_regex1}|${rpm_regex2})$" \ + | + # Change each line so first it shows just the RPM filename, followed by a blank space, + # followed by the original full path and filename + sed -e "s#^${RPMDIR}.*/\(${rpm_regex1}\)\$#\1 \0#" -e "s#^${RPMDIR}.*/\(${rpm_regex2}\)\$#\1 \0#" \ + | + # Sort the first field (the RPM filename without path) by version + sort -k1V \ + | + # Choose the last one listed (the one with the highest version) + tail -1 \ + | + # Change the line, removing the RPM filename and space, leaving only the full path and filename + sed 's/^[^ ]* //') + if [[ -z ${filepath} ]]; then + echo "The ${name} RPM was not found at the expected location. Ensure this RPM exists under the '$RPMDIR' directory" >&2 + return 1 + fi + echo "${filepath}" + return 0 +} + +function paths_to_basenames { + local rpm_name_list + while [[ $# -gt 0 ]]; do + rpm_name_list="${rpm_name_list} ${1##*/}" + shift + done + echo "${rpm_name_list}" + return 0 +} + +function err_exit { + while [[ $# -gt 0 ]]; do + echo "$1" >&2 + shift + done + exit 1 +} + +function run_on_pit { + [[ -n ${CSM_RELEASE} || -n ${CSM_PATH} ]] || err_exit 'Please set and export $CSM_PATH or $CSM_RELEASE and try again' + + local MTOKEN STOKEN WTOKEN PREPDIR STORAGE_NCNS K8S_NCNS PREP_RPM_DIR ncn + local STORAGE_RPM_PATHS K8S_RPM_PATHS STORAGE_RPM_BASENAMES K8S_RPM_BASENAMES + local HPE_GOSS_RPM CMSTOOLS_RPM CANU_RPM CSM_TESTING_RPM GOSS_SERVERS_RPM PLATFORM_UTILS_RPM IUF_CLI_RPM + + MTOKEN='ncn-m\w+' + STOKEN='ncn-s\w+' + WTOKEN='ncn-w\w+' + + PITDATA=${PITDATA:-/var/www/ephemeral} + CSM_DIRNAME=${CSM_DIRNAME:-${PITDATA}} + CSM_PATH=${CSM_PATH:-${CSM_DIRNAME}/csm-${CSM_RELEASE}} + RPMDIR=${RPMDIR:-${CSM_PATH}/rpm} + PREPDIR="${PITDATA}/prep" + PREP_RPM_DIR="${PREPDIR}/rpms" + + [[ -d ${CSM_PATH} ]] \ + || err_exit "The csm-${CSM_RELEASE} directory was not found at the expected location." \ + "Please set \$CSM_DIRNAME to the absolute path containing the csm-$CSM_RELEASE directory" + + [[ -d ${RPMDIR} ]] \ + || err_exit "The 'rpm' directory was not found in the base directory of the expanded CSM tarball: ${CSM_PATH}" \ + "Please set \$CSM_PATH to the path of the base directory of the expanded CSM tarball, and verify that it contains the 'rpm' directory." 
+ + [[ -d ${PREPDIR} ]] || err_exit "The 'prep' directory was not found in its expected location: '${PREPDIR}'" + + # It's okay if our RPM prep subdirectory already exists (we'll just delete and recreate it), but if it exists + # and isn't a directory, then that means something other than this script created it, so we should be + # cautious and not automatically delete it. + [[ ! -e ${PREP_RPM_DIR} || -d ${PREP_RPM_DIR} ]] \ + || err_exit "ERROR: '${PREP_RPM_DIR}' already exists but it is not a directory. Move, rename, or delete it and then re-run this script" + + STORAGE_NCNS=$(grep -oE "${STOKEN}" /etc/dnsmasq.d/statics.conf | grep -v m001 | sort -u) + K8S_NCNS=$(grep -oE "(${MTOKEN}|${WTOKEN})" /etc/dnsmasq.d/statics.conf | grep -v m001 | sort -u) + + CANU_RPM=$(find_latest_rpm canu) + CSM_TESTING_RPM=$(find_latest_rpm csm-testing) + GOSS_SERVERS_RPM=$(find_latest_rpm goss-servers) + IUF_CLI_RPM=$(find_latest_rpm iuf-cli) + PLATFORM_UTILS_RPM=$(find_latest_rpm platform-utils) + HPE_GOSS_RPM=$(find_latest_rpm hpe-csm-goss-package) + CMSTOOLS_RPM=$(find_latest_rpm cray-cmstools-crayctldeploy) + + # cmstools RPM is not installed on storage nodes + STORAGE_RPM_PATHS="${HPE_GOSS_RPM} ${CANU_RPM} ${CSM_TESTING_RPM} ${GOSS_SERVERS_RPM} ${IUF_CLI_RPM} ${PLATFORM_UTILS_RPM}" + K8S_RPM_PATHS="${STORAGE_RPM_PATHS} ${CMSTOOLS_RPM}" + + # If the RPM prep subdirectory already exists, remove it and its contents + if [[ -d ${PREP_RPM_DIR} ]]; then + echo "Deleting existing directory: '${PREP_RPM_DIR}'" + rm -rf "${PREP_RPM_DIR}" + [[ ! -e ${PREP_RPM_DIR} ]] || err_exit "ERROR: Still exists even after deleting it: '${PREP_RPM_DIR}'" + fi + + # Create prep subdirectory + echo "Creating directory: '${PREP_RPM_DIR}'" + mkdir -v "${PREP_RPM_DIR}" + + # Copy test RPMs into it + cp -v ${K8S_RPM_PATHS} "${PREP_RPM_DIR}" + + STORAGE_RPM_BASENAMES=$(paths_to_basenames ${STORAGE_RPM_PATHS}) + K8S_RPM_BASENAMES=$(paths_to_basenames ${K8S_RPM_PATHS}) + + # Install the RPMs onto the other NCNs + for ncn in ${STORAGE_NCNS}; do + echo "Installing RPMs on ${ncn}" + scp ${STORAGE_RPM_PATHS} ${ncn}:/tmp/ + # CASMINST-6779: Use rpm instead of zypper to avoid problems caused by inaccessible Zypper repos, since we are + # installing from local files anyway. + # shellcheck disable=SC2029 + ssh ${ncn} "cd /tmp && rpm -Uvh --force ${STORAGE_RPM_BASENAMES} && systemctl enable goss-servers && systemctl restart goss-servers && systemctl daemon-reload && echo systemctl daemon-reload has been run && rm -f ${STORAGE_RPM_BASENAMES}" + done + + for ncn in ${K8S_NCNS}; do + echo "Installing RPMs on ${ncn}" + scp ${K8S_RPM_PATHS} ${ncn}:/tmp/ + # CASMINST-6779: Use rpm instead of zypper to avoid problems caused by inaccessible Zypper repos, since we are + # installing from local files anyway. + # shellcheck disable=SC2029 + ssh ${ncn} "cd /tmp && rpm -Uvh --force ${K8S_RPM_BASENAMES} && systemctl enable goss-servers && systemctl restart goss-servers && systemctl daemon-reload && echo systemctl daemon-reload has been run && rm -f ${K8S_RPM_BASENAMES}" + done + + # The RPMs should have been installed on the PIT at the same time csi was installed. Trust, but verify: + echo "Installing RPMs on PIT if needed" + # CASMINST-6779: Use rpm instead of zypper to avoid problems caused by inaccessible Zypper repos, since we are + # installing from local files anyway. 
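+  # Each "rpm -q <name>" below exits nonzero when that package is not yet installed,
+  # so the "|| rpm -Uvh --force ..." alternative only runs for packages that are missing.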
+ rpm -q canu || rpm -Uvh --force ${CANU_RPM} + rpm -q hpe-csm-goss-package || rpm -Uvh --force ${HPE_GOSS_RPM} + rpm -q csm-testing || rpm -Uvh --force ${CSM_TESTING_RPM} + rpm -q goss-servers || (rpm -Uvh --force ${GOSS_SERVERS_RPM} && systemctl enable goss-servers && systemctl restart goss-servers) + rpm -q platform-utils || rpm -Uvh --force ${PLATFORM_UTILS_RPM} + rpm -q iuf-cli || rpm -Uvh --force ${IUF_CLI_RPM} + systemctl daemon-reload && echo "systemctl daemon-reload has been run" +} + +function run_on_m001 { + local PREP_RPM_DIR + + PREP_RPM_DIR=/metal/bootstrap/prep/rpms + [[ -d ${PREP_RPM_DIR} ]] || err_exit "ERROR: Directory does not exist: '${PREP_RPM_DIR}'" + + echo "Installing RPMs from '${PREP_RPM_DIR}':" + rpm -Uvh --force "${PREP_RPM_DIR}/"*.rpm + + echo "Enabling goss-servers" + systemctl enable goss-servers + + echo "Restarting goss-servers" + systemctl restart goss-servers + + echo "Reloading daemons" + systemctl daemon-reload && echo "systemctl daemon-reload has been run" +} + +if [[ -f ${PITFILE} ]]; then + echo "${PITFILE} exists -- running on PIT node" + run_on_pit +elif [[ ${HOSTNAME} == ncn-m001 ]]; then + echo "Running on ncn-m001 (non-PIT)" + run_on_m001 +else + err_exit "ERROR: This script should only be run from the PIT node or ncn-m001" +fi + +echo PASSED From 27a8dac3b9ab091b4e33bc28a4e36f9214e1368f Mon Sep 17 00:00:00 2001 From: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> Date: Sat, 15 Jun 2024 00:43:27 +0530 Subject: [PATCH 03/37] CASMTRIAGE-7069 Update Reboot_NCNs.md (#5149) Signed-off-by: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> --- operations/node_management/Reboot_NCNs.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operations/node_management/Reboot_NCNs.md b/operations/node_management/Reboot_NCNs.md index bf7709bf24ae..bfdf70d56cca 100644 --- a/operations/node_management/Reboot_NCNs.md +++ b/operations/node_management/Reboot_NCNs.md @@ -349,7 +349,7 @@ Before rebooting NCNs: 1. (`ncn-mw#`) Cordon and drain the node. ```bash - kubectl drain --ignore-daemonsets=true --delete-local-data=true + kubectl drain --ignore-daemonsets=true --delete-emptydir-data ``` There may be pods that cannot be gracefully evicted because of Pod Disruption Budgets (PDB). This will result in messages like the following: @@ -370,7 +370,7 @@ Before rebooting NCNs: Then rerun the `kubectl drain` command, and it should report that the node is drained. ```bash - kubectl drain --ignore-daemonsets=true --delete-local-data=true + kubectl drain --ignore-daemonsets=true --delete-emptydir-data ``` 1. If booting from disk is desired, then [set the boot order](../../background/ncn_boot_workflow.md#setting-boot-order). 
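
For the `kubectl drain` step above, a sketch of the full cordon/drain/uncordon cycle with the updated flag (the node name `ncn-w001` is only an example; substitute the NCN being rebooted):

```bash
kubectl drain ncn-w001 --ignore-daemonsets=true --delete-emptydir-data

# After the NCN has been rebooted and rejoins the cluster, allow workloads to be scheduled on it again
kubectl uncordon ncn-w001
kubectl get node ncn-w001
```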
From 3f810aa5d7e72999c562ece6ca69351869be2e49 Mon Sep 17 00:00:00 2001 From: Shane Unruh <87081771+shunr-hpe@users.noreply.github.com> Date: Tue, 18 Jun 2024 13:12:28 -0600 Subject: [PATCH 04/37] CASMHMS-6225 Changed the remove node procedure to allow for already removed nodes (#5166) Changed the remove node procedure to allow for already removed nodes CASMHMS-6225 --- .../node_management/remove_standard_rack_node.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/scripts/operations/node_management/remove_standard_rack_node.sh b/scripts/operations/node_management/remove_standard_rack_node.sh index 5fead528c8f1..4d659efeffcb 100755 --- a/scripts/operations/node_management/remove_standard_rack_node.sh +++ b/scripts/operations/node_management/remove_standard_rack_node.sh @@ -64,7 +64,17 @@ echo echo "==================================================" echo "Removing BMC Event subscriptions" echo "==================================================" -/usr/share/doc/csm/scripts/operations/node_management/delete_bmc_subscriptions.py "${BMC_XNAME}" +EXIT_CODE=0 +/usr/share/doc/csm/scripts/operations/node_management/delete_bmc_subscriptions.py "${BMC_XNAME}" || EXIT_CODE=$? +if [[ $EXIT_CODE -ne 0 ]]; then + if [[ -z ${TOKEN+x} ]]; then + # delete_bmc_subscriptions.py failed because the TOKEN was not set + exit $EXIT_CODE + fi + echo "The redfish subscriptions were not removed from ${BMC_XNAME}. Check the messages above for the specific errors." + echo "This could be because the node has already been physically removed." + echo "The subscriptions will need to be cleaned up when the node is added back, if it is added in a new xname location, and is on a system running CSM 1.4 or older." +fi echo echo "==================================================" From 6464e4581faa8e83420ca2660cfb8b4ae928a346 Mon Sep 17 00:00:00 2001 From: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> Date: Wed, 19 Jun 2024 00:43:46 +0530 Subject: [PATCH 05/37] CASMTRIAGE-7078 Update Rebuild_NCNs.md (#5155) * Update Rebuild_NCNs.md Signed-off-by: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> * Update Rebuild_NCNs.md Signed-off-by: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> --------- Signed-off-by: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> --- operations/node_management/Rebuild_NCNs/Rebuild_NCNs.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/operations/node_management/Rebuild_NCNs/Rebuild_NCNs.md b/operations/node_management/Rebuild_NCNs/Rebuild_NCNs.md index dbf178f6971e..3f8498e1657f 100644 --- a/operations/node_management/Rebuild_NCNs/Rebuild_NCNs.md +++ b/operations/node_management/Rebuild_NCNs/Rebuild_NCNs.md @@ -87,7 +87,9 @@ export CSM_ARTI_DIR="/etc/cray/upgrade/csm/csm-${CSM_RELEASE}/tarball/csm-${CSM_ > > - If the `/etc/cray/upgrade/csm/` directory is empty, create an empty directory at the same path. Download and extract CSM tarball to that directory. > - Update the value of `CSM_ARTI_DIR` with the newly created directory above. +> - Download and install/upgrade the **latest** documentation on `ncn-m001` at path `/root/`. See [Check for Latest Documentation](../../../update_product_stream/README.md#check-for-latest-documentation). 
> - Ensure the `/etc/cray/upgrade/csm/` directory is `ceph` mount using the command below (its output should show `ceph` as the type): + ```bash mount | grep /etc/cray/upgrade/csm ``` From cf2d339f5c2d6885e527da6187cc12e3d6353bd7 Mon Sep 17 00:00:00 2001 From: Nick Davidson <86747615+ndavidson-hpe@users.noreply.github.com> Date: Tue, 18 Jun 2024 13:16:11 -0600 Subject: [PATCH 06/37] CASMTRIAGE-7061: Add new known issue for keycloak (#5164) --- troubleshooting/README.md | 1 + .../Keycloak_Error_Cannot_read_properties.md | 23 +++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 troubleshooting/known_issues/Keycloak_Error_Cannot_read_properties.md diff --git a/troubleshooting/README.md b/troubleshooting/README.md index 0d84134427d9..47feb8020f6a 100644 --- a/troubleshooting/README.md +++ b/troubleshooting/README.md @@ -43,6 +43,7 @@ to the exiting problem seen into the existing search. (The example searches for * [Software Management Services health check](known_issues/sms_health_check.md) * [QLogic driver crash](known_issues/qlogic_driver_crash.md) * [Nexus Fails Authentication with Keycloak Users](known_issues/Nexus_Fail_Authentication_with_Keycloak_Users.md) +* [Keycloak Error "Cannot read properties" in Web UI](known_issues/Keycloak_Error_Cannot_read_properties.md) * [Gigabyte BMC Missing Redfish Data](known_issues/Gigabyte_BMC_Missing_Redfish_Data.md) * [`admin*client-auth` Not Found](known_issues/admin_client_auth_not_found.md) * [Ceph OSD latency](known_issues/ceph_osd_latency.md) diff --git a/troubleshooting/known_issues/Keycloak_Error_Cannot_read_properties.md b/troubleshooting/known_issues/Keycloak_Error_Cannot_read_properties.md new file mode 100644 index 000000000000..52b31bb86598 --- /dev/null +++ b/troubleshooting/known_issues/Keycloak_Error_Cannot_read_properties.md @@ -0,0 +1,23 @@ +# Keycloak Error "Cannot read properties" in Web UI + +There is a known error that occurs after upgrading CSM from 1.4 to CSM 1.5.0 and later. This error +is shown when looking at users in Keycloak's web UI. The error occurs due to a change in how the LDAP +configuration is done in earlier versions of Keycloak. This should not occur on fresh installs. The +error occurs when looking at the user lists on Keycloak Web UI, and once looking at the page leaves a +error message on the page stating "Cannot read properties of undefined (reading 0)" + +## Fix + +To recover from this situation, perform the following procedure. + +1. After seeing the error page you will need to refresh the page and ensure you are on the correct realm again + +1. Go to the `User Federation` section + +1. Click on the LDAP configuration page + +1. Click on the switch before `Enabled` to disable the LDAP configuration + +1. Click on `Disable` on the pop-up to disable the configuration + +1. 
Click on the switch again to enable the LDAP configuration From 5416d6e0d41dcaf6fb90188775e69b652269b843 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 18 Jun 2024 14:20:08 -0500 Subject: [PATCH 07/37] CASMHMS-6206 - Procedure for updating vault with Paradise user/passwords (#5170) * Docs for chaning Paradise Password and update to FW Updates (cherry picked from commit f3590ee64d357f8f6d1962227d7abbe27a504f8b) * Spell check / lint updates (cherry picked from commit c8de805219366690d318277be814ecbe5f61a68c) * More Lint / spell updates (cherry picked from commit 98dd3d27a966d25d54376808e0b480e24f366648) * Lint / spell (cherry picked from commit 1dbc07ab412bf23d68c4a5201b8526c58738c869) * Update operations/firmware/FAS_Paradise.md Signed-off-by: Nathan Rockershousen --------- Signed-off-by: Nathan Rockershousen Co-authored-by: Michael Buchmann Co-authored-by: Nathan Rockershousen --- operations/firmware/FAS_Paradise.md | 33 +++++++++++++- .../Replacing_Foxconn_User_Pass.md | 43 +++++++++++++++++++ 2 files changed, 74 insertions(+), 2 deletions(-) create mode 100644 operations/node_management/Replacing_Foxconn_User_Pass.md diff --git a/operations/firmware/FAS_Paradise.md b/operations/firmware/FAS_Paradise.md index 4e3978cf3c1b..d7bd32c35f99 100644 --- a/operations/firmware/FAS_Paradise.md +++ b/operations/firmware/FAS_Paradise.md @@ -27,6 +27,8 @@ The following targets can be updated with FAS on Paradise Nodes: ## Update Paradise `bmc_active` procedure +NOTE: If a reset of the BMC is required, follow [this procedure](#reset-bmc) before and after the update of each node. *Only do this if required!* + The [`FASUpdate.py script`](FASUpdate_Script.md) can be used to update `bmc_active` - use recipe `foxconn_nodeBMC_bmc.json` The BMC will reboot after the update is complete. @@ -95,7 +97,7 @@ To update using a JSON file and the Cray CLI, use this example JSON file and fol To do an AC power cycle, run the following command (`ncn#`). ```bash -ssh $(xname) "ipmitool raw 0x38 0x02" +ssh admin@$(xname) "ipmitool raw 0x38 0x02" ``` The [`FASUpdate.py script`](FASUpdate_Script.md) can be used to update `erot_active` - use recipe `foxconn_nodeBMC_erot.json` @@ -130,7 +132,7 @@ To update using a JSON file and the Cray CLI, use this example JSON file and fol To do an AC power cycle, run the following command (`ncn#`). ```bash -ssh $(xname) "ipmitool raw 0x38 0x02" +ssh admin@$(xname) "ipmitool raw 0x38 0x02" ``` The [`FASUpdate.py script`](FASUpdate_Script.md) can be used to update `fpga_active` - use recipe `foxconn_nodeBMC_fpga.json` @@ -370,3 +372,30 @@ If the firmware file you need is not listed, run the following command to copy t ```bash /usr/share/doc/csm/scripts/operations/firmware/upload_foxconn_images_tftp.py ``` + +## Reset BMC + +This will reset the BMC to factory resets - including resetting the BMC username and password. +*Only do this if required!* + +Before BMC firmware update (`ncn#`): + +The nodes must be **OFF** before updating BMC (when doing a reset) + +```bash +ssh admin@$(xname) 'fw_setenv openbmconce "factory-reset"' +``` + +**Update BMC firmware using one of the methods above** +NOTE: If the password changes after the boot of BMC, FAS will no longer be able to verify the update and will fail after the time limit. 
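+
+One way to check whether the BMC credential stored in Vault still works after the update (a sketch only; `$(xname)` follows the same convention used in the commands above) is a simple query over SSH:
+
+```bash
+ssh admin@$(xname) 'ipmitool mc info'
+```
+
+If this prompts for an unexpected password, follow the steps below to set the BMC password back to the value stored in Vault.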
+ +After firmware update(`ncn#`): + +If the password changed to something other than the what is stored in vault, update the BMC password: + +```bash +ssh admin@$(xname) 'ipmitool user set password 1 "password"' +``` + +Boot the node. + diff --git a/operations/node_management/Replacing_Foxconn_User_Pass.md b/operations/node_management/Replacing_Foxconn_User_Pass.md new file mode 100644 index 000000000000..66852c9fd095 --- /dev/null +++ b/operations/node_management/Replacing_Foxconn_User_Pass.md @@ -0,0 +1,43 @@ +# Replacing `Foxconn` Username and Passwords in Vault + +`Foxconn` (Paradise) nodes may be shipped with a different default username and password then the system password. +Because of the difference in user/password, these nodes will not be able to be discovered. +Vault needs to be updated with the `Foxconn` username and password using the `FoxconnUserPass.py` script or manually. + +## Procedure using the `FoxconnUserPass.py` script + +1. (`ncn-mw#`) Set up API token. + + ```bash + export TOKEN=$(curl -k -s -S -d grant_type=client_credentials -d client_id=admin-client -d client_secret=$(kubectl get secrets admin-client-auth -o jsonpath='{.data.client-secret}' | base64 -d) https://api-gw-service-nmn.local/keycloak/realms/shasta/protocol/openid-connect/token | jq -r '.access_token') + ``` + +1. (`ncn-mw#`) Set helper variable. + + ```bash + DOCS_DIR=/usr/share/doc/csm/scripts + ``` + +1. (`ncn-mw#`) Run the `Foxconn` update script + + ```bash + $DOCS_DIR/hardware_state_manager/FoxconnUserPass.py + ``` + + This will ask for the BMC username and password for the Paradise nodes. + The scirpt will look for undiscovered nodes, if it finds a `Foxconn` node, update vault with correct credentials. + +1. (`ncn-mw#`) Wait 10+ minutes for changes to take affect and nodes to be discovered. To check nodes which have failed to be discovered: + + ```bash + cray hsm inventory redfishEndpoints list --format json | jq '.[] | .[] | select (.DiscoveryInfo.LastDiscoveryStatus!="DiscoverOK")' + ``` + +## Manual procedure to update credentials in vault + +1. 
(`ncn-mw#`) Use the Cray CLI to update vault through HSM (replace `BMC_xname` with the xname of the BMC, `Foxconn_user` with the `Foxconn` default username, and `Foxconn_pass` with the `Foxconn` default password): + NOTE: `BMC_xname` needs to be in the line twice + + ```bash + cray hsm inventory redfishEndpoints update BMC_xname -id BMC_xname --user Foxconn_user --password Foxconn_pass + ``` From f91da084d36a92cda545ce75bfb257dd86d85799 Mon Sep 17 00:00:00 2001 From: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> Date: Wed, 19 Jun 2024 00:52:05 +0530 Subject: [PATCH 08/37] CASMTRIAGE-7081 Update Access_the_Keycloak_User_Management_UI.md (#5160) * Update Access_the_Keycloak_User_Management_UI.md Signed-off-by: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> * Update Access_the_Keycloak_User_Management_UI.md Signed-off-by: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> * Update operations/security_and_authentication/Access_the_Keycloak_User_Management_UI.md Signed-off-by: Nathan Rockershousen --------- Signed-off-by: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> Signed-off-by: Nathan Rockershousen Co-authored-by: Nathan Rockershousen --- .../Access_the_Keycloak_User_Management_UI.md | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/operations/security_and_authentication/Access_the_Keycloak_User_Management_UI.md b/operations/security_and_authentication/Access_the_Keycloak_User_Management_UI.md index c89f2da43b54..b481c837b02b 100644 --- a/operations/security_and_authentication/Access_the_Keycloak_User_Management_UI.md +++ b/operations/security_and_authentication/Access_the_Keycloak_User_Management_UI.md @@ -7,7 +7,7 @@ See [Create Internal User Accounts in the Keycloak Shasta Realm](Create_Internal - This procedure uses `SYSTEM_DOMAIN_NAME` as an example for the DNS name of the non-compute node \(NCN\). Replace this name with the actual NCN's DNS name while executing this procedure. - This procedure assumes that the password for the Keycloak `admin` account is known. The Keycloak password is set during the software installation process. - - (`ncn-mw#`) The password can be obtained with the following command: + (`ncn-mw#`) The password can be obtained with the following command: ```bash kubectl get secret -n services keycloak-master-admin-auth --template={{.data.password}} | base64 --decode @@ -19,14 +19,21 @@ See [Create Internal User Accounts in the Keycloak Shasta Realm](Create_Internal The following is an example URL for a system: `https://auth.cmn.system1.us.cray.com/keycloak/` + The value of `SYSTEM_DOMAIN_NAME` for a given cluster is obtained as shown in the following example: + + ```bash + # echo $SYSTEM_DOMAIN + system1.us.cray.com + ``` + The browser may return an error message similar to the following when `auth.cmn.SYSTEM_DOMAIN_NAME/keycloak` is launched for the first time: - ```text - This Connection Is Not Private + ```text + This Connection Is Not Private - This website may be impersonating "hostname" to steal your personal or financial information. - You should go back to the previous page. - ``` + This website may be impersonating "hostname" to steal your personal or financial information. + You should go back to the previous page. + ``` See [Make HTTPS Requests from Sources Outside the Management Kubernetes Cluster](Make_HTTPS_Requests_from_Sources_Outside_the_Management_Kubernetes_Cluster.md) for more information on getting the Certificate Authority \(CA\) certificate on the system. 
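
As an optional check before opening the browser (a sketch only; the certificate file name is hypothetical, and `SYSTEM_DOMAIN_NAME` must be replaced with the real domain), confirm that the Keycloak endpoint answers over HTTPS once the CA certificate is available:

```bash
# certificate_authority.crt is a placeholder for wherever the CA certificate was saved
curl --cacert certificate_authority.crt -s -o /dev/null -w '%{http_code}\n' \
    "https://auth.cmn.SYSTEM_DOMAIN_NAME/keycloak/"
```

An HTTP status code such as `200` or a `30x` redirect indicates that DNS, ingress, and the CA certificate are working; a `000` result usually means a TLS or connection problem.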
From c27a0aca423f8ace833876506b32566ffc370428 Mon Sep 17 00:00:00 2001 From: Rambabu Bolla Date: Wed, 19 Jun 2024 00:53:21 +0530 Subject: [PATCH 09/37] =?UTF-8?q?CASMMON-394:=20CSM1.5.1:=20"grok-exporter?= =?UTF-8?q?"=20pod=20status=20showing=20as=20"Contain=E2=80=A6=20(#5157)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * CASMMON-394: CSM1.5.1: "grok-exporter" pod status showing as "ContainerStatusUnknown" and "Error" * Update operations/system_management_health/Grok-Exporter_Error.md Signed-off-by: Nathan Rockershousen --------- Signed-off-by: Nathan Rockershousen Co-authored-by: Nathan Rockershousen --- operations/README.md | 2 +- .../Grok-Exporter_Error.md | 25 +++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 operations/system_management_health/Grok-Exporter_Error.md diff --git a/operations/README.md b/operations/README.md index e483f5b965a1..dbaef7073ab7 100644 --- a/operations/README.md +++ b/operations/README.md @@ -36,7 +36,6 @@ The following administrative topics can be found in this guide: - [MetalLB in BGP-mode](#metallb-in-bgp-mode) - [Spire](#spire) - [Update firmware with FAS](#update-firmware-with-fas) -- [User Access Service (UAS)](#user-access-service-uas) - [System Admin Toolkit (SAT)](#system-admin-toolkit-sat) - [Install and Upgrade Framework (IUF)](#install-and-upgrade-framework-iuf) - [Backup and recovery](#backup-and-recovery) @@ -466,6 +465,7 @@ confident that a lack of issues indicates the system is operating normally. - [Grafterm](system_management_health/Grafterm.md) - [Remove Kiali](system_management_health/Remove_Kiali.md) - [`prometheus-kafka-adapter` errors during installation](system_management_health/Prometheus_Kafka_Error.md) +- [`grok-exporter` errors during installation](system_management_health/Grok-Exporter_Error.md) - [Troubleshoot Prometheus Alerts](system_management_health/Troubleshoot_Prometheus_Alerts.md) - [Configure UAN Node Exporter](system_management_health/uan_node_exporter_configs.md) diff --git a/operations/system_management_health/Grok-Exporter_Error.md b/operations/system_management_health/Grok-Exporter_Error.md new file mode 100644 index 000000000000..6e2665f40256 --- /dev/null +++ b/operations/system_management_health/Grok-Exporter_Error.md @@ -0,0 +1,25 @@ +# `grok-exporter` pod status showing as `ContainerStatusUnknown` Error + +## Symptom + +On CSM upgrade, the grok-exporter pod log has errors similar to the following: + +```text +The node was low on resource: ephemeral-storage. Container grok-exporter was using 127200Ki, which exceeds its request of 0. +``` + +## Solution + +This Kafka service does not exist, because the [System Monitoring Application (SMA)](../../glossary.md#system-monitoring-application-sma) +has not been installed yet. This causes the above errors for retry to be logged. Prometheus can operate without SMA Kafka and it will +periodically retry the connection to Kafka. These errors will be logged until SMA is installed. Therefore, if they are seen before SMA is +installed, then disregard them. + +The root file system on master is at more than 80% but keeps hitting the threshold to raise `NodeHasDiskPressure`(85%) which causes the +node to then attempt to reclaim ephemeral-storage. 
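+
+To confirm the condition before cleaning up (a sketch only; run it on the affected master node), check root filesystem usage and whether Kubernetes currently reports disk pressure for the node:
+
+```bash
+df -h /
+kubectl describe node "$(hostname)" | grep -i diskpressure
+```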
+ +Increase/clean the root filesystem and delete the grok exporter pod as follows: + +```bash +kubectl delete pod -l app=grok-exporter -n sysmgmt-health +``` From eec5a34eb69f938c269d33d6e464734af47b7298 Mon Sep 17 00:00:00 2001 From: Srinivas-Anand-HPE <119280543+Srinivas-Anand-HPE@users.noreply.github.com> Date: Wed, 19 Jun 2024 01:15:37 +0530 Subject: [PATCH 10/37] =?UTF-8?q?CASMTRIAGE-6990=20update=20the=20IUF=20ma?= =?UTF-8?q?nual=20configuration=20instructions=20UAS=20and=20Badger=20conf?= =?UTF-8?q?i=E2=80=A6=20(#5142)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit update the IUF manual configuration instructions UAS and Badger configurations to match the network settings for SLURM and/or PBS Pro Co-authored-by: Don Bahls <114519367+don-bahls-hpe@users.noreply.github.com> Signed-off-by: Srinivas-Anand-HPE <119280543+Srinivas-Anand-HPE@users.noreply.github.com> update the IUF manual configuration instructions UAS and Badger configurations to match the network settings for SLURM and/or PBS Pro --- operations/iuf/workflows/configuration.md | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/operations/iuf/workflows/configuration.md b/operations/iuf/workflows/configuration.md index 8b7d7c58ed30..bbaa36baf438 100644 --- a/operations/iuf/workflows/configuration.md +++ b/operations/iuf/workflows/configuration.md @@ -203,11 +203,18 @@ required for initial installation scenarios. - Configure SAT authentication via `sat auth` - Generate SAT S3 credentials - Configure system revision information via `sat setrev` -- UAS - - Configure UAS network settings - - The network settings for UAS must match the WLM to allow job submission from UAIs -- Badger - - Update CSM Diags network attachment definition +- SLURM + - UAS + - Configure UAS network settings + - The network settings for UAS must match the SLURM WLM to allow job submission from UAIs + - CSM Diags + - Update CSM Diags network attachment definition +- PBS Pro + - UAS + - Configure UAS network settings + - The network settings for UAS must match the PBS Pro WLM to allow job submission from UAIs + - CSM Diags + - Update CSM Diags network attachment definition Once this step has completed: From ee3230ac96d66add282dca554b83f826a314e8d2 Mon Sep 17 00:00:00 2001 From: Michael Tupitsyn Date: Tue, 25 Jun 2024 12:02:53 -0700 Subject: [PATCH 11/37] CASMTRIAGE-7092 Workaround for inconsistent skopeo image name shortcuts (#5173) It looks like podman works differently with container image aliases on different systems. The `podman load -i skopeo.tar` creates image named `skopeo:xxx`, which resolves as shortcut to `docker.io/library/skopeo.xxx`. However, command `podman run skopeo.xxx` may try to run `docker.io/library/skopeo.xxx` or `quauy.io/skopeo.xxx`, it appears to be unpredictable. The fix is to capture actual image name from `podman load` output and use it in `podman run`. We actually use this technique in hpc-shastarelm-release (load-vendor-image procedure). 
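
A minimal sketch of the pattern applied below (the paths mirror the prerequisites script; the trailing `--version` invocation is only an example of using the captured name):

```bash
# Capture the image reference podman actually registered instead of guessing the shortcut
SKOPEO_IMAGE=$(podman load -q -i "${CSM_ARTI_DIR}/vendor/skopeo.tar" 2> /dev/null | sed -e 's/^.*: //')
podman run --rm "${SKOPEO_IMAGE}" --version
```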
--- upgrade/scripts/upgrade/prerequisites.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/upgrade/scripts/upgrade/prerequisites.sh b/upgrade/scripts/upgrade/prerequisites.sh index a8f87ccfc163..f35d5172367f 100755 --- a/upgrade/scripts/upgrade/prerequisites.sh +++ b/upgrade/scripts/upgrade/prerequisites.sh @@ -523,14 +523,14 @@ if [[ ${state_recorded} == "0" && $(hostname) == "${PRIMARY_NODE}" ]]; then fi set -e - # Skopeo image is stored as "skopeo:csm-${CSM_RELEASE}" - podman load -i "${CSM_ARTI_DIR}/vendor/skopeo.tar" + # Skopeo image is stored as "skopeo:csm-${CSM_RELEASE}", which may resolve to docker.io/lirary/skopeo or quay.io/skopeo, depending on configured shortcuts + SKOPEO_IMAGE=$(podman load -q -i "${CSM_ARTI_DIR}/vendor/skopeo.tar" 2> /dev/null | sed -e 's/^.*: //') nexus_images=$(yq r -j "${CSM_MANIFESTS_DIR}/platform.yaml" 'spec.charts.(name==cray-precache-images).values.cacheImages' | jq -r '.[] | select( . | contains("nexus"))') worker_nodes=$(grep -oP "(ncn-w\d+)" /etc/hosts | sort -u) while read -r nexus_image; do echo "Uploading $nexus_image into Nexus ..." podman run --rm -v "${CSM_ARTI_DIR}/docker":/images \ - "skopeo:csm-${CSM_RELEASE}" \ + "${SKOPEO_IMAGE}" \ --override-os=linux --override-arch=amd64 \ copy \ --remove-signatures \ From 810c5ae40aada0335623716309d9bf32244ec3f0 Mon Sep 17 00:00:00 2001 From: Shane Unruh <87081771+shunr-hpe@users.noreply.github.com> Date: Tue, 25 Jun 2024 13:03:49 -0600 Subject: [PATCH 12/37] CASMHMS-5864 Changed subscription removal order in remove blade doc (#5176) In the documentation on how to remove a liquid cooled blade, this moves the step to clear the BMC subscriptions before the step to disable the redfish endpoint. --- ...ing_a_Liquid-cooled_blade_from_a_System.md | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/operations/node_management/Removing_a_Liquid-cooled_blade_from_a_System.md b/operations/node_management/Removing_a_Liquid-cooled_blade_from_a_System.md index cb225137f844..5013657299c3 100644 --- a/operations/node_management/Removing_a_Liquid-cooled_blade_from_a_System.md +++ b/operations/node_management/Removing_a_Liquid-cooled_blade_from_a_System.md @@ -36,16 +36,7 @@ This procedure will remove a liquid-cooled blades from an HPE Cray EX system. cray bos v2 sessions create --template-name $BOS_TEMPLATE --operation shutdown --limit x9000c3s0b0n0,x9000c3s0b0n1,x9000c3s0b1n0,x9000c3s0b1n1 ``` -### 2. Disable the Redfish endpoints for the nodes - -1. (`ncn-mw#`) Temporarily disable the Redfish endpoints for `NodeBMCs` present in the blade. - - ```bash - cray hsm inventory redfishEndpoints update --enabled false x9000c3s0b0 --id x9000c3s0b0 - cray hsm inventory redfishEndpoints update --enabled false x9000c3s0b1 --id x9000c3s0b1 - ``` - -### 3. Clear Redfish event subscriptions from BMCs on the blade +### 2. Clear Redfish event subscriptions from BMCs on the blade 1. (`ncn-mw#`) Set the environment variable `SLOT` to the blade's location. @@ -76,6 +67,15 @@ This procedure will remove a liquid-cooled blades from an HPE Cray EX system. Successfully deleted https://x3000c0s9b0/redfish/v1/EventService/Subscriptions/1 ``` +### 3. Disable the Redfish endpoints for the nodes + +1. (`ncn-mw#`) Temporarily disable the Redfish endpoints for `NodeBMCs` present in the blade. 
+ + ```bash + cray hsm inventory redfishEndpoints update --enabled false x9000c3s0b0 --id x9000c3s0b0 + cray hsm inventory redfishEndpoints update --enabled false x9000c3s0b1 --id x9000c3s0b1 + ``` + ### 4. Clear the node controller settings 1. (`ncn-mw#`) Remove the system-specific settings from each node controller on the blade. From f1f41879aa5a727afbb8bb65bd2cf48ac26e2638 Mon Sep 17 00:00:00 2001 From: David Laine <77020169+dlaine-hpe@users.noreply.github.com> Date: Tue, 25 Jun 2024 14:04:47 -0500 Subject: [PATCH 13/37] CASMCMS-9028 - clarify rbd instructions for an IMS remote build node. (#5177) --- .../Configure_a_Remote_Build_Node.md | 38 ++++++++++++------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/operations/image_management/Configure_a_Remote_Build_Node.md b/operations/image_management/Configure_a_Remote_Build_Node.md index f720adc4d58c..af4dcbcbc580 100644 --- a/operations/image_management/Configure_a_Remote_Build_Node.md +++ b/operations/image_management/Configure_a_Remote_Build_Node.md @@ -40,6 +40,8 @@ the K8S pods. There are two primary reasons to choose to run jobs on a remote bu run on the native architecture of the remote node. Running `aarch64` image builds on an `aarch64` remote node can see over a 10 fold performance increase versus running the same job under emulation. +Multiple remote build nodes may be created in any mix of architectures. + Any job with an architecture matching a defined remote build node will be run remotely with no other changes needed. If there are multiple remote build nodes with the same architecture, there is a basic load balancing algorithm in place to spread the workload between all active remote build nodes. @@ -47,8 +49,8 @@ algorithm in place to spread the workload between all active remote build nodes. When a new IMS job is created, the defined remote build nodes are checked to ensure SSH access is available and the required software is present on the node. If either of these checks fail, the node will not be used for the new job. If all matching remote nodes fail this check, the job will be created to run within the -K8S environment as a standard local job. There is output in the `cray-ims` pod that will indicate why defined -remote nodes are not being used if these checks fail. +K8S environment as a standard local job. There is output in the `cray-ims` pod log that will indicate why +defined remote nodes are not being used if these checks fail. See [Troubleshoot Remote Build Node](Troubleshoot_Remote_Build_Node.md) for issues running remote jobs. @@ -128,7 +130,7 @@ used to work with images, or if it can still run compute jobs while building ima ### Create a barebones IMS builder image If there is no existing compute image to boot a node with, one can be created based on the barebones -image that is installed with CSM. +image that is installed with CSM. This image may be used to boot multiple remote build nodes. 1. (`ncn-mw#`) Find the latest CSM install on the system. @@ -213,7 +215,7 @@ image that is installed with CSM. Expected output will be something similar to: - ```json + ```json { "last_updated": "2024-04-23T16:44:55Z", "layers": [ @@ -348,16 +350,16 @@ image that is installed with CSM. 
{ "boot_sets": { "compute": { - "arch": "X86", - "etag": "9bbdebd4e51f32a2db8f8dd3e6124166", - "kernel_parameters": "ip=dhcp quiet spire_join_token=${SPIRE_JOIN_TOKEN} root=live:s3://boot-images/f6d9cfc7-9291-4c46-8350-c252b919d396/rootfs nmd_data=url=s3://boot-images/f6d9cfc7-9291-4c46-8350-c252b919d396/rootfs,etag=9bbdebd4e51f32a2db8f8dd3e6124166", - "node_roles_groups": [ - "Compute" - ], - "path": "s3://boot-images/f6d9cfc7-9291-4c46-8350-c252b919d396/manifest.json", - "rootfs_provider": "", - "rootfs_provider_passthrough": "", - "type": "s3" + "arch": "X86", + "etag": "9bbdebd4e51f32a2db8f8dd3e6124166", + "kernel_parameters": "ip=dhcp quiet spire_join_token=${SPIRE_JOIN_TOKEN} root=live:s3://boot-images/f6d9cfc7-9291-4c46-8350-c252b919d396/rootfs nmd_data=url=s3://boot-images/f6d9cfc7-9291-4c46-8350-c252b919d396/rootfs,etag=9bbdebd4e51f32a2db8f8dd3e6124166", + "node_roles_groups": [ + "Compute" + ], + "path": "s3://boot-images/f6d9cfc7-9291-4c46-8350-c252b919d396/manifest.json", + "rootfs_provider": "", + "rootfs_provider_passthrough": "", + "type": "s3" } }, "name": "bos_ims_remote_node", @@ -386,6 +388,14 @@ directly into the IMS builder node. Below is a procedure to provide the IMS builder node with additional storage. +NOTE: The Ceph storage described below has several important characteristics to keep in mind: + +* This RBD device is created globally. +* Each RBD device will still exist after the remote build node is rebooted. +* Each RBD device must have a unique name, but may be re-used after the node is rebooted. +* This type of RBD device may only be mounted on one node - one must be created for each remote build node. +* If the remote build node is rebooted, the RBD device must be manually mounted again. + 1. Set an environment variable for the xname of the remote build node. ```bash From 1520fb697568e82086b0287ccebb732e9fb44dc9 Mon Sep 17 00:00:00 2001 From: Srinivas-Anand-HPE <119280543+Srinivas-Anand-HPE@users.noreply.github.com> Date: Wed, 26 Jun 2024 00:41:16 +0530 Subject: [PATCH 14/37] =?UTF-8?q?CASMTRIAGE-7055:=20Check=20for=20the=20la?= =?UTF-8?q?test=20docs-csm=20before=20starting=20the=20up=E2=80=A6=20(#518?= =?UTF-8?q?0)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * CASMTRIAGE-7055: Check for the latest docs-csm before starting the upgrade * Style check Signed-off-by: Russell Bunch * Fix indentation Signed-off-by: Russell Bunch * Spellcheck Signed-off-by: Russell Bunch --------- Signed-off-by: Russell Bunch Co-authored-by: Russell Bunch --- operations/iuf/workflows/preparation.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/operations/iuf/workflows/preparation.md b/operations/iuf/workflows/preparation.md index 90411a380639..272e46cc548e 100644 --- a/operations/iuf/workflows/preparation.md +++ b/operations/iuf/workflows/preparation.md @@ -31,9 +31,15 @@ This section defines environment variables and directory content that is used th mkdir -p "${ACTIVITY_DIR}" "${MEDIA_DIR}" "${ADMIN_DIR}" ``` -Once this step has completed: + Once this step has completed: -- Environment variables have been set and required IUF directories have been created + - Environment variables have been set and required IUF directories have been created + +1. Ensure that the + [latest version of `docs-csm`](https://github.com/Cray-HPE/docs-csm/blob/release/1.6/update_product_stream/README.md#check-for-latest-documentation) + is installed for the target CSM version being installed or upgraded. 
+ + For example: when upgrading from CSM version 1.5.0 to version 1.5.1, install `docs-csm-1.5.1.noarch` ## 2. Use of `iuf activity` From 8b7fdd68f397a8971b39cc9fa2724f0ae92830d5 Mon Sep 17 00:00:00 2001 From: Mitch Harding Date: Wed, 26 Jun 2024 15:27:24 -0400 Subject: [PATCH 15/37] CASMINST-6902: Improve/automate PIT data backup (#5182) --- install/deploy_final_non-compute_node.md | 78 ++++--------- install/scripts/backup-pit-data.sh | 139 +++++++++++++++++++++++ 2 files changed, 159 insertions(+), 58 deletions(-) create mode 100755 install/scripts/backup-pit-data.sh diff --git a/install/deploy_final_non-compute_node.md b/install/deploy_final_non-compute_node.md index e88d458fe7be..0d96d4007d06 100644 --- a/install/deploy_final_non-compute_node.md +++ b/install/deploy_final_non-compute_node.md @@ -190,70 +190,34 @@ The steps in this section load hand-off data before a later procedure reboots th It is important to backup some files from `ncn-m001` before it is rebooted. -1. (`pit#`) Set up passwordless SSH **to** the PIT node from `ncn-m002`. - - > The `ssh` command below may prompt for the NCN root password. - - ```bash - ssh ncn-m002 cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys && - chmod 600 /root/.ssh/authorized_keys - ``` - 1. (`pit#`) Stop the typescript session. ```bash exit ``` -1. (`pit#`) Preserve logs and configuration files if desired. +1. (`pit#`) Create PIT backup and copy it off. - The following commands create a `tar` archive of select files on the PIT node. This archive is located - in a directory that will be backed up in the next steps. + This script creates a backup of select files on the PIT node, copying them to both + another master NCN and to S3. + + > The script below may prompt for the NCN root password. ```bash - mkdir -pv "${PITDATA}"/prep/logs && - ls -d \ - /etc/dnsmasq.d \ - /etc/os-release \ - /etc/sysconfig/network \ - /opt/cray/tests/cmsdev.log \ - /opt/cray/tests/install/logs \ - /opt/cray/tests/logs \ - /root/.canu \ - /root/.config/cray/logs \ - /root/csm*.{log,txt} \ - /tmp/*.log \ - /usr/share/doc/csm/install/scripts/csm_services/yapl.log \ - /var/log/conman \ - /var/log/zypper.log 2>/dev/null | - sed 's_^/__' | - xargs tar -C / -czvf "${PITDATA}/prep/logs/pit-backup-$(date +%Y-%m-%d_%H-%M-%S).tgz" + /usr/share/doc/csm/install/scripts/backup-pit-data.sh ``` -1. (`pit#`) Copy some of the installation files to `ncn-m002`. - - These files will be copied back to `ncn-m001` after the PIT node is rebooted. + Ensure that the script output ends with `COMPLETED`, indicating that the procedure was successful. - ```bash - ssh ncn-m002 \ - "mkdir -pv /metal/bootstrap - rsync -e 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null' -rltD -P --delete pit.nmn:'${PITDATA}'/prep /metal/bootstrap/ - rsync -e 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null' -rltD -P --delete pit.nmn:'${CSM_PATH}'/images/pre-install-toolkit/pre-install-toolkit*.iso /metal/bootstrap/" - ``` +1. In the output of the script run in the previous step, note the value it reports for the `first-master-hostname`. + This will be needed in a later step. -1. (`pit#`) Upload install files to S3 in the cluster. 
+ Example output excerpt: - ```bash - PITBackupDateTime=$(date +%Y-%m-%d_%H-%M-%S) - tar -czvf "${PITDATA}/PitPrepIsoConfigsBackup-${PITBackupDateTime}.tgz" "${PITDATA}/prep" "${PITDATA}/configs" "${CSM_PATH}/images/pre-install-toolkit/pre-install-toolkit"*.iso && - cray artifacts create config-data \ - "PitPrepIsoConfigsBackup-${PITBackupDateTime}.tgz" \ - "${PITDATA}/PitPrepIsoConfigsBackup-${PITBackupDateTime}.tgz" && - rm -v "${PITDATA}/PitPrepIsoConfigsBackup-${PITBackupDateTime}.tgz" && echo COMPLETED + ```text + first-master-hostname: ncn-m002 ``` - Ensure that the previous command chain output ends with `COMPLETED`, indicating that the procedure was successful. - ## 4. Reboot 1. (`external#`) Open a serial console to the PIT node, if one is not already open. @@ -327,13 +291,15 @@ It is important to backup some files from `ncn-m001` before it is rebooted. 1. (`ncn-m001#`) Restore and verify the site link. Restore networking files from the manual backup taken during the - [Backup](#33-backup) step. + [Backup](#33-backup) step. Set the `FM` variable to the `first-master-hostname` + value noted in that section. > **`NOTE`** Do NOT change any default NCN hostname; otherwise, unexpected deployment or upgrade errors may happen. ```bash SYSTEM_NAME=eniac - rsync "ncn-m002:/metal/bootstrap/prep/${SYSTEM_NAME}/pit-files/ifcfg-lan0" /etc/sysconfig/network/ && \ + FM=ncn-m002 + rsync "${FM}:/metal/bootstrap/prep/${SYSTEM_NAME}/pit-files/ifcfg-lan0" /etc/sysconfig/network/ && \ wicked ifreload lan0 && \ wicked ifstatus lan0 ``` @@ -378,19 +344,15 @@ It is important to backup some files from `ncn-m001` before it is rebooted. exit ``` - 1. (`ncn-m002#`) Copy install files back to `ncn-m001`. + 1. If `ncn-m002` is not the `first-master-hostname` noted in the [Backup](#33-backup) step, then SSH to that node. - ```bash - rsync -rltDv -P /metal/bootstrap ncn-m001:/metal/ && rm -rfv /metal/bootstrap - ``` - - 1. (`ncn-m002#`) Log out of `ncn-m002`. + 1. (`first-master-hostname#`) Copy install files back to `ncn-m001`. ```bash - exit + rsync -rltDv -P /metal/bootstrap ncn-m001:/metal/ && rm -rfv /metal/bootstrap ``` - 1. Log in to `ncn-m001`. + 1. Log out of the other nodes and log in to `ncn-m001`. SSH back into `ncn-m001` or log in at the console. diff --git a/install/scripts/backup-pit-data.sh b/install/scripts/backup-pit-data.sh new file mode 100755 index 000000000000..cf97d8d6518c --- /dev/null +++ b/install/scripts/backup-pit-data.sh @@ -0,0 +1,139 @@ +#!/bin/bash +# +# MIT License +# +# (C) Copyright 2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +set -euo pipefail + +# This means that something like /tmp/*.log will evaluate to an empty string if no files fit the pattern +shopt -s nullglob + +# This script is a replacement for the steps that were previously done manually +# during the "Deploy Final NCN" step of CSM installs. + +function err_exit { + echo "ERROR: $*" >&2 + exit 1 +} + +function dir_exists { + [[ -e $1 ]] || err_exit "Directory '$1' does not exist" + [[ -d $1 ]] || err_exit "'$1' exists but is not a directory" +} + +function run_cmd { + echo "# $*" + "$@" || err_exit "Command failed with exit code $?: $*" +} + +# Ensure that PITDATA and CSM_PATH variables are set +[[ -v PITDATA && -n ${PITDATA} ]] || err_exit "PITDATA variable must be set" +[[ -v CSM_PATH && -n ${CSM_PATH} ]] || err_exit "CSM_PATH variable must be set" + +# Make sure that expected directories exist and are actually directories +for DIR in "${PITDATA}" "${PITDATA}/prep" "${PITDATA}/configs" "${CSM_PATH}" \ + "${CSM_PATH}/images" "${CSM_PATH}/images/pre-install-toolkit"; do + + dir_exists "${DIR}" + +done + +PIT_ISO_DIR="${CSM_PATH}/images/pre-install-toolkit" + +# Make sure that expected PIT iso file can be found +compgen -G "${PIT_ISO_DIR}/pre-install-toolkit*.iso" > /dev/null 2>&1 || err_exit "PIT ISO file (${PIT_ISO_DIR}/pre-install-toolkit*.iso) not found" + +# Make sure we can figure out the first master node +DATA_JSON="${PITDATA}/configs/data.json" +[[ -e ${DATA_JSON} ]] || err_exit "File does not exist: '${DATA_JSON}'" +[[ -f ${DATA_JSON} ]] || err_exit "Exists but is not a regular file: '${DATA_JSON}'" +[[ -s ${DATA_JSON} ]] || err_exit "File exists but is empty: '${DATA_JSON}'" + +FM=$(jq -r '."Global"."meta-data"."first-master-hostname"' < "${DATA_JSON}") || err_exit "Error getting first-master-hostname from '${DATA_JSON}'" +[[ -n ${FM} ]] || err_exit "No first-master-hostname found in '${DATA_JSON}'" +echo "first-master-hostname: $FM" + +# Set up passwordless SSH **to** the PIT node from the first-master node +echo "If prompted, enter the $(whoami) password for ${FM}" +ssh "${FM}" cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys || err_exit "Unable to read ${FM}:/root/.ssh/id_rsa.pub and/or write to /root/.ssh/authorized_keys" +run_cmd chmod 600 /root/.ssh/authorized_keys + +# Okay, everything seems good +run_cmd mkdir -pv "${PITDATA}"/prep/logs + +# Because some of these files are log files that are changing during this procedure, any call to directly +# tar them may result in the tar command failing. 
Thus, we first copy all of these files into a temporary +# directory, and from there we create the tar archive + +TEMPDIR=$(mktemp -d) || err_exit "Command failed: mktemp -d" + +echo "Copying selected files to temporary directory" + +for BACKUP_TARGET in \ + /etc/conman.conf \ + /etc/dnsmasq.d \ + /etc/os-release \ + /etc/sysconfig/network \ + /opt/cray/tests/cmsdev.log \ + /opt/cray/tests/install/logs \ + /opt/cray/tests/logs \ + /root/.bash_history \ + /root/.canu \ + /root/.config/cray/logs \ + /root/csm*.{log,txt} \ + /tmp/*.log \ + /usr/share/doc/csm/install/scripts/csm_services/yapl.log \ + /var/log; do + + [[ -e ${BACKUP_TARGET} ]] || continue + DIRNAME=$(dirname "${BACKUP_TARGET}") + TARG_DIR="${TEMPDIR}${DIRNAME}" + run_cmd mkdir -pv "${TARG_DIR}" + run_cmd cp -pr "${BACKUP_TARGET}" "${TARG_DIR}" + +done + +echo "Creating PIT backup tarfile" + +pushd "${TEMPDIR}" +run_cmd tar -czvf "${PITDATA}/prep/logs/pit-backup-$(date +%Y-%m-%d_%H-%M-%S).tgz" --remove-files * +popd +run_cmd rmdir -v "${TEMPDIR}" + +echo "Copying files to ${FM}" +ssh "${FM}" \ + "mkdir -pv /metal/bootstrap && + rsync -e 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null' -rltD -P --delete pit.nmn:'${PITDATA}'/prep /metal/bootstrap/ && + rsync -e 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null' -rltD -P --delete pit.nmn:'${PIT_ISO_DIR}'/pre-install-toolkit*.iso /metal/bootstrap/" + +PITBackupDateTime=$(date +%Y-%m-%d_%H-%M-%S) +run_cmd tar -czvf "${PITDATA}/PitPrepIsoConfigsBackup-${PITBackupDateTime}.tgz" "${PITDATA}/prep" "${PITDATA}/configs" "${PIT_ISO_DIR}/pre-install-toolkit"*.iso +run_cmd cray artifacts create config-data \ + "PitPrepIsoConfigsBackup-${PITBackupDateTime}.tgz" \ + "${PITDATA}/PitPrepIsoConfigsBackup-${PITBackupDateTime}.tgz" +run_cmd rm -v "${PITDATA}/PitPrepIsoConfigsBackup-${PITBackupDateTime}.tgz" + +# Since the installer needs to take note of this value, we will display it again here at the end of the script +echo "first-master-hostname: $FM" + +echo COMPLETED From 941238e86ac144eb82727084f2c8dc8c20fa90de Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 2 Jul 2024 14:02:09 -0500 Subject: [PATCH 16/37] CASMINST-6718: Create VCS import/export scripts (#5192) * CASMINST-6718: Create VCS import/export scripts (cherry picked from commit 2173fe18af9fdeb294d338035fd0507712a64c59) (cherry picked from commit a2e78bac4aa7d6241a24ac3de488fbae136277e3) * Placate modified linter whims (cherry picked from commit ae797d98643e2fd41030fcd1cae9bce665930d30) (cherry picked from commit 9bfc2ca78e5211158e92a01d03528234757a5ca9) * CASMINST-6718: Allow user to specify working directory location for VCS backups (cherry picked from commit 030d3f871d00668292c52338d7dd262b53195a50) * CASMINST-6718: setup_cms_minio_mount: Allow user to specify mount point (cherry picked from commit 945ac0bffa1a19e68e51b4cdfbb3549c10b7a22e) * CASMINST-6718: setup_cms_minio_mount: Add option to create cms bucket without creating s3fs mount (cherry picked from commit 6920179c604c49bd5bee887737465f410464d59a) --------- Co-authored-by: Mitch Harding (the weird one) --- .../Version_Control_Service_VCS.md | 74 ++++++- .../operations/configuration/backup_vcs.sh | 133 ++++++++++++ .../operations/configuration/bash_lib/vcs.sh | 45 ++++ .../operations/configuration/restore_vcs.sh | 203 ++++++++++++++++++ .../system_recovery/setup_cms_minio_mount.sh | 128 +++++++++++ 5 files changed, 575 insertions(+), 8 deletions(-) create mode 100755 
scripts/operations/configuration/backup_vcs.sh create mode 100644 scripts/operations/configuration/bash_lib/vcs.sh create mode 100755 scripts/operations/configuration/restore_vcs.sh create mode 100755 scripts/operations/system_recovery/setup_cms_minio_mount.sh diff --git a/operations/configuration_management/Version_Control_Service_VCS.md b/operations/configuration_management/Version_Control_Service_VCS.md index afbd75e950f4..f227cd14c1a9 100644 --- a/operations/configuration_management/Version_Control_Service_VCS.md +++ b/operations/configuration_management/Version_Control_Service_VCS.md @@ -6,10 +6,14 @@ * [Change VCS administrative user password](#change-vcs-administrative-user-password) * [Access the `cray` Gitea organization](#access-the-cray-gitea-organization) * [Backup and restore data](#backup-and-restore-data) - * [Backup Postgres data](#backup-postgres-data) - * [Backup PVC data](#backup-pvc-data) - * [Restore Postgres data](#restore-postgres-data) - * [Restore PVC data](#restore-pvc-data) + * [Automated backup and restore](#automated-backup-and-restore) + * [Automated backup](#automated-backup) + * [Automated restore](#automated-restore) + * [Manual backup and restore](#manual-backup-and-restore) + * [Manually backup Postgres data](#manually-backup-postgres-data) + * [Manually backup PVC data](#manually-backup-pvc-data) + * [Manually restore Postgres data](#manually-restore-postgres-data) + * [Manually restore PVC data](#manually-restore-pvc-data) * [Alternative backup/restore strategy](#alternative-backuprestore-strategy) * [Alternative export method](#alternative-export-method) * [Alternative import method](#alternative-import-method) @@ -227,7 +231,58 @@ Select the permissions appropriately, and then navigate to the following URL to Data for Gitea is stored in two places: Git content is stored directly in a PVC, while structural data, such as Gitea users and the list and attributes of repositories, is stored in a Postgres database. Because of this, both sources must be backed up and restored together. -### Backup Postgres data +* [Automated backup and restore](#automated-backup-and-restore) + * [Automated backup](#automated-backup) + * [Automated restore](#automated-restore) +* [Manual backup and restore](#manual-backup-and-restore) + * [Manually backup Postgres data](#manually-backup-postgres-data) + * [Manually backup PVC data](#manually-backup-pvc-data) + * [Manually restore Postgres data](#manually-restore-postgres-data) + * [Manually restore PVC data](#manually-restore-pvc-data) +* [Alternative backup/restore strategy](#alternative-backuprestore-strategy) + * [Alternative export method](#alternative-export-method) + * [Alternative import method](#alternative-import-method) + +### Automated backup and restore + +* [Automated backup](#automated-backup) +* [Automated restore](#automated-restore) + +#### Automated backup + +(`ncn-mw#`) Running the following script creates a tar archive containing both the Postgres and PVC data. + +> The argument to the script is the directory where the resulting archive should be created. + +```bash +/usr/share/doc/csm/scripts/operations/configuration/backup_vcs.sh /root +``` + +The end of the output will include the path to the backup archive. For example: + +```text +Gitea/VCS data successfully backed up to /root/gitea-vcs-20240626192742-dRW95b.tgz +``` + +Be sure to save the resulting archive file to a safe location. 
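+
+For example, one possible way to keep a copy off the node (assuming the `config-data` S3 bucket is available on the system) is to upload the archive with the Cray CLI:
+
+```bash
+cray artifacts create config-data gitea-vcs-20240626192742-dRW95b.tgz /root/gitea-vcs-20240626192742-dRW95b.tgz
+```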
+ +#### Automated restore + +(`ncn-mw#`) The archive generated by the [Automated backup](#automated-backup) script can be used as +input to the following automated restore script. + +```bash +/usr/share/doc/csm/scripts/operations/configuration/restore_vcs.sh /root/gitea-vcs-20240626192742-dRW95b.tgz +``` + +### Manual backup and restore + +* [Manually backup Postgres data](#manually-backup-postgres-data) +* [Manually backup PVC data](#manually-backup-pvc-data) +* [Manually restore Postgres data](#manually-restore-postgres-data) +* [Manually restore PVC data](#manually-restore-pvc-data) + +#### Manually backup Postgres data 1. (`ncn-mw#`) Determine which Postgres member is the leader. @@ -290,7 +345,7 @@ in a Postgres database. Because of this, both sources must be backed up and rest 1. Copy all files to a safe location. -### Backup PVC data +#### Manually backup PVC data (`ncn-mw#`) The VCS Postgres backups should be accompanied by backups of the VCS PVC. The export process can be run at any time while the service is running using the following commands: @@ -303,11 +358,11 @@ kubectl -n services cp ${POD}:/tmp/vcs.tar ./vcs.tar Be sure to save the resulting `tar` file to a safe location. -### Restore Postgres data +#### Manually restore Postgres data See [Restore Postgres for VCS](../../operations/kubernetes/Restore_Postgres.md#restore-postgres-for-vcs). -### Restore PVC data +#### Manually restore PVC data (`ncn-mw#`) When restoring the VCS Postgres database, the PVC should also be restored to the same point in time. The restore process can be run at any time while the service is running using the following commands: @@ -327,6 +382,9 @@ and may need to be recreated manually if the VCS deployment is lost. The following scripts create and use a `vcs-content` directory that contains all Git data. This should be copied to a safe location after export, and moved back to the system before import. +* [Alternative export method](#alternative-export-method) +* [Alternative import method](#alternative-import-method) + #### Alternative export method > **WARNING:** The following example uses the VCS `admin` username and password in plaintext on the command line, meaning it will be stored in the shell history as diff --git a/scripts/operations/configuration/backup_vcs.sh b/scripts/operations/configuration/backup_vcs.sh new file mode 100755 index 000000000000..882a2afe3380 --- /dev/null +++ b/scripts/operations/configuration/backup_vcs.sh @@ -0,0 +1,133 @@ +#!/bin/bash +# +# MIT License +# +# (C) Copyright 2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +locOfScript=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) +# Inform ShellCheck about the file we are sourcing +# shellcheck source=./bash_lib/common.sh +. "${locOfScript}/bash_lib/common.sh" + +# Inform ShellCheck about the file we are sourcing +# shellcheck source=./bash_lib/vcs.sh +. "${locOfScript}/bash_lib/vcs.sh" + +set -uo pipefail + +function backup_postgres { + local leader_pod json secrets secret num_secrets tmpfile field + + sql_outfile="${TMPDIR}/${SQL_BACKUP_NAME}" + sec_outfile="${TMPDIR}/${SEC_BACKUP_NAME}" + + json=$(run_cmd kubectl exec gitea-vcs-postgres-0 -n services -c postgres -it -- patronictl list -f json) || err_exit + leader_pod=$(run_cmd jq -r '.[] | select(.Role == "Leader") | .Member' <<< "${json}") || err_exit + [[ -n ${leader_pod} ]] || err_exit "No gitea-vcs-postgres leader pod found" + echo "Backing up data from gitea-vcs-postgres leader pod ${leader_pod} to ${sql_outfile}" + + run_cmd kubectl exec -it "${leader_pod}" -n services -c postgres -- pg_dumpall --if-exists -c -U postgres > "${sql_outfile}" \ + || err_exit "Error writing to file '${sql_outfile}'" + + echo "Backing up gitea-vcs-postgres Kubernetes secrets to ${sec_outfile}" + + num_secrets=0 + secrets=$(run_cmd kubectl get secrets -n services -l cluster-name=gitea-vcs-postgres -o custom-columns=":metadata.name" --no-headers) || err_exit + tmpfile=$(run_mktemp -p "$TMPDIR") || err_exit + echo "---" > "${sec_outfile}" || err_exit "Error writing to '${sec_outfile}'" + for secret in ${secrets}; do + let num_secrets+=1 + echo "Backing up secret: ${secret}" + run_cmd kubectl get secret "${secret}" -n services -o yaml > "${tmpfile}" || err_exit "Error writing to '${tmpfile}'" + for field in creationTimestamp resourceVersion selfLink uid; do + run_cmd yq d -i "${tmpfile}" "metadata.${field}" + done + run_cmd cat "${tmpfile}" >> "${sec_outfile}" || err_exit "Error appending to '${sec_outfile}'" + echo "---" >> "${sec_outfile}" || err_exit "Error appending to '${sec_outfile}'" + done + run_cmd rm "${tmpfile}" + [[ ${num_secrets} -ge 3 ]] || err_exit "Expected at least 3 secrets, but only found ${num_secrets}" +} + +function backup_pvc { + local pvc_outfile gitea_pod + + pvc_outfile="${TMPDIR}/${PVC_BACKUP_NAME}" + + # Set the gitea_pod variable to the name of the gitea pod + get_gitea_pod + + echo "Backing up PVC data from gitea pod ${gitea_pod}" + run_cmd kubectl -n services exec "${gitea_pod}" -- tar -cf /tmp/vcs.tar /var/lib/gitea/ + echo "Copying backed up data out of the pod to ${pvc_outfile}" + run_cmd kubectl -n services cp "${gitea_pod}":/tmp/vcs.tar "${pvc_outfile}" +} + +function usage { + echo "Usage: backup_vcs.sh [-t workdir_location] [output_directory]" >&2 + echo + echo "If no output directory is specified, one is created under the user's home directory" >&2 + echo "If no working directory is specified, one is created under the user's home directory" >&2 +} + +OUTDIR="" +WORKDIR_BASE="" + +if [[ $# -eq 1 ]] && [[ $1 == "-h" || $1 == "--help" ]]; then + usage + exit 2 +fi + +while [[ $# -gt 0 ]]; do + case "$1" in + "-t") + [[ $# -gt 1 ]] || usage_err_exit "The $1 parameter requires an argument" + [[ -n ${WORKDIR_BASE} ]] && usage_err_exit "The $1 parameter may only be specified once" + shift + [[ -n $1 ]] 
|| usage_err_exit "Work directory may not be blank" + [[ -e $1 ]] || usage_err_exit "Specified work directory ($1) does not exist" + [[ -d $1 ]] || usage_err_exit "Specified work directory ($1) exists but is not a directory" + WORKDIR_BASE="$1" + ;; + *) + [[ $# -eq 1 ]] || usage_err_exit "Too many arguments" + [[ -n $1 ]] || usage_err_exit "Output directory argument may not be blank" + [[ -e $1 ]] || usage_err_exit "Specified output directory ($1) does not exist" + [[ -d $1 ]] || usage_err_exit "Specified output directory ($1) exists but is not a directory" + OUTDIR="$1" + ;; + esac + shift +done + +[[ -n ${OUTDIR} ]] || OUTDIR=~ +[[ -n ${WORKDIR_BASE} ]] || WORKDIR_BASE=~ + +TMPDIR=$(run_mktemp -d "${WORKDIR_BASE}/gitea_vcs_backup.$(date +%Y%m%d%H%M%S).XXX") || err_exit + +echo "Backing up Gitea/VCS data" +backup_postgres +backup_pvc +BACKUP_TARFILE=$(run_mktemp "${OUTDIR}/gitea-vcs-$(date +%Y%m%d%H%M%S)-XXXXXX.tgz") || err_exit +run_cmd tar -C "${TMPDIR}" -czf "${BACKUP_TARFILE}" --remove-files "${SQL_BACKUP_NAME}" "${SEC_BACKUP_NAME}" "${PVC_BACKUP_NAME}" +rmdir "${TMPDIR}" || echo "WARNING: Unable to remove temporary directory '${TMPDIR}'" +echo "Gitea/VCS data successfully backed up to ${BACKUP_TARFILE}" diff --git a/scripts/operations/configuration/bash_lib/vcs.sh b/scripts/operations/configuration/bash_lib/vcs.sh new file mode 100644 index 000000000000..c79add810db8 --- /dev/null +++ b/scripts/operations/configuration/bash_lib/vcs.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# +# MIT License +# +# (C) Copyright 2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +locOfScript=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) +# Inform ShellCheck about the file we are sourcing +# shellcheck source=./common.sh +. 
"${locOfScript}/common.sh" + +# Shared function and variable definitions between VCS backup and restore scripts + +# These variables are not used in this file, but are used by scripts which source this file +#shellcheck disable=SC2034 +SQL_BACKUP_NAME=gitea-vcs-postgres.sql +#shellcheck disable=SC2034 +SEC_BACKUP_NAME=gitea-vcs-postgres.manifest +#shellcheck disable=SC2034 +PVC_BACKUP_NAME=vcs.tar + +function get_gitea_pod { + # Sets $gitea_pod to the name of the gitea pod, or exits if it cannot be found + gitea_pod=$(run_cmd kubectl -n services get pod -l app.kubernetes.io/instance=gitea -o custom-columns=":metadata.name" --no-headers) || err_exit + [[ -n ${gitea_pod} ]] || err_exit "No gitea pod found" +} diff --git a/scripts/operations/configuration/restore_vcs.sh b/scripts/operations/configuration/restore_vcs.sh new file mode 100755 index 000000000000..4b350d572508 --- /dev/null +++ b/scripts/operations/configuration/restore_vcs.sh @@ -0,0 +1,203 @@ +#!/bin/bash +# +# MIT License +# +# (C) Copyright 2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +locOfScript=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) +# Inform ShellCheck about the file we are sourcing +# shellcheck source=./bash_lib/common.sh +. "${locOfScript}/bash_lib/common.sh" + +# Inform ShellCheck about the file we are sourcing +# shellcheck source=./bash_lib/vcs.sh +. "${locOfScript}/bash_lib/vcs.sh" + +set -uo pipefail + +DUMPFILE="" +MANIFEST="" +TARFILE="" + +function wait_for_pods_to_start { + # Usage: wait_for_pods_to_start <# of pods expected> + local num_pods + [[ $# -ne 2 ]] && err_exit "$0: Function requires exactly 2 arguments but received $#. Invalid arguments: $*" + [[ -z $1 ]] && err_exit "$0: First argument may not be blank" + [[ -z $1 ]] && err_exit "$0: Second argument may not be blank" + [[ ! $2 -gt 0 ]] && err_exit "$0: Second argument must be an integer greated than 0. Invalid second argument: $2" + + echo "Wait for $2 pod(s) to be running." + num_pods=$(kubectl get pods -l "$1" -n services | grep Running | wc -l) + while [[ ${num_pods} -lt $2 ]]; do + echo " ${num_pods} running; waiting for $2 pod(s) to be running" + sleep 5 + num_pods=$(kubectl get pods -l "$1" -n services | grep Running | wc -l) + done +} + +function wait_for_pods_to_terminate { + # Usage: wait_for_pods_to_terminate + [[ $# -ne 1 ]] && err_exit "$0: Function requires exactly 1 argument but received $#. 
Invalid arguments: $*" + [[ -z $1 ]] && err_exit "$0: Argument may not be blank" + echo "Wait for pods to terminate ($1)" + while kubectl get pods -n services -l "$1" | grep -qv NAME; do + echo " waiting for pods to terminate" + sleep 5 + done +} + +# It seems that shellcheck doesn't like that we defensively check to make sure the function +# did not accidentally get passed arguments. Sorry not sorry, shellcheck +#shellcheck disable=SC2120 +function wait_for_postgres_cluster_running { + # Takes no arguments + local status + [[ $# -ne 0 ]] && err_exit "$0: Function takes no arguments but received $#. Invalid arguments: $*" + + echo "Wait for the gitea-vcs-postgres Postgres cluster to start running." + while true; do + status=$(kubectl get postgresql gitea-vcs-postgres -n services -o json | jq -r '.status.PostgresClusterStatus') + [[ ${status} == "Running" ]] && return + echo " waiting for postgresql to start running" + sleep 5 + done +} + +function restore_sql_and_secrets { + local tmp_outfile postgres_cr_json postgres_cr_single_json + + echo "Scale VCS service to 0" + run_cmd kubectl scale deployment gitea-vcs -n services --replicas=0 + + wait_for_pods_to_terminate app.kubernetes.io/name=vcs + + echo "Delete VCS Postgres cluster" + + tmp_outfile=$(run_mktemp -p ~) || exit 1 + run_cmd kubectl get postgresql gitea-vcs-postgres -n services -o json > "${tmp_outfile}" || err_exit "Error creating ${tmp_outfile}" + + postgres_cr_json=$(run_mktemp -p ~ postgres-cr.XXX.json) || exit 1 + run_cmd jq 'del(.spec.selector) | del(.spec.template.metadata.labels."controller-uid") | del(.status)' "${tmp_outfile}" > "${postgres_cr_json}" || err_exit "Error creating ${postgres_cr_json}" + + run_cmd kubectl delete -f "${postgres_cr_json}" + + wait_for_pods_to_terminate application=spilo,cluster-name=gitea-vcs-postgres + + echo "Create a new single instance VCS Postgres cluster." + postgres_cr_single_json=$(run_mktemp -p ~ postgres-cr-single.XXX.json) || exit 1 + + run_cmd jq '.spec.numberOfInstances = 1' "${postgres_cr_json}" > "${postgres_cr_single_json}" || err_exit "Error creating ${postgres_cr_single_json}" + + run_cmd kubectl create -f "${postgres_cr_single_json}" + + wait_for_pods_to_start application=spilo,cluster-name=gitea-vcs-postgres 1 + + wait_for_postgres_cluster_running + + echo "Restore the database from ${DUMPFILE}" + run_cmd kubectl exec gitea-vcs-postgres-0 -c postgres -n services -it -- psql -U postgres < "${DUMPFILE}" || err_exit "Error reading from $DUMPFILE" + + echo "Delete the gitea-vcs-postgres secrets" + run_cmd kubectl delete -f "${MANIFEST}" + + echo "Recreate the gitea-vcs-postgres secrets using the manifest (${MANIFEST})" + run_cmd kubectl apply -f "${MANIFEST}" + + echo "Restart the Postgres cluster." + run_cmd kubectl delete pod -n services gitea-vcs-postgres-0 + + wait_for_pods_to_start application=spilo,cluster-name=gitea-vcs-postgres 1 + + echo "Scale the Postgres cluster back to 3 instances." + run_cmd kubectl patch postgresql gitea-vcs-postgres -n services --type=json -p='[{"op" : "replace", "path":"/spec/numberOfInstances", "value" : 3}]' + + wait_for_postgres_cluster_running + + echo "Scale the Gitea service back up." 
+ run_cmd kubectl scale deployment gitea-vcs -n services --replicas=1 + + wait_for_pods_to_start app.kubernetes.io/name=vcs 1 + + rm "${tmp_outfile}" "${postgres_cr_json}" "${postgres_cr_single_json}" > /dev/null 2>&1 +} + +function restore_pvc_data { + local gitea_pod + + # Set the gitea_pod variable to the name of the gitea pod + get_gitea_pod + + echo "Copy PVC data tarfile into pod (${gitea_pod})" + run_cmd kubectl -n services cp "${TARFILE}" "${gitea_pod}":/tmp/vcs.tar + + echo "Expand PVC data tarfile in pod" + run_cmd kubectl -n services exec "${gitea_pod}" -- tar -C / -xf /tmp/vcs.tar +} + +function usage { + echo "Usage: restore_vcs.sh " + echo + echo "This file is the one produced by the backup_vcs.sh script" >&2 +} + +function input_file_exists_nonempty { + [[ $# -eq 1 ]] || err_exit "Programming logic error: $0 function takes exactly 1 argument but received $#: $*" + [[ -n $1 ]] || err_exit "Programming logic error: $0 function argument may not be blank" + [[ -e $1 ]] || usage_err_exit "File does not exist: '$1'" + [[ -f $1 ]] || usage_err_exit "Exists but is not a regular file: '$1'" + [[ -s $1 ]] || usage_err_exit "File is 0 size: '$1'" +} + +[[ $# -eq 0 ]] && usage_err_exit "Missing required arguments" +[[ $# -gt 1 ]] && usage_err_exit "Too many arguments" +[[ -n $1 ]] || usage_err_exit "Argument may not be blank" +input_file_exists_nonempty "$1" + +TMPDIR=$(run_mktemp -d -p ~) || err_exit +run_cmd tar -C "${TMPDIR}" -xvf "$1" + +DUMPFILE="${TMPDIR}/${SQL_BACKUP_NAME}" +MANIFEST="${TMPDIR}/${SEC_BACKUP_NAME}" +TARFILE="${TMPDIR}/${PVC_BACKUP_NAME}" +input_file_exists_nonempty "${DUMPFILE}" +input_file_exists_nonempty "${MANIFEST}" +input_file_exists_nonempty "${TARFILE}" + +# A very quick check just to help catch cases where the completely wrong file is somehow found +grep -q 'PostgreSQL database cluster dump' "${DUMPFILE}" || usage_err_exit "Does not appear to be a SQL database cluster dump: '${DUMPFILE}'" +grep -Eq '^apiVersion:' "${MANIFEST}" || usage_err_exit "Does not appear to be a manifest file: '${MANIFEST}'" +file "${TARFILE}" | grep -q 'tar archive' || usage_err_exit "Does not appear to be a tar archive: '${TARFILE}'" + +restore_sql_and_secrets +restore_pvc_data + +echo "Restart gitea-vcs deployment" +run_cmd kubectl -n services rollout restart deployment gitea-vcs + +echo "Wait for restart to complete" +run_cmd kubectl -n services rollout status deployment gitea-vcs + +rm "${DUMPFILE}" "${MANIFEST}" "${TARFILE}" +rmdir "${TMPDIR}" + +echo "Gitea/VCS restore completed!" diff --git a/scripts/operations/system_recovery/setup_cms_minio_mount.sh b/scripts/operations/system_recovery/setup_cms_minio_mount.sh new file mode 100755 index 000000000000..d5d40360648f --- /dev/null +++ b/scripts/operations/system_recovery/setup_cms_minio_mount.sh @@ -0,0 +1,128 @@ +#!/bin/bash +# +# MIT License +# +# (C) Copyright 2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +DEFAULT_CMS_MINIO_MNT=/etc/cray/minio/cms +AWS_CREDFILE=/root/.aws/credentials + +locOfScript=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) +# Inform ShellCheck about the file we are sourcing +# shellcheck source=../configuration/bash_lib/common.sh +. "${locOfScript}/../configuration/bash_lib/common.sh" + +set -o pipefail + +function usage { + echo "Usage: setup_cms_minio_mount.sh {--rw | --ro} [--init] [mount_point]" >&2 + echo " setup_cms_minio_mount.sh --nomount --init" >&2 + echo >&2 + echo "If --init is specified, the cms bucket will be created, if it does not exist." >&2 + echo "The --rw / --ro arguments govern whether it will be mounted read-write or read-only" >&2 + echo "If mount_point is not specified, it defaults to '${DEFAULT_CMS_MINIO_MNT}'" >&2 + echo >&2 + echo "The --nomount --init option creates the cms bucket (if needed) but does not create a mount" >&2 + echo >&2 +} + +CMS_MINIO_MNT="" +MOUNT_OPT="" +INIT="" +[[ $# -eq 0 ]] && usage_err_exit "At least 1 argument is required" +while [[ $# -gt 0 ]]; do + case "$1" in + "--ro") + [[ ${MOUNT_OPT} == "ro" ]] && usage_err_exit "Argument --$1 may only be specified once" + [[ -n ${MOUNT_OPT} ]] && usage_err_exit "Arguments $1 and --${MOUNT_OPT} are mutually exclusive" + MOUNT_OPT=ro + ;; + "--rw") + [[ ${MOUNT_OPT} == "rw" ]] && usage_err_exit "Argument --$1 may only be specified once" + [[ -n ${MOUNT_OPT} ]] && usage_err_exit "Arguments $1 and --${MOUNT_OPT} are mutually exclusive" + MOUNT_OPT=rw + ;; + "--nomount") + [[ ${MOUNT_OPT} == "nomount" ]] && usage_err_exit "Argument --$1 may only be specified once" + [[ -n ${MOUNT_OPT} ]] && usage_err_exit "Arguments $1 and --${MOUNT_OPT} are mutually exclusive" + MOUNT_OPT=nomount + ;; + "--init") + [[ -n ${INIT} ]] && usage_err_exit "Argument --init may only be specified once" + INIT=Y + ;; + *) + [[ $# -gt 1 ]] && usage_err_exit "Too many arguments" + [[ ${MOUNT_OPT} == "nomount" ]] && usage_err_exit "Invalid to specify a mount point when --nomount specified" + [[ -n $1 ]] || usage_err_exit "Mount point may not be blank" + [[ $1 =~ ^/.* ]] || usage_err_exit "Cannot use relative path for mount point" + CMS_MINIO_MNT="$1" + ;; + esac + shift +done + +[[ -z ${MOUNT_OPT} ]] && usage_err_exit "One of the following options must be specified: --nomount, --ro, --rw" + +[[ ${MOUNT_OPT} == "nomount" && -z ${INIT} ]] && usage_err_exit "Invalid to specify --nomount without --init" + +# Make sure the credentials file exists and is not empty +[[ -e ${AWS_CREDFILE} ]] || err_exit "AWS credentials file (${AWS_CREDFILE}) does not exist" +[[ -f ${AWS_CREDFILE} ]] || err_exit "AWS credentials file (${AWS_CREDFILE}) exists but is not a regular file" +[[ -s ${AWS_CREDFILE} ]] || err_exit "AWS credentials file (${AWS_CREDFILE}) exists but is empty" + +# Check for existence of CMS bucket +if ! 
aws s3api list-buckets --endpoint-url http://ncn-m001.nmn:8000 | jq -r '.Buckets[] | .Name' | grep -Eq '^cms$'; then + [[ -z ${INIT} ]] && err_exit "'cms' bucket does not exist in Minio" + echo "Creating cms bucket" + run_cmd aws s3api create-bucket --bucket cms --endpoint-url http://ncn-m001.nmn:8000 + echo "cms minio bucket created" +else + echo "cms minio bucket already exists" +fi + +[[ ${MOUNT_OPT} == "nomount" ]] && exit 0 + +[[ -n ${CMS_MINIO_MNT} ]] || CMS_MINIO_MNT="${DEFAULT_CMS_MINIO_MNT}" + +# Unmount, if it is currently mounted +umount "${CMS_MINIO_MNT}" > /dev/null 2>&1 + +if [[ ! -d ${CMS_MINIO_MNT} ]]; then + echo "Creating directory '${CMS_MINIO_MNT}'" + run_cmd mkdir -pv "${CMS_MINIO_MNT}" +fi + +credfile=$(run_mktemp /root/.XXXXXX.minio.s3fs) || exit 1 + +AKEY=$(grep '^aws_access_key_id = ' "${AWS_CREDFILE}" | awk '{ print $NF }') || err_exit "Error getting aws_access_key_id from ${AWS_CREDFILE}" +SKEY=$(grep '^aws_secret_access_key = ' "${AWS_CREDFILE}" | awk '{ print $NF }') || err_exit "Error getting aws_secret_access_key from ${AWS_CREDFILE}" + +cat << EOF > "${credfile}" || err_exit "Error writing to '${credfile}'" +${AKEY}:${SKEY} +EOF + +run_cmd chmod 600 "${credfile}" +run_cmd s3fs cms "${CMS_MINIO_MNT}" -o "_netdev,${MOUNT_OPT},allow_other,passwd_file=${credfile},url=http://ncn-m001.nmn:8000,use_path_request_style,use_xattr" + +echo "CMS minio mount (${CMS_MINIO_MNT}) created" +exit 0 From 169b8e257869af34f5e9f4f405ddcc6b05218cb0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 2 Jul 2024 14:06:39 -0500 Subject: [PATCH 17/37] CASMINST-6906: Update RPMs on all NCNs in prerequisites.sh; allow vendor changes in test RPM updater (#5196) * CASMINST-6906: Update RPMs on all NCNs in prerequisites.sh; allow vendor changes in test RPM updater (cherry picked from commit 0af679c695cb57ad24874e20289ac5391e25e269) * Update Validate_CSM_Health_During_Upgrade.md Fix pre-existing issue that makes the linter sad Signed-off-by: Mitch Harding --------- Signed-off-by: Mitch Harding Co-authored-by: Mitch Harding (the weird one) --- upgrade/Validate_CSM_Health_During_Upgrade.md | 10 ++++------ upgrade/scripts/upgrade/prerequisites.sh | 5 +++-- upgrade/scripts/upgrade/util/upgrade-test-rpms.sh | 10 ++++++---- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/upgrade/Validate_CSM_Health_During_Upgrade.md b/upgrade/Validate_CSM_Health_During_Upgrade.md index 34f459a683fc..a5f4100cb9a3 100644 --- a/upgrade/Validate_CSM_Health_During_Upgrade.md +++ b/upgrade/Validate_CSM_Health_During_Upgrade.md @@ -18,15 +18,12 @@ If additional shells are opened during this procedure, then record those with typescripts as well. When resuming a procedure after a break, always be sure that a typescript is running before proceeding. -1. Validate CSM health. +1. (`ncn-m002#`) Validate CSM health. - Run the combined health check script, which runs a variety of health checks that should pass at this stage of the upgrade: - - - Kubernetes health checks - - NCN health checks + Run the combined health check script, which runs a variety of health checks that should pass at this stage of the upgrade. ```bash - /opt/cray/tests/install/ncn/automated/ncn-k8s-combined-healthcheck-post-service-upgrade + /opt/cray/tests/install/ncn/automated/ncn-k8s-combined-healthcheck ``` Review the output and follow the instructions provided to resolve any test failures. 
With the exception of @@ -59,6 +56,7 @@ ```bash cray artifacts create config-data "${TARFILE}" "/root/${TARFILE}" ``` + 1. Update ceph node-exporter config for SNMP counters. > **OPTIONAL:** This is an optional step. diff --git a/upgrade/scripts/upgrade/prerequisites.sh b/upgrade/scripts/upgrade/prerequisites.sh index f35d5172367f..dad2997fec42 100755 --- a/upgrade/scripts/upgrade/prerequisites.sh +++ b/upgrade/scripts/upgrade/prerequisites.sh @@ -1289,8 +1289,9 @@ if [[ ${state_recorded} == "0" ]]; then systemctl enable goss-servers systemctl restart goss-servers - # Install above RPMs and restart goss-servers on ncn-w001 - ssh ncn-w001 "rpm --force -Uvh ${url_list[*]}; systemctl enable goss-servers; systemctl restart goss-servers;" + # Install above RPMs and restart goss-servers on all other NCNs + ncns=$(grep -oP 'ncn-\w\d+' /etc/hosts | sort -u | grep -Ev "^$(hostname -s)$" | tr -t '\n' ',') + pdsh -S -b -w ${ncns} "rpm --force -Uvh ${url_list[*]}; systemctl enable goss-servers; systemctl restart goss-servers;" # get all installed CSM version into a file kubectl get cm -n services cray-product-catalog -o json | jq -r '.data.csm' | yq r - -d '*' -j | jq -r 'keys[]' > /tmp/csm_versions diff --git a/upgrade/scripts/upgrade/util/upgrade-test-rpms.sh b/upgrade/scripts/upgrade/util/upgrade-test-rpms.sh index 6f18da0140aa..58bdae3d4744 100755 --- a/upgrade/scripts/upgrade/util/upgrade-test-rpms.sh +++ b/upgrade/scripts/upgrade/util/upgrade-test-rpms.sh @@ -27,21 +27,23 @@ # If --local is not specified, upgrade the test RPMs on all NCNs # If --local is specified, upgrade the test RPMs just on the system where the script is being executed +RPM_LIST="hpe-csm-goss-package csm-testing goss-servers craycli cray-cmstools-crayctldeploy" + set -euo pipefail if [[ $# -eq 0 ]]; then ncns=$(grep -oP 'ncn-\w\d+' /etc/hosts | sort -u | tr -t '\n' ',') - echo "Installing updated versions of hpe-csm-goss-package csm-testing goss-servers craycli RPMs" - pdsh -S -b -w ${ncns} 'zypper install -y hpe-csm-goss-package csm-testing goss-servers craycli' + echo "Installing updated versions of RPMs on all NCNs: ${RPM_LIST}" + pdsh -S -b -w ${ncns} "zypper install -y --allow-vendor-change ${RPM_LIST}" echo "Enabling and restarting goss-servers" pdsh -S -b -w ${ncns} 'systemctl enable goss-servers && systemctl restart goss-servers' elif [[ $# -eq 1 && $1 == --local ]]; then - echo "Installing updated versions of hpe-csm-goss-package csm-testing goss-servers craycli RPMs" - zypper install -y hpe-csm-goss-package csm-testing goss-servers craycli + echo "Installing updated versions of RPMs: ${RPM_LIST}" + zypper install -y --allow-vendor-change ${RPM_LIST} echo "Enabling and restarting goss-servers" systemctl enable goss-servers && systemctl restart goss-servers From 4d445f1a276111f62e607a7ca49b3622dd9da612 Mon Sep 17 00:00:00 2001 From: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> Date: Wed, 3 Jul 2024 00:40:16 +0530 Subject: [PATCH 18/37] CASMTRIAGE-7110 Update configure_cray_cli.md (#5200) Update configure_cray_cli.md Signed-off-by: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> --- operations/configure_cray_cli.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/operations/configure_cray_cli.md b/operations/configure_cray_cli.md index f0ca173e8345..fd4ffa62fb1a 100644 --- a/operations/configure_cray_cli.md +++ b/operations/configure_cray_cli.md @@ -73,6 +73,11 @@ install and should be removed when the install is complete. 
As the script leverages Keycloak administrative APIs, the `--keycloakHost` command line option must be set to use the CMN load balancer, as detailed below. +> **`NOTES:`** +> +> - This script creates a `temporary user` that can be used for basic `cray` CLI command only until Keycloak is populated with real users. At which point, the `cray` CLI should be re-initialized with a real user. +> - The `temporary user` that was created is only in Keycloak - it is not a `real` user with login shells and home directories. + ### Procedure for temporary Keycloak user 1. (`ncn-mws#`) Unset the `CRAY_CREDENTIALS` environment variable, if previously set. From c884308e2ddcbc7f06ac615a1b7928e60936d758 Mon Sep 17 00:00:00 2001 From: Ryan Haasken <77809410+haasken-hpe@users.noreply.github.com> Date: Tue, 2 Jul 2024 14:10:39 -0500 Subject: [PATCH 19/37] CASMTRIAGE-7016: Fix update_tags.sh by removing bad `echo` (#5201) The `echo` command inside of `get_latest_tag_for_image` pollutes the output of this function, which is then fed into `update_tags_in_file`, which then results in the incorrect arguments being passed to a `sed` command which results in `sed` failing with a "No such file or directory" error. Remove this `echo` command. It's not clear what its purpose is. Test Description: Removed this echo command from a copy of the script and executed the script and verified that it correctly updated image tags in Argo workflow templates. --- workflows/update_tags.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/update_tags.sh b/workflows/update_tags.sh index b6f1857c6cae..76227afc7b77 100755 --- a/workflows/update_tags.sh +++ b/workflows/update_tags.sh @@ -75,7 +75,6 @@ function get_latest_tag_for_image() { THIS_PREFIX="${THIS_REGISTRY_NAME}/" THIS_IMAGE=$(echo "${THIS_IMAGE}" | sed "s#^${DEFAULT_REGISTRY_REGEX}/##") fi - echo $THIS_PODMAN_TLS $THIS_PREFIX$THIS_IMAGE podman search $THIS_PODMAN_TLS $THIS_PREFIX$THIS_IMAGE --list-tags --format=json | jq -r ' def opt(f): . 
as $in | try f catch $in; From 406e70d9121a363b99365936b63197f40afbfc8c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 9 Jul 2024 14:11:17 -0500 Subject: [PATCH 20/37] CASMINST-6916: Create CPC exporter & combined CMS/CPC export scripts (#5204) * CASMINST-6916: Create script to dump Cray Product Catalog (cherry picked from commit 1cb1a41f8180120a83b14167972d50ecf2d27712) * CASMINST-6916: Create disaster recovery scripts for CMS/CPC exports (cherry picked from commit 9314599f4defca4a0d1ee00e235772605e6f45af) * Minor tweaks (cherry picked from commit c40d78c961bd83d95c33a09d9548281adb26319a) * Unmount s3fs mount before exiting (cherry picked from commit d9203c3d4493e8385547c9478a2cae9d396dd34c) --------- Co-authored-by: Mitch Harding (the weird one) --- scripts/operations/configuration/dump_cpc.sh | 65 +++++++ .../system_recovery/bash_lib/ims.sh | 30 +++ .../cms_minio_export_helper.sh | 106 +++++++++++ .../export_cms_cpc_to_minio.sh | 176 ++++++++++++++++++ 4 files changed, 377 insertions(+) create mode 100755 scripts/operations/configuration/dump_cpc.sh create mode 100644 scripts/operations/system_recovery/bash_lib/ims.sh create mode 100755 scripts/operations/system_recovery/cms_minio_export_helper.sh create mode 100755 scripts/operations/system_recovery/export_cms_cpc_to_minio.sh diff --git a/scripts/operations/configuration/dump_cpc.sh b/scripts/operations/configuration/dump_cpc.sh new file mode 100755 index 000000000000..66bfef7fc29b --- /dev/null +++ b/scripts/operations/configuration/dump_cpc.sh @@ -0,0 +1,65 @@ +#!/bin/bash +# +# MIT License +# +# (C) Copyright 2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +locOfScript=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) +# Inform ShellCheck about the file we are sourcing +# shellcheck source=./bash_lib/common.sh +. "${locOfScript}/bash_lib/common.sh" + +set -uo pipefail + +function usage { + echo "Usage: dump_cpc.sh [output_directory]" >&2 + echo + echo "If no output directory is specified, it defaults to the user's home directory" >&2 +} + +OUTDIR="" + +if [[ $# -eq 1 ]] && [[ $1 == "-h" || $1 == "--help" ]]; then + usage + exit 2 +fi + +if [[ $# -gt 1 ]]; then + usage_err_exit "Too many arguments" +elif [[ $# -eq 0 ]]; then + OUTDIR=~ +elif [[ -z $1 ]]; then + usage_err_exit "Output directory argument may not be blank" +elif [[ ! 
-e $1 ]]; then + usage_err_exit "Specified output directory ($1) does not exist" +elif [[ ! -d $1 ]]; then + usage_err_exit "Specified output directory ($1) exists but is not a directory" +else + OUTDIR="$1" +fi + +OUTFILE=$(run_mktemp "${OUTDIR}/cray-product-catalog-$(date +%Y%m%d%H%M%S)-XXXXXX.yaml") || err_exit +echo "Dumping Cray Product Catalog to '${OUTFILE}'" + +run_cmd kubectl get cm -n services cray-product-catalog -o yaml > "${OUTFILE}" || err_exit "Error writing to '${OUTFILE}'" + +echo "Cray Product Catalog dumped to '${OUTFILE}'" diff --git a/scripts/operations/system_recovery/bash_lib/ims.sh b/scripts/operations/system_recovery/bash_lib/ims.sh new file mode 100644 index 000000000000..de7fcab2f55f --- /dev/null +++ b/scripts/operations/system_recovery/bash_lib/ims.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# +# MIT License +# +# (C) Copyright 2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +# Shared function and variable definitions between IMS backup and restore scripts + +# These variables are not used in this file, but are used by scripts which source this file +#shellcheck disable=SC2034 +IMS_FS_MNT=/opt/cray/pit/ims diff --git a/scripts/operations/system_recovery/cms_minio_export_helper.sh b/scripts/operations/system_recovery/cms_minio_export_helper.sh new file mode 100755 index 000000000000..94952aede0b9 --- /dev/null +++ b/scripts/operations/system_recovery/cms_minio_export_helper.sh @@ -0,0 +1,106 @@ +#!/bin/bash +# +# MIT License +# +# (C) Copyright 2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +locOfScript=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) +CONFIG_SCRIPT_DIR="${locOfScript}/../configuration" +# Inform ShellCheck about the file we are sourcing +# shellcheck source=../configuration/bash_lib/common.sh +. "${CONFIG_SCRIPT_DIR}/bash_lib/common.sh" + +set -uo pipefail + +function usage { + echo "Usage: cms_minio_export_helper.sh " >&2 +} + +if [[ $# -eq 1 ]] && [[ $1 == "-h" || $1 == "--help" ]]; then + usage + exit 2 +fi + +[[ $# -ne 0 ]] || usage_err_exit "Missing required argument" +[[ $# -le 1 ]] || usage_err_exit "Too many arguments" +[[ -n $1 ]] || usage_err_exit "Export area cannot be blank" + +# Set defaults +area="$1" +EXPORT_SCRIPT_ARGS=() +TMPDIR_BASE=~ +BACKUP_EXT=tgz + +case "${area}" in + bos) + EXPORT_SCRIPT_NAME="export_bos_data.sh" + BACKUP_PREFIX=bos-export + ;; + cfs) + EXPORT_SCRIPT_NAME="export_cfs_data.sh" + BACKUP_PREFIX=cfs-export + ;; + cpc) + EXPORT_SCRIPT_NAME="dump_cpc.sh" + BACKUP_PREFIX=cray-product-catalog + BACKUP_EXT=yaml + ;; + ims) + EXPORT_SCRIPT_NAME="export_ims_data.py" + # Unlike the other export scripts, we use an additional argument with the IMS exporter + EXPORT_SCRIPT_ARGS=("--no-tar") + # We don't set backup prefix and ext for IMS, because it is handled differently + + # IMS uses a different temp location for its backup, because of how large it is + # Inform ShellCheck about the file we are sourcing + # shellcheck source=./bash_lib/ims.sh + . "${locOfScript}/bash_lib/ims.sh" + + [[ -e ${IMS_FS_MNT} ]] || err_exit "Directory does not exist: '${IMS_FS_MNT}'" + [[ -d ${IMS_FS_MNT} ]] || err_exit "Exists but is not a directory: '${IMS_FS_MNT}'" + TMPDIR_BASE="${IMS_FS_MNT}" + ;; + vcs) + EXPORT_SCRIPT_NAME="backup_vcs.sh" + BACKUP_PREFIX=gitea-vcs + ;; + *) + usage_err_exit "Unknown export area: '${area}'" + ;; +esac + +TMPDIR=$(run_mktemp -d "${TMPDIR_BASE}/export-${area}.XXX") || err_exit +run_cmd "${CONFIG_SCRIPT_DIR}/${EXPORT_SCRIPT_NAME}" "${EXPORT_SCRIPT_ARGS[@]}" "${TMPDIR}" + +# Copying the data over to minio is different for IMS versus the others +if [[ ${area} == ims ]]; then + run_cmd aws s3 sync "${TMPDIR}" s3://cms/ims --endpoint-url http://localhost:8000 + # We want to fail the script if this fails, because it will leave a lot of data on disk otherwise + run_cmd rm -rf "${TMPDIR}" +else + run_cmd aws s3 mv "${TMPDIR}/${BACKUP_PREFIX}"*".${BACKUP_EXT}" s3://cms --endpoint-url http://localhost:8000 + # Non-IMS backups are much smaller, plus we are using s3 mv, so it's not the end of the world if + # we don't clean up the temporary directory + rmdir "${TMPDIR}" || echo "WARNING: Unable to remove directory '${TMPDIR}'" >&2 +fi + +echo "${area} export completed successfully" diff --git a/scripts/operations/system_recovery/export_cms_cpc_to_minio.sh b/scripts/operations/system_recovery/export_cms_cpc_to_minio.sh new file mode 100755 index 000000000000..3788f4ce50f6 --- /dev/null +++ b/scripts/operations/system_recovery/export_cms_cpc_to_minio.sh @@ -0,0 +1,176 @@ +#!/bin/bash +# +# MIT License +# +# (C) Copyright 2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the 
Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +locOfScript=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) +CONFIG_SCRIPT_DIR="${locOfScript}/../configuration" +# Inform ShellCheck about the file we are sourcing +# shellcheck source=../configuration/bash_lib/common.sh +. "${CONFIG_SCRIPT_DIR}/bash_lib/common.sh" + +CMS_EXPORT_SCRIPT="${locOfScript}/cms_minio_export_helper.sh" + +set -uo pipefail + +function usage { + echo "Usage: export_cms_cpc.sh [bos] [cfs] [cpc] [ims] [vcs]" >&2 + echo >&2 + echo "If no areas are specified, all areas are exported." >&2 + echo "Otherwise, only the specified areas are exported." >&2 +} + +if [[ $# -eq 1 ]] && [[ $1 == "-h" || $1 == "--help" ]]; then + usage + exit 2 +fi + +backup_areas=() +backup_pids=() + +function add_area { + local a IMS_FS_MNT + [[ $1 =~ ^(bos|cfs|cpc|ims|vcs)$ ]] || usage_err_exit "Unrecognized export area '$1'" + for a in "${backup_areas[@]}"; do + # no need to add it if we already have it + [[ $a == "$1" ]] && return + done + backup_areas+=("$1") + [[ $1 == ims ]] || return + # Since we're exporting IMS, make sure /opt/cray/pit/ims exists + # Inform ShellCheck about the file we are sourcing + # shellcheck source=./bash_lib/ims.sh + . "${locOfScript}/bash_lib/ims.sh" + [[ -e ${IMS_FS_MNT} ]] || err_exit "Directory does not exist: '${IMS_FS_MNT}'" + [[ -d ${IMS_FS_MNT} ]] || err_exit "Exists but is not a directory: '${IMS_FS_MNT}'" +} + +if [[ $# -eq 0 ]]; then + add_area bos + add_area cfs + add_area cpc + add_area ims + add_area vcs +else + while [[ $# -gt 0 ]]; do + add_area "$1" + shift + done +fi + +# Create mount point for CMS minio s3fs +CMS_MINIO_MNT=$(run_mktemp -d ~/.export_cms_cpc_minio_mnt.XXX) || err_exit + +echo "Initializing CMS bucket in minio (if needed)" +run_cmd "${locOfScript}/setup_cms_minio_mount.sh" --rw --init "${CMS_MINIO_MNT}" + +LOG_REL_DIR="logs/exports/$(date +%Y%m%d%H%M%S)" +LOG_DIR="${CMS_MINIO_MNT}/${LOG_REL_DIR}" +echo "Create log directory in minio://cms/${LOG_REL_DIR}" +run_cmd mkdir -p "${LOG_DIR}" + +function launch_area_export { + local epid logbase area + area="$1" + logbase="${area}.log" + echo "$(date) Starting ${area} export (log: minio://cms/${LOG_REL_DIR}/${logbase})" + nohup "${CMS_EXPORT_SCRIPT}" "${area}" > "${LOG_DIR}/${logbase}" 2>&1 & + epid=$! 
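+  # Track the PID of the background export so the wait loop below can
+  # monitor it and report success or failure for this area.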
+ echo "${area} export PID is ${epid}" + backup_pids+=("${epid}") +} + +for area in "${backup_areas[@]}"; do + launch_area_export "${area}" +done + +echo "Waiting for exports to complete" + +errors=0 +running=${#backup_pids[@]} +last_print=$SECONDS +need_newline="" +while [[ ${running} -gt 0 ]]; do + sleep 1 + old_running=${running} + running=0 + still_running=() + i=0 + while [[ $i -lt ${#backup_pids[@]} ]]; do + bpid=${backup_pids[$i]} + area=${backup_areas[$i]} + + # If the PID is 0, it means we have previously seen that this + # backup completed and checked it + if [[ ${bpid} == 0 ]]; then + let i+=1 + continue + fi + + # Don't let the scary kill fool you -- with signal 0, this just checks + # if the process is still running -- no killing involved! + if kill -0 "${bpid}" > /dev/null 2>&1; then + let i+=1 + let running+=1 + still_running+=("${area} (${bpid})") + continue + fi + + # The process seems to be done, so let's get its exit code + wait "${bpid}" + rc=$? + # Mark that it is done + backup_pids[$i]=0 + let i+=1 + [[ -n ${need_newline} ]] && echo + last_print=$SECONDS + need_newline="" + if [[ $rc -eq 0 ]]; then + echo "$(date) ${area} export (PID ${bpid}) completed successfully" + else + echo "$(date) ${area} export (PID ${bpid}) FAILED with exit code $rc (logfile: ${LOG_DIR}/${area}.log)" + let errors+=1 + fi + done + if [[ ${running} -gt 0 && ${running} -ne ${old_running} ]]; then + [[ -n ${need_newline} ]] && echo + last_print=$SECONDS + need_newline="" + echo "Still running: ${still_running[*]}" + continue + fi + # Print some progress characters while waiting, occasionally + [[ $((SECONDS - last_print)) -ge 180 ]] || continue + printf . + need_newline=y + last_print=$SECONDS +done + +umount "${CMS_MINIO_MNT}" || echo "WARNING: Unable to unmount '${CMS_MINIO_MNT}'" >&2 + +if [[ $errors -ne 0 ]]; then + err_exit "${errors} of the exports failed. See individual log files for details" + exit 1 +fi + +echo "All exports completed successfully" From aca5aa164405303323468bc2fbf1f417972b3a55 Mon Sep 17 00:00:00 2001 From: Lindsay Eliasen <87664908+leliasen-hpe@users.noreply.github.com> Date: Tue, 9 Jul 2024 14:13:35 -0500 Subject: [PATCH 21/37] CASMPET-6904 edit prerequisites.sh for CSM 1.6 certmanager upgrade (#5206) * CASMPET-6904 edit prerequisites.sh for CSM 1.6 certmanager upgrade * `shfmt` --------- Co-authored-by: Russell Bunch --- upgrade/scripts/upgrade/prerequisites.sh | 49 ++++++++++-------------- 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/upgrade/scripts/upgrade/prerequisites.sh b/upgrade/scripts/upgrade/prerequisites.sh index dad2997fec42..ed2a77f7a448 100755 --- a/upgrade/scripts/upgrade/prerequisites.sh +++ b/upgrade/scripts/upgrade/prerequisites.sh @@ -613,20 +613,25 @@ else echo "====> ${state_name} has been completed" | tee -a "${LOG_FILE}" fi +# upgrade all charts dependent on cray-certmanager chart +# it is neccessary to upgrade these before upgrade +do_upgrade_csm_chart cray-istio platform.yaml +do_upgrade_csm_chart cray-keycloak platform.yaml +do_upgrade_csm_chart cray-oauth2-proxies platform.yaml +do_upgrade_csm_chart spire sysmgmt.yaml +do_upgrade_csm_chart cray-spire sysmgmt.yaml +do_upgrade_csm_chart cray-tapms-crd sysmgmt.yaml +do_upgrade_csm_chart cray-tapms-operator sysmgmt.yaml + # Note for csm 1.5/k8s 1.22 only if ANY chart depends on /v1 cert-manager api # usage it *MUST* come after this or prerequisites will fail on an upgrade. 
# Helper functions for cert-manager upgrade -has_cm_init() { - ns="${1?no namespace provided}" - helm list -n "${ns}" --filter cray-certmanager-init | grep cray-certmanager-init > /dev/null 2>&1 -} - has_craycm() { ns="${1?no namespace provided}" helm list -n "${ns}" --filter 'cray-certmanager$' | grep cray-certmanager > /dev/null 2>&1 } -state_name="UPGRADE_CERTMANAGER_0141_CHART" +state_name="UPGRADE_CERTMANAGER_155_CHART" state_recorded=$(is_state_recorded "${state_name}" "$(hostname)") if [[ ${state_recorded} == "0" && $(hostname) == "${PRIMARY_NODE}" ]]; then echo "====> ${state_name} ..." | tee -a "${LOG_FILE}" @@ -636,7 +641,7 @@ if [[ ${state_recorded} == "0" && $(hostname) == "${PRIMARY_NODE}" ]]; then # work due to helm hooks. Making this work on both isn't really worth the # time so just constrain this block of logic to 0.14.1 where we know its # needed. - gate="0.14.1" + gate="1.5.5" found=$(helm list -n cert-manager --filter 'cray-certmanager$' | awk '/deployed/ {print $10}') needs_upgrade=0 @@ -647,20 +652,19 @@ if [[ ${state_recorded} == "0" && $(hostname) == "${PRIMARY_NODE}" ]]; then else printf "note: cert-manager helm chart version %s\n" "${found}" >&2 - # We might be rerunning from a pre 1.5.x install and there is no + # We might be rerunning from a pre 1.6.x install and there is no # cert-manager installed due to a prior removal if [ "${found}" = "" ]; then printf "note: no helm install appears to exist for cert-manager, likely this state is being run again\n" >&2 ((needs_upgrade += 1)) else - printf "note: no cert-manager upgrade steps needed, cert-manager 0.14.1 is not installed\n" >&2 + printf "note: no cert-manager upgrade steps needed, cert-manager 1.5.5 is not installed\n" >&2 fi fi - # Only run if we need to and detected not 0.14.1 or "" + # Only run if we need to and detected not 1.12.9 or "" if [ "${needs_upgrade}" -gt 0 ]; then cmns="cert-manager" - cminitns="cert-manager-init" backup_secret="cm-restore-data" @@ -671,10 +675,6 @@ if [[ ${state_recorded} == "0" && $(hostname) == "${PRIMARY_NODE}" ]]; then ((needs_backup += 1)) fi - if has_cm_init ${cminitns}; then - ((needs_backup += 1)) - fi - # Ok so the gist of this "backup" is we back up all the cert-manager data as # guided by them. The secret we use for this is only kept around until this # prereq state completes. @@ -688,35 +688,26 @@ if [[ ${state_recorded} == "0" && $(hostname) == "${PRIMARY_NODE}" ]]; then fi fi - # Only remove these charts if installed + # Only remove cray-certmanager if installed if has_craycm ${cmns}; then helm uninstall -n "${cmns}" cray-certmanager fi - if has_cm_init ${cminitns}; then - helm uninstall -n "${cminitns}" cray-certmanager-init - fi - # Note: These should *never* fail as we depend on helm uninstall doing # its job, but if it didn't exit early here as something is amiss. cm=1 - cminit=1 if ! helm list -n "${cmns}" --filter 'cray-certmanager$' | grep cray-certmanager > /dev/null 2>&1; then cm=0 fi - if ! 
helm list -n "${cminitns}" --filter cray-certmanager-init | grep cray-certmanager-init > /dev/null; then - cminit=0 - fi - - if [ "${cm}" = "1" ] || [ "${cminit}" = "1" ]; then - printf "fatal: helm uninstall did not remove expected charts, cert-manager %s cert-manager-init %s\n" "${cm}" "${cminit}" >&2 + if [ "${cm}" = "1" ]; then + printf "fatal: helm uninstall did not remove expected chart cert-manager %s\n" "${cm}" >&2 exit 1 fi # Ensure the cert-manager namespace is deleted in a case of both helm charts - # removed but there might be detritus leftover in the namespace. + # removed but there might be detritous leftover in the namespace. kubectl delete namespace "${cmns}" || : tmp_manifest=/tmp/certmanager-tmp-manifest.yaml @@ -747,7 +738,7 @@ EOF done platform="${CSM_MANIFESTS_DIR}/platform.yaml" - for chart in cray-drydock cray-certmanager cray-certmanager-issuers; do + for chart in cray-certmanager cray-certmanager-issuers; do printf " -\n" >> "${tmp_manifest}" yq r "${platform}" 'spec.charts.(name=='${chart}')' | sed 's/^/ /' >> "${tmp_manifest}" done From 1160121e2b614813c7adf5ad436815367b87121f Mon Sep 17 00:00:00 2001 From: Mitch Harding Date: Tue, 9 Jul 2024 15:17:25 -0400 Subject: [PATCH 22/37] CASMTRIAGE-7122: ncn-upgrade-master-nodes.sh: Add retries of docs-csm RPM install (#5208) --- .../upgrade/ncn-upgrade-master-nodes.sh | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/upgrade/scripts/upgrade/ncn-upgrade-master-nodes.sh b/upgrade/scripts/upgrade/ncn-upgrade-master-nodes.sh index afb872d87bde..65518654abbf 100755 --- a/upgrade/scripts/upgrade/ncn-upgrade-master-nodes.sh +++ b/upgrade/scripts/upgrade/ncn-upgrade-master-nodes.sh @@ -187,9 +187,23 @@ state_recorded=$(is_state_recorded "${state_name}" ${target_ncn}) if [[ $state_recorded == "0" ]]; then echo "====> ${state_name} ..." { - record_state "${state_name}" ${target_ncn} scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null /root/docs-csm-latest.noarch.rpm $target_ncn:/root/docs-csm-latest.noarch.rpm - ssh $target_ncn "rpm --force -Uvh /root/docs-csm-latest.noarch.rpm" + # CASMTRIAGE-7122: This RPM install can fail if it happens while CFS is installing RPMs + # Therefore, we retry a limited number of times before giving up + attempt=0 + while [[ true ]]; do + if [[ ${attempt} -gt 0 ]]; then + # Wait briefly before trying again + sleep 5 + fi + if [[ ${attempt} -lt 12 ]]; then + let attempt+=1 + ssh $target_ncn "rpm --force -Uvh /root/docs-csm-latest.noarch.rpm" && break || continue + fi + # Final attempt. 
The lack of the || continue means that this will cause the script to + # fail (since it runs with set -e) if the command fails + ssh $target_ncn "rpm --force -Uvh /root/docs-csm-latest.noarch.rpm" && break + done } >> ${LOG_FILE} 2>&1 record_state "${state_name}" ${target_ncn} else From f923c3f4f78a6a96d441bdc127c235e2629bb0a7 Mon Sep 17 00:00:00 2001 From: Mitch Harding Date: Tue, 9 Jul 2024 15:23:54 -0400 Subject: [PATCH 23/37] CASMTRIAGE-7131: Correct BOS CLI commands (#5216) * CASMTRIAGE-7131: Correct BOS CLI commands; minor linting * Placate altered linter rules --- .../boot_orchestration/BOS_Workflows.md | 97 ++++++++++++------- operations/boot_orchestration/Cheatsheet.md | 16 +-- ...Template_to_Boot_Compute_Nodes_with_CPS.md | 2 +- ...emplate_to_Boot_Compute_Nodes_with_SBPS.md | 2 +- .../Manage_a_BOS_Session.md | 2 +- 5 files changed, 71 insertions(+), 48 deletions(-) diff --git a/operations/boot_orchestration/BOS_Workflows.md b/operations/boot_orchestration/BOS_Workflows.md index 26791deca819..713313e02b1e 100644 --- a/operations/boot_orchestration/BOS_Workflows.md +++ b/operations/boot_orchestration/BOS_Workflows.md @@ -1,25 +1,28 @@ # BOS Workflows The following workflows present a high-level overview of common Boot Orchestration Service \(BOS\) operations. -These workflows depict how services interact with each other when booting, configuring, or shutting down nodes. They also help provide a quicker and deeper understanding of how the system functions. +These workflows depict how services interact with each other when booting, configuring, or shutting down nodes. +They also help provide a quicker and deeper understanding of how the system functions. * [Terminology](#terminology) * [Workflows] - * [Boot nodes](#boot-nodes) - * [Reboot nodes](#reboot-nodes) - * [Power off nodes](#power-off-nodes) + * [Boot nodes](#boot-nodes) + * [Reboot nodes](#reboot-nodes) + * [Power off nodes](#power-off-nodes) ## Terminology The following are mentioned in the workflows: -* Boot Orchestration Service \(BOS\) is responsible for booting, configuring, and shutting down collections of nodes. The Boot Orchestration Service has the following components: - * A BOS session template is a collection of one or more boot sets. A boot set defines a collection of nodes and the information about the boot artifacts and parameters. - Session templates also include information on which [Configuration Framework Service (CFS)](../../glossary.md#configuration-framework-service-cfs) configuration should be applied. - * BOS sessions provide a way to apply a template across a group of nodes and monitor the progress of those nodes as they move toward their desired state. - * BOS operators interact with other services to perform actions on nodes, moving them toward their desired state. -* [Cray Advanced Platform Monitoring and Control (CAPMC)](../../glossary.md#cray-advanced-platform-monitoring-and-control-capmc) service provides system-level power control for nodes in the system. - CAPMC interfaces directly with the Redfish APIs to the controller infrastructure to effect power and environmental changes on the system. +* Boot Orchestration Service \(BOS\) is responsible for booting, configuring, and shutting down collections of nodes. + The Boot Orchestration Service has the following components: + * A BOS session template is a collection of one or more boot sets. A boot set defines a collection of nodes and the information about the boot artifacts and parameters. 
+ Session templates also include information on which [Configuration Framework Service (CFS)](../../glossary.md#configuration-framework-service-cfs) configuration should + be applied. + * BOS sessions provide a way to apply a template across a group of nodes and monitor the progress of those nodes as they move toward their desired state. + * BOS operators interact with other services to perform actions on nodes, moving them toward their desired state. +* [Cray Advanced Platform Monitoring and Control (CAPMC)](../../glossary.md#cray-advanced-platform-monitoring-and-control-capmc) service provides system-level power control + for nodes in the system. CAPMC interfaces directly with the Redfish APIs to the controller infrastructure to effect power and environmental changes on the system. * [Hardware State Manager (HSM)](../../glossary.md#hardware-state-manager-hsm) tracks the state of each node and its group and role associations. * [Boot Script Service (BSS)](../../glossary.md#boot-script-service-bss) stores per-node information about the iPXE boot script. When booting or rebooting, nodes consult BSS for boot artifacts \(kernel, `initrd`, image root\) and boot parameters. @@ -71,28 +74,32 @@ The following workflows are included in this section: 1. **Administrator creates a BOS session template** - A session template is a collection of data specifying a group of nodes, as well as the boot artifacts and configuration that should be applied to them. A session template can be created from a JSON structure. It returns a Session Template ID if successful. + A session template is a collection of data specifying a group of nodes, as well as the boot artifacts and configuration that should be applied to them. + A session template can be created from a JSON structure. It returns a session template ID if successful. - See [Manage a Session Template](Manage_a_Session_Template.md) for more information. + See [Manage a session template](Manage_a_Session_Template.md) for more information. 1. **Administrator creates a session** - Create a session to perform the operation specified in the operation request parameter on the boot set defined in the session template. For this use case, Administrator creates a session with operation as Boot and specifies the session template ID. + Create a session to perform the operation specified in the operation request parameter on the boot set defined in the session template. For this use case, + the administrator creates a session with operation as `boot` and specifies the session template ID. (`ncn-mw#`) ```bash - cray bos v2 sessions create --template-name SESSIONTEMPLATE_NAME --operation Boot + cray bos v2 sessions create --template-name SESSIONTEMPLATE_NAME --operation boot ``` 1. **Session setup operator** The creation of a session causes the session-setup operator to set a desired state on all components listed in the session template. - This includes pulling files from S3 to determine boot artifacts like kernel, `initrd`, and root file system. The session setup operator also enables the relevant components at this time. + This includes pulling files from S3 to determine boot artifacts like kernel, `initrd`, and root file system. The session setup operator also enables the relevant + components at this time. 1. **Status operator (powering-on)** - The status operator will detect the enabled components and assign them a phase. This involves checking the current state of the node, including communicating with HSM to determine the current power status of the node. 
+ The status operator will detect the enabled components and assign them a phase. This involves checking the current state of the node, including communicating with HSM + to determine the current power status of the node. In this example of booting nodes, the first phase is `powering-on`. If queried at this point, the nodes will have a status of `power-on-pending`. For more on component phase and status, see [Component Status](Component_Status.md) @@ -103,7 +110,8 @@ The following workflows are included in this section: If configuration is enabled for the node, the power-on operator will also call CFS to set the desired configuration and disable the node with CFS. The node must be disabled within CFS so that CFS does not try to configure node until it has booted. The power-on operator then calls CAPMC to power-on the node. - Lastly, the power-on operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of `power-on-called`. + Lastly, the power-on operator will update the state of the node in BOS, including setting the last action. If queried at this point, + the nodes will have a status of `power-on-called`. 1. **CAPMC boots nodes** @@ -120,7 +128,8 @@ The following workflows are included in this section: 1. **Status operator (configuring)** The status operator monitors a node's power state until HSM reports that the power state is on. - When the power state for a node is on, the status operator will either set the phase to `configuring` if CFS configuration is required or it will clear the current phase if the node is in its final state. + When the power state for a node is on, the status operator will either set the phase to `configuring` if CFS configuration is required or it will clear the current phase + if the node is in its final state. 1. **CFS applies configuration** @@ -173,28 +182,32 @@ The following workflows are included in this section: 1. **Administrator creates a BOS session template** - A session template is a collection of data specifying a group of nodes, as well as the boot artifacts and configuration that should be applied to them. A session template can be created from a JSON structure. It returns a Session Template ID if successful. + A session template is a collection of data specifying a group of nodes, as well as the boot artifacts and configuration that should be applied to them. + A session template can be created from a JSON structure. It returns a session template ID if successful. - See [Manage a Session Template](Manage_a_Session_Template.md) for more information. + See [Manage a session template](Manage_a_Session_Template.md) for more information. 1. **Administrator creates a session** - Create a session to perform the operation specified in the operation request parameter on the boot set defined in the session template. For this use case, the administrator creates a session with operation as Boot and specifies the session template ID. + Create a session to perform the operation specified in the operation request parameter on the boot set defined in the session template. For this use case, + the administrator creates a session with operation as `reboot` and specifies the session template ID. (`ncn-mw#`) ```bash - cray bos v2 sessions create --template-name SESSIONTEMPLATE_NAME --operation Reboot + cray bos v2 sessions create --template-name SESSIONTEMPLATE_NAME --operation reboot ``` 1. 
**Session setup operator** The creation of a session causes the session-setup operator to set a desired state on all components listed in the session template. - This includes pulling files from S3 to determine boot artifacts like kernel, `initrd`, and root file system. The session setup operator also enables the relevant components at this time. + This includes pulling files from S3 to determine boot artifacts like kernel, `initrd`, and root file system. The session setup operator also enables the relevant + components at this time. 1. **Status operator (powering-off)** - The status operator will detect the enabled components and assign them a phase. This involves checking the current state of the node, including communicating with HSM to determine the current power status of the node. + The status operator will detect the enabled components and assign them a phase. This involves checking the current state of the node, including communicating with + HSM to determine the current power status of the node. In this example of rebooting nodes, the first phase is `powering-off`. If queried at this point, the nodes will have a status of `power-off-pending`. For more on component phase and status, see [Component Status](Component_Status.md) @@ -202,12 +215,14 @@ The following workflows are included in this section: 1. **Graceful-power-off operator** The power-off operator will detect nodes with a `power-off-pending` status, calls CAPMC to power-off the node. - Then, the power-off operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of `power-off-gracefully-called`. + Then, the power-off operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of + `power-off-gracefully-called`. 1. **Forceful-power-off operator** If powering-off is taking too long, the forceful-power-off will take over. It also calls CAPMC to power-off the node, but with the addition of the forceful flag. - Then, the power-off operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of `power-off-forcefully-called`. + Then, the power-off operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of + `power-off-forcefully-called`. 1. **CAPMC powers off nodes** @@ -216,7 +231,8 @@ The following workflows are included in this section: 1. **Status operator (powering-on)** The status operator monitors a node's power state until HSM reports that the power state is off. - When the power state for a node is off, the status operator will set the phase to `powering-on`. If queried at this point, the nodes will have a status of `power-on-pending`. + When the power state for a node is off, the status operator will set the phase to `powering-on`. If queried at this point, the nodes will have a status of + `power-on-pending`. 1. **Power-on operator** @@ -224,7 +240,8 @@ The following workflows are included in this section: If configuration is enabled for the node, the power-on operator will also call CFS to set the desired configuration and disable the node with CFS. The node must be disabled within CFS so that CFS does not try to configure node until it has booted. The power-on operator then calls CAPMC to power-on the node. 
- Lastly, the power-on operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of `power-on-called`. + Lastly, the power-on operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of + `power-on-called`. 1. **CAPMC boots nodes** @@ -241,7 +258,8 @@ The following workflows are included in this section: 1. **Status operator (configuring)** The status operator monitors a node's power state until HSM reports that the power state is on. - When the power state for a node is on, the status operator will either set the phase to `configuring` if CFS configuration is required or it will clear the current phase if the node is in its final state. + When the power state for a node is on, the status operator will either set the phase to `configuring` if CFS configuration is required or it will clear the current + phase if the node is in its final state. 1. **CFS applies configuration** @@ -268,18 +286,20 @@ The following workflows are included in this section: 1. **Administrator creates a BOS session template** - A session template is a collection of data specifying a group of nodes, as well as the boot artifacts and configuration that should be applied to them. A session template can be created from a JSON structure. It returns a Session Template ID if successful. + A session template is a collection of data specifying a group of nodes, as well as the boot artifacts and configuration that should be applied to them. A session + template can be created from a JSON structure. It returns a session template ID if successful. - See [Manage a Session Template](Manage_a_Session_Template.md) for more information. + See [Manage a session template](Manage_a_Session_Template.md) for more information. 1. **Administrator creates a session** - Create a session to perform the operation specified in the operation request parameter on the boot set defined in the session template. For this use case, the administrator creates a session with operation as Boot and specifies the session template ID. + Create a session to perform the operation specified in the operation request parameter on the boot set defined in the session template. For this use case, + the administrator creates a session with operation as `shutdown` and specifies the session template ID. (`ncn-mw#`) ```bash - cray bos v2 sessions create --template-name SESSIONTEMPLATE_NAME --operation Reboot + cray bos v2 sessions create --template-name SESSIONTEMPLATE_NAME --operation shutdown ``` 1. **Session setup operator** @@ -290,7 +310,8 @@ The following workflows are included in this section: 1. **Status operator (powering-off)** - The status operator will detect the enabled components and assign them a phase. This involves checking the current state of the node, including communicating with HSM to determine the current power status of the node. + The status operator will detect the enabled components and assign them a phase. This involves checking the current state of the node, including communicating + with HSM to determine the current power status of the node. In this example of booting nodes, the first phase is `powering-off`. If queried at this point, the nodes will have a status of `power-off-pending`. For more on component phase and status, see [Component Status](Component_Status.md) @@ -298,12 +319,14 @@ The following workflows are included in this section: 1. 
**Graceful-power-off operator** The power-off operator will detect nodes with a `power-off-pending` status, calls CAPMC to power-off the node. - Then, the power-off operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of `power-off-gracefully-called`. + Then, the power-off operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of + `power-off-gracefully-called`. 1. **Forceful-power-off operator** If powering-off is taking too long, the forceful-power-off will take over. It also calls CAPMC to power-off the node, but with the addition of the forceful flag. - Then, the power-off operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of `power-off-forcefully-called`. + Then, the power-off operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of + `power-off-forcefully-called`. 1. **CAPMC powers off nodes** diff --git a/operations/boot_orchestration/Cheatsheet.md b/operations/boot_orchestration/Cheatsheet.md index 5eb422041523..7a658c429bb5 100644 --- a/operations/boot_orchestration/Cheatsheet.md +++ b/operations/boot_orchestration/Cheatsheet.md @@ -9,25 +9,25 @@ To find the API versions of any commands listed, add `-vvv` to the end of the CL * (`ncn-mw#`) Boot all nodes in a template: ```bash - cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation Boot + cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation boot ``` * (`ncn-mw#`) Reboot all nodes in a template: ```bash - cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation Reboot + cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation reboot ``` * (`ncn-mw#`) Shutdown all nodes in a template: ```bash - cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation Shutdown + cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation shutdown ``` * (`ncn-mw#`) Stage a reboot for all nodes in a template: ```bash - cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation Reboot --staged True + cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation reboot --staged True ``` ## Single node commands @@ -35,25 +35,25 @@ To find the API versions of any commands listed, add `-vvv` to the end of the CL * (`ncn-mw#`) Boot a single node: ```bash - cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation Boot --limit + cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation boot --limit ``` * (`ncn-mw#`) Reboot a single node: ```bash - cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation Reboot --limit + cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation reboot --limit ``` * (`ncn-mw#`) Shutdown a single node: ```bash - cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation Shutdown --limit + cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation shutdown --limit ``` * (`ncn-mw#`) Stage a reboot for a single node: ```bash - cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation Reboot --staged True --limit + cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME 
--operation reboot --staged True --limit ``` * (`ncn-mw#`) Monitor the overall boot progress of a single node: diff --git a/operations/boot_orchestration/Create_a_Session_Template_to_Boot_Compute_Nodes_with_CPS.md b/operations/boot_orchestration/Create_a_Session_Template_to_Boot_Compute_Nodes_with_CPS.md index 193764c62b28..3721f481d525 100644 --- a/operations/boot_orchestration/Create_a_Session_Template_to_Boot_Compute_Nodes_with_CPS.md +++ b/operations/boot_orchestration/Create_a_Session_Template_to_Boot_Compute_Nodes_with_CPS.md @@ -125,7 +125,7 @@ Refer to [Manage a Session Template](Manage_a_Session_Template.md) for more info (`ncn-mw#`) The new CPS-based session template can be used when creating a BOS session. The following is an example of creating a reboot session using the CLI: ```bash -cray bos v2 sessions create --template-name cps_rootfs_template --operation Reboot +cray bos v2 sessions create --template-name cps_rootfs_template --operation reboot ``` ## Appendix: `root=` kernel parameter diff --git a/operations/boot_orchestration/Create_a_Session_Template_to_Boot_Compute_Nodes_with_SBPS.md b/operations/boot_orchestration/Create_a_Session_Template_to_Boot_Compute_Nodes_with_SBPS.md index fba11720c208..adcb1f2554bb 100644 --- a/operations/boot_orchestration/Create_a_Session_Template_to_Boot_Compute_Nodes_with_SBPS.md +++ b/operations/boot_orchestration/Create_a_Session_Template_to_Boot_Compute_Nodes_with_SBPS.md @@ -165,7 +165,7 @@ Refer to [Manage a Session Template](Manage_a_Session_Template.md) for more info (`ncn-mw#`) The new CPS-based session template can be used when creating a BOS session. The following is an example of creating a reboot session using the CLI: ```bash -cray bos v2 sessions create --template-name cps_rootfs_template --operation Reboot +cray bos v2 sessions create --template-name cps_rootfs_template --operation reboot ``` ## Appendix: `root=` kernel parameter diff --git a/operations/boot_orchestration/Manage_a_BOS_Session.md b/operations/boot_orchestration/Manage_a_BOS_Session.md index 4a027b1be85d..08c43e432377 100644 --- a/operations/boot_orchestration/Manage_a_BOS_Session.md +++ b/operations/boot_orchestration/Manage_a_BOS_Session.md @@ -19,7 +19,7 @@ Creating a new BOS session requires the following command-line options: (`ncn-mw#`): The following is a boot operation: ```bash -cray bos v2 sessions create --template-name --operation Boot --format json +cray bos v2 sessions create --template-name --operation boot --format json ``` Example output: From 9ca86fe927646e5c9e8ddf4a91b3bf98cbebe334 Mon Sep 17 00:00:00 2001 From: Chris Spiller <86013738+spillerc-hpe@users.noreply.github.com> Date: Tue, 9 Jul 2024 20:29:16 +0100 Subject: [PATCH 24/37] CASMNET-2179 - Document kea arm64 and node-specific boot file functionality (#5214) * CASMNET-2179 - Document kea arm64 and node-specific boot file functionality * Update operations/network/dhcp/Customize_boot_file.md Signed-off-by: Nathan Rockershousen * Syntax Highlighting Signed-off-by: Russell Bunch --------- Signed-off-by: Nathan Rockershousen Signed-off-by: Russell Bunch Co-authored-by: Nathan Rockershousen Co-authored-by: Russell Bunch --- operations/README.md | 1 + .../network/dhcp/Customize_boot_file.md | 154 ++++++++++++++++++ 2 files changed, 155 insertions(+) create mode 100644 operations/network/dhcp/Customize_boot_file.md diff --git a/operations/README.md b/operations/README.md index dbaef7073ab7..8aacc77939ba 100644 --- a/operations/README.md +++ b/operations/README.md @@ -654,6 +654,7 @@ The DHCP 
service on the HPE Cray EX system uses the Internet Systems Consortium

 - [DHCP](network/dhcp/DHCP.md)
 - [Troubleshoot DHCP Issues](network/dhcp/Troubleshoot_DHCP_Issues.md)
+- [DHCP boot file customization](network/dhcp/Customize_boot_file.md)

 ### Domain Name Service (DNS)

diff --git a/operations/network/dhcp/Customize_boot_file.md b/operations/network/dhcp/Customize_boot_file.md
new file mode 100644
index 000000000000..eaee15f64bfc
--- /dev/null
+++ b/operations/network/dhcp/Customize_boot_file.md
@@ -0,0 +1,154 @@
+# DHCP boot file customization
+
+* [DHCP boot file customization](#dhcp-boot-file-customization)
+  * [Background](#background)
+  * [Override the boot file name](#override-the-boot-file-name)
+  * [Verify the node DHCP configuration has been updated](#verify-the-node-dhcp-configuration-has-been-updated)
+  * [Reset the boot file name to default](#reset-the-boot-file-name-to-default)
+
+## Background
+
+The `cray-dhcp-kea` service is configured to send a CPU architecture appropriate boot file based on the value received in
+the client system architecture field (option 93) of the incoming DHCP request. By default, `cray-dhcp-kea` will send
+the following in the DHCP boot file name field (option 67) of the DHCP response.
+
+| Option 93 value         | Filename         |
+|-------------------------|------------------|
+| `0x7` - x64 UEFI        | `ipxe.efi`       |
+| `0xb` - ARM 64-bit UEFI | `ipxe.arm64.efi` |
+
+It may be desirable to use a different boot file to the default one for testing or debugging purposes. This document
+describes how the boot file name may be overridden on a per-node basis.
+
+## Override the boot file name
+
+1. (`ncn#`) Determine the HSM `ethernetInterfaces` record for the node.
+
+   ```bash
+   cray hsm inventory ethernetInterfaces list --component-id x3000c0s17b4n0
+   ```
+
+   Example output:
+
+   ```toml
+   [[results]]
+   ID = "b42e99dfec47"
+   Description = ""
+   MACAddress = "b4:2e:99:df:ec:47"
+   LastUpdate = "2024-07-01T11:31:24.942557Z"
+   ComponentID = "x3000c0s17b4n0"
+   Type = "Node"
+     [[results.IPAddresses]]
+     IPAddress = "10.106.0.15"
+   ```
+
+1. (`ncn#`) Set the desired boot file name by adding the `ipxe` option to the `Description` field of the HSM `ethernetInterfaces` record.
+
+   This example will set the boot file name to `ipxe.test`.
+
+   ```bash
+   cray hsm inventory ethernetInterfaces update b42e99dfec47 --description="ipxe=ipxe.test"
+   ```
+
+   Example output:
+
+   ```toml
+   ID = "b42e99dfec47"
+   Description = "ipxe=ipxe.test"
+   MACAddress = "b4:2e:99:df:ec:47"
+   LastUpdate = "2024-04-25T06:28:34.825112Z"
+   ComponentID = "x3000c0s17b4n0"
+   Type = "Node"
+   [[IPAddresses]]
+   IPAddress = "10.106.0.15"
+   ```
+
+## Verify the node DHCP configuration has been updated
+
+1. (`ncn#`) Retrieve a token.
+
+   ```bash
+   export TOKEN=$(curl -s -k -S -d grant_type=client_credentials -d client_id=admin-client \
+   -d client_secret=`kubectl get secrets admin-client-auth -o jsonpath='{.data.client-secret}' | base64 -d` \
+   https://api-gw-service-nmn.local/keycloak/realms/shasta/protocol/openid-connect/token | jq -r '.access_token')
+   ```
+
+2. (`ncn#`) Dump the DHCP server configuration.
+
+   **`NOTE`** It may take up to two minutes for the change to HSM to be reflected in the DHCP server configuration, as the DHCP helper has to run to update the configuration.
+ + ```bash + curl -H "Authorization: Bearer ${TOKEN}" -X POST -H "Content-Type: application/json" \ + -d '{ "command": "config-get", "service": [ "dhcp4" ] }' \ + https://api-gw-service-nmn.local/apis/dhcp-kea | jq + ``` + + The `boot-file-name` field for the node should reflect the desired boot file name. + + Example output: + + ```json + { + "boot-file-name": "ipxe.test", + "client-classes": [], + "hostname": "nid000004", + "hw-address": "b4:2e:99:df:ec:47", + "ip-address": "10.106.0.15", + "next-server": "0.0.0.0", + "option-data": [], + "server-hostname": "" + } + ``` + +When the node boots, it should now boot using the desired boot file. + +Example output: + +```text +2024-06-05 12:33:18 >>Start PXE over IPv4 on MAC: B4-2E-99-DF-EC-47. Press ESC key to abort PXE boot. +2024-06-05 12:33:26 Station IP address is 10.106.0.15 +2024-06-05 12:33:26 +2024-06-05 12:33:26 Server IP address is 10.92.100.60 +2024-06-05 12:33:26 NBP filename is ipxe.test +``` + +## Reset the boot file name to default + +1. (`ncn#`) Remove the `ipxe=` setting from the HSM `ethernetInterfaces` record. + + ```bash + cray hsm inventory ethernetInterfaces update b42e99dfec47 --description="" + ``` + + Example output: + + ```toml + ID = "b42e99dfec47" + Description = "" + MACAddress = "b4:2e:99:df:ec:47" + LastUpdate = "2024-04-25T06:28:34.825112Z" + ComponentID = "x3000c0s17b4n0" + Type = "Node" + [[IPAddresses]] + IPAddress = "10.106.0.15" + ``` + +1. Verify the node configuration. + + Use the [Verify the node DHCP configuration has been updated](#verify-the-node-dhcp-configuration-has-been-updated) procedure to verify the configuration for the node. + The `boot-file-name` field should be empty indicating that the DHCP service will supply the default boot file name. + + Example output: + + ```json + { + "boot-file-name": "", + "client-classes": [], + "hostname": "nid000004", + "hw-address": "b4:2e:99:df:ec:47", + "ip-address": "10.106.0.15", + "next-server": "0.0.0.0", + "option-data": [], + "server-hostname": "" + } + ``` From 3aa3661dfaa01b08c21ca15f86e31ca87b9a8fe7 Mon Sep 17 00:00:00 2001 From: shreni123 <53111642+shreni123@users.noreply.github.com> Date: Thu, 11 Jul 2024 00:40:21 +0530 Subject: [PATCH 25/37] CASMMON-401: Upgrade and fresh install changes for Victoria-metrics (#5207) CASMMON-401: Upgrade and fresh install changes into csm and docs-csm for Victoriametrics Co-authored-by: Rambabu Bolla --- .../upgrade/util/sysmgmt-health-upgrade.sh | 116 +++++------------- .../upgrade/util/update-customizations.sh | 45 +++---- 2 files changed, 42 insertions(+), 119 deletions(-) diff --git a/upgrade/scripts/upgrade/util/sysmgmt-health-upgrade.sh b/upgrade/scripts/upgrade/util/sysmgmt-health-upgrade.sh index a24714050ece..4d53b82e39e9 100755 --- a/upgrade/scripts/upgrade/util/sysmgmt-health-upgrade.sh +++ b/upgrade/scripts/upgrade/util/sysmgmt-health-upgrade.sh @@ -23,100 +23,40 @@ # OTHER DEALINGS IN THE SOFTWARE. # -# Function to check cray-sysmgmt-health chart with app version 9.3.1 for prometheus-operator and retain old PVs data. +# Function to check cray-sysmgmt-health chart with app version 45.1 for kube-prometheus-stack and delete old PVCs. function sysmgmt_health() { echo "Checking for chart version of cray-sysmgmt-health" version="45.1" - if [ ! -z "$(helm ls -o json --namespace sysmgmt-health | jq -r --argjson version $version '.[] | select(.app_version | sub(".[0-9]$";"") | tonumber | . 
< $version).name')" ]; then - prom_pvc="prometheus-cray-sysmgmt-health-promet-prometheus-db-prometheus-cray-sysmgmt-health-promet-prometheus-0" - alert_pvc="alertmanager-cray-sysmgmt-health-promet-alertmanager-db-alertmanager-cray-sysmgmt-health-promet-alertmanager-0" - echo "Get PV for both prometheus and Alertmanager" - prom_pv=$(kubectl get pvc -n sysmgmt-health -o jsonpath='{.spec.volumeName}' $prom_pvc) - alert_pv=$(kubectl get pvc -n sysmgmt-health -o jsonpath='{.spec.volumeName}' $alert_pvc) - prom_pv="${prom_pv//[\",]/}" - alert_pv="${alert_pv//[\",]/}" - echo "Prometheus PV: $prom_pv" - echo "Alertmanager PV: $alert_pv" - - # Patch the PersistenceVolume created/used by the prometheus-operator and alertmanager to Retain claim policy - prom_pv_reclaim=$(kubectl get pv -o jsonpath='{.spec.persistentVolumeReclaimPolicy}' $prom_pv) - alert_pv_reclaim=$(kubectl get pv -o jsonpath='{.spec.persistentVolumeReclaimPolicy}' $alert_pv) - prom_pv_reclaim="${prom_pv_reclaim//[\",]/}" - alert_pv_reclaim="${alert_pv_reclaim//[\",]/}" - if [ "$prom_pv_reclaim" != Retain ] && [ "$alert_pv_reclaim" != Retain ]; then - kubectl patch pv/$prom_pv -p '{"spec":{"persistentVolumeReclaimPolicy":"Retain"}}' - kubectl patch pv/$alert_pv -p '{"spec":{"persistentVolumeReclaimPolicy":"Retain"}}' - else - echo "PVs persistentVolumeReclaimPolicy is already Retain" - fi - - # Uninstall the cray-sysmgmt-health release - helm ls -o json --namespace sysmgmt-health | jq -r --argjson version $version '.[] | select(.app_version | sub(".[0-9]$";"") | tonumber | . < $version).name' | xargs -L1 helm uninstall --namespace sysmgmt-health - - # Delete the existing PersistentVolumeClaim, and verify PV become Released. - prom_pv_reclaim=$(kubectl get pv -o jsonpath='{.spec.persistentVolumeReclaimPolicy}' $prom_pv) - alert_pv_reclaim=$(kubectl get pv -o jsonpath='{.spec.persistentVolumeReclaimPolicy}' $alert_pv) - prom_pv_reclaim="${prom_pv_reclaim//[\",]/}" - alert_pv_reclaim="${alert_pv_reclaim//[\",]/}" - if [ "$prom_pv_reclaim" == Retain ] && [ "$alert_pv_reclaim" == Retain ]; then - kubectl delete pvc/$prom_pvc -n sysmgmt-health - kubectl delete pvc/$alert_pvc -n sysmgmt-health - prom_pv_phase=$(kubectl get pv -o jsonpath='{.status.phase}' $prom_pv) - alert_pv_phase=$(kubectl get pv -o jsonpath='{.status.phase}' $alert_pv) - prom_pv_phase="${prom_pv_phase//[\",]/}" - alert_pv_phase="${alert_pv_phase//[\",]/}" - echo "Verifying whether PVs became Released or not." - sleep 5 - if [ "$alert_pv_phase" == Released ] && [ "$prom_pv_phase" == Released ]; then - echo "Both Prometheus and Alertmanager PVs are Released" - else - echo >&2 "PVs are not Released. Verify if PV exists or not." - echo "Prometheus PV: $prom_pv" - echo "Alertmanager PV: $alert_pv" - exit - fi - - # Remove the cray-sysmgmt-health-promet-kubelet service. - echo "Deleting cray-sysmgmt-health-promet-kubelet service in kube-system namespace." - kubectl delete service/cray-sysmgmt-health-promet-kubelet -n kube-system - - # Remove all the existing CRDs (ServiceMonitors, Podmonitors, etc.) 
- echo "Deleting sysmgmt-health existing CRDs" - for c in $(kubectl get crds -A -o jsonpath='{range .items[?(@.metadata.annotations.controller-gen\.kubebuilder\.io\/version=="v0.2.4")]}{.metadata.name}{"\n"}{end}'); do - kubectl delete crd ${c} - done - else - echo >&2 "PersistenceVolume created/used by the prometheus-operator and alertmanager is not Retain claim policy" - echo >&2 "Exiting" - exit - fi - - # Remove current spec.claimRef values to change the PV's status from Released to Available. - if [ "$alert_pv_phase" == Released ] && [ "$prom_pv_phase" == Released ]; then - kubectl patch pv/$prom_pv --type json -p='[{"op": "remove", "path": "/spec/claimRef"}]' - kubectl patch pv/$alert_pv --type json -p='[{"op": "remove", "path": "/spec/claimRef"}]' - prom_pv_phase=$(kubectl get pv -o jsonpath='{.status.phase}' $prom_pv) - alert_pv_phase=$(kubectl get pv -o jsonpath='{.status.phase}' $alert_pv) - prom_pv_phase="${prom_pv_phase//[\",]/}" - alert_pv_phase="${alert_pv_phase//[\",]/}" - echo "Verifying whether PV became Available or not." - sleep 5 - if [ "$alert_pv_phase" == Available ] && [ "$prom_pv_phase" == Available ]; then - echo "Both Prometheus and Alertmanager PVs are Available. Ready to deploy the latest cray-sysmgmt-chart now." - else - echo >&2 "PVs are not Available. Verify if PV exists or not." - echo "Prometheus PV: $prom_pv" - echo "Alertmanager PV: $alert_pv" - exit - fi - else - echo "PV's status is not Released. Exiting" - exit - fi + if [ "$(helm ls -o json --namespace sysmgmt-health | jq -r --argjson version $version '.[] | select(.app_version | sub(".[0-9]$";"") | tonumber | . = $version).name')" ]; then + prom0_pvc="prometheus-cray-sysmgmt-health-kube-p-prom-db-prometheus-cray-sysmgmt-health-kube-p-prom-0" + prom1_pvc="prometheus-cray-sysmgmt-health-kube-p-prom-db-prometheus-cray-sysmgmt-health-kube-p-prom-1" + prom0_shard_pvc="prometheus-cray-sysmgmt-health-kube-p-prom-db-prometheus-cray-sysmgmt-health-kube-p-prom-shard-1-0" + prom1_shard_pvc="prometheus-cray-sysmgmt-health-kube-p-prom-db-prometheus-cray-sysmgmt-health-kube-p-prom-shard-1-1" + alert_pvc="alertmanager-cray-sysmgmt-health-kube-p-alertmanager-db-alertmanager-cray-sysmgmt-health-kube-p-alertmanager-0" + thanos_ruler_pvc="thanos-ruler-kube-prometheus-stack-thanos-ruler-data-thanos-ruler-kube-prometheus-stack-thanos-ruler-0" + + # Uninstall the cray-sysmgmt-health and delete PVCs + helm ls -o json --namespace sysmgmt-health | jq -r --argjson version $version '.[] | select(.app_version | sub(".[0-9]$";"") | tonumber | . = $version).name' | xargs -L1 helm uninstall --namespace sysmgmt-health + + kubectl delete pvc/$prom0_pvc -n sysmgmt-health + kubectl delete pvc/$prom1_pvc -n sysmgmt-health + kubectl delete pvc/$prom0_shard_pvc -n sysmgmt-health + kubectl delete pvc/$prom1_shard_pvc -n sysmgmt-health + kubectl delete pvc/$alert_pvc -n sysmgmt-health + kubectl delete pvc/$thanos_ruler_pvc -n sysmgmt-health + + # Remove the cray-sysmgmt-health-promet-kubelet service. + echo "Deleting cray-sysmgmt-health-kube-p-kubelet service in kube-system namespace." + kubectl delete service/cray-sysmgmt-health-kube-p-kubelet -n kube-system + + # Remove all the existing CRDs (ServiceMonitors, Podmonitors, etc.) 
+ echo "Deleting sysmgmt-health existing CRDs" + for c in $(kubectl get crds -A -o jsonpath='{range .items[?(@.metadata.annotations.controller-gen\.kubebuilder\.io\/version=="v0.2.4")]}{.metadata.name}{"\n"}{end}'); do + kubectl delete crd ${c} + done fi } # sysmgmt_health function call - sysmgmt_health diff --git a/upgrade/scripts/upgrade/util/update-customizations.sh b/upgrade/scripts/upgrade/util/update-customizations.sh index ff3dd532f574..f9986166f10e 100755 --- a/upgrade/scripts/upgrade/util/update-customizations.sh +++ b/upgrade/scripts/upgrade/util/update-customizations.sh @@ -127,40 +127,23 @@ if [[ -z "$(yq r "$c" "spec.network.netstaticips.nmn_ncn_storage_mons")" ]]; the done yq w -i --style=single "$c" spec.kubernetes.services.cray-sysmgmt-health.cephExporter.endpoints '{{ network.netstaticips.nmn_ncn_storage_mons }}' fi -if [[ "$(yq r "$c" "spec.kubernetes.services.cray-sysmgmt-health.prometheus-snmp-exporter.serviceMonitor.enabled")" ]]; then - idx=0 - temp=1 - mon_node=$(yq r "$c" 'spec.kubernetes.services.cray-sysmgmt-health.prometheus-snmp-exporter.serviceMonitor.params.conf.target' | awk '{print $2}') - for node in ${mon_node}; do - yq w -i "$c" "spec.kubernetes.services.cray-sysmgmt-health.prometheus-snmp-exporter.serviceMonitor.params[${idx}].name" "snmp$temp" - yq w -i "$c" "spec.kubernetes.services.cray-sysmgmt-health.prometheus-snmp-exporter.serviceMonitor.params[${idx}].target" "${node}" - idx=$((idx + 1)) - temp=$((temp + 1)) - done -fi - -# Cray-sysmgmt-health -yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.thanosCompactor.resolutionraw = "15d"' -i $c -yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.thanosCompactor.resolution5m = "15d"' -i $c -yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.thanosCompactor.resolution1h = "15d"' -i $c # Kube-prometheus-stack -if [ "$(yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.prometheus-operator' $c)" != null ]; then - if [ "$(yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack' $c)" != null ]; then - yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.prometheus-operator = (.spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack * .spec.kubernetes.services.cray-sysmgmt-health.prometheus-operator)' -i $c +if [ "$(yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack' $c)" != null ]; then + if [ "$(yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack' $c)" != null ]; then + yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack = (.spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack * .spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack)' -i $c fi - yq4 eval 'del(.spec.proxiedWebAppExternalHostnames.customerManagement.[] | select(. 
== "*prometheus-operator*"))' -i $c - yq4 eval ".spec.proxiedWebAppExternalHostnames.customerManagement += \"{{ kubernetes.services['cray-sysmgmt-health']['kube-prometheus-stack'].prometheus.prometheusSpec.externalAuthority }}\"" -i $c - yq4 eval ".spec.proxiedWebAppExternalHostnames.customerManagement += \"{{ kubernetes.services['cray-sysmgmt-health']['kube-prometheus-stack'].alertmanager.alertmanagerSpec.externalAuthority }}\"" -i $c - yq4 eval ".spec.proxiedWebAppExternalHostnames.customerManagement += \"{{ kubernetes.services['cray-sysmgmt-health']['kube-prometheus-stack'].grafana.externalAuthority }}\"" -i $c - yq4 eval ".spec.proxiedWebAppExternalHostnames.customerManagement += \"{{ kubernetes.services['cray-sysmgmt-health']['kube-prometheus-stack'].thanos.thanosSpec.externalAuthority }}\"" -i $c - yq4 eval ".spec.kubernetes.services.cray-kiali.kiali-operator.cr.spec.external_services.grafana.url = \"https://{{ kubernetes.services['cray-sysmgmt-health']['kube-prometheus-stack'].grafana.externalAuthority }}/\"" -i $c - yq4 eval ".spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack = .spec.kubernetes.services.cray-sysmgmt-health.prometheus-operator | del(.spec.kubernetes.services.cray-sysmgmt-health.prometheus-operator)" -i $c - yq4 eval ".spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack.prometheus.prometheusSpec.externalUrl = \"https://{{ kubernetes.services['cray-sysmgmt-health']['kube-prometheus-stack'].prometheus.prometheusSpec.externalAuthority }}/\"" -i $c - yq4 eval ".spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack.alertmanager.alertmanagerSpec.externalUrl = \"https://{{ kubernetes.services['cray-sysmgmt-health']['kube-prometheus-stack'].alertmanager.alertmanagerSpec.externalAuthority }}/\"" -i $c - yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack.thanos.thanosSpec.externalAuthority = "thanos.cmn.{{ network.dns.external }}"' -i $c - yq4 eval ".spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack.thanos.thanosSpec.externalUrl = \"https://{{ kubernetes.services['cray-sysmgmt-health']['kube-prometheus-stack'].thanos.thanosSpec.externalAuthority }}/\"" -i $c - yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack.thanos.s3_endpoint = "{{network.dns.internal_s3 }}"' -i $c + yq4 eval 'del(.spec.proxiedWebAppExternalHostnames.customerManagement[] | select(. 
== "{{ kubernetes.services['\''cray-sysmgmt-health'\'']['\''kube-prometheus-stack'\''].thanos.thanosSpec.externalAuthority }}"))' -i $c + yq4 ".spec.proxiedWebAppExternalHostnames.customerManagement[3] = \"{{ kubernetes.services['cray-sysmgmt-health']['victoria-metrics-k8s-stack'].vmselect.vmselectSpec.externalAuthority }}\"" -i $c + yq4 ".spec.proxiedWebAppExternalHostnames.customerManagement[4] = \"{{ kubernetes.services['cray-sysmgmt-health']['victoria-metrics-k8s-stack'].alertmanager.externalAuthority }}\"" -i $c + yq4 ".spec.proxiedWebAppExternalHostnames.customerManagement[5] = \"{{ kubernetes.services['cray-sysmgmt-health']['victoria-metrics-k8s-stack'].grafana.externalAuthority }}\"" -i $c + yq4 eval ".spec.kubernetes.services.cray-kiali.kiali-operator.cr.spec.external_services.grafana.url = \"https://{{ kubernetes.services['cray-sysmgmt-health']['victoria-metrics-k8s-stack'].grafana.externalAuthority }}/\"" -i $c + yq4 eval ".spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack = .spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack | del(.spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack)" -i $c + yq4 eval ".spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack.vmselect.vmselectSpec.externalUrl = \"https://{{ kubernetes.services['cray-sysmgmt-health']['victoria-metrics-k8s-stack'].vmselect.vmselectSpec.externalAuthority }}/\"" -i $c + yq4 eval ".spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack.alertmanager.externalUrl = \"https://{{ kubernetes.services['cray-sysmgmt-health']['victoria-metrics-k8s-stack'].alertmanager.externalAuthority }}/\"" -i $c + yq4 'del(.spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack.alertmanager.alertmanagerSpec)' -i $c + yq4 'del(.spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack.prometheus)' -i $c + yq4 'del(.spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack.thanos)' -i $c fi #sma-pcim From 5c5b4ee98f54b15cc2c69aba5fe83fcd71c14bf0 Mon Sep 17 00:00:00 2001 From: shreni123 <53111642+shreni123@users.noreply.github.com> Date: Sat, 13 Jul 2024 01:17:31 +0530 Subject: [PATCH 26/37] CASMMON-412: cray-oauth2-proxies upgrade fix due to victoria-metrics upgrade (#5228) Co-authored-by: Rambabu Bolla --- upgrade/scripts/upgrade/util/update-customizations.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/upgrade/scripts/upgrade/util/update-customizations.sh b/upgrade/scripts/upgrade/util/update-customizations.sh index f9986166f10e..bf6d3f59cbd7 100755 --- a/upgrade/scripts/upgrade/util/update-customizations.sh +++ b/upgrade/scripts/upgrade/util/update-customizations.sh @@ -139,6 +139,8 @@ if [ "$(yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus- yq4 ".spec.proxiedWebAppExternalHostnames.customerManagement[5] = \"{{ kubernetes.services['cray-sysmgmt-health']['victoria-metrics-k8s-stack'].grafana.externalAuthority }}\"" -i $c yq4 eval ".spec.kubernetes.services.cray-kiali.kiali-operator.cr.spec.external_services.grafana.url = \"https://{{ kubernetes.services['cray-sysmgmt-health']['victoria-metrics-k8s-stack'].grafana.externalAuthority }}/\"" -i $c yq4 eval ".spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack = .spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack | del(.spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack)" -i $c + yq4 eval 
'.spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack.vmselect.vmselectSpec.externalAuthority = "vmselect.cmn.{{ network.dns.external }}"' -i $c + yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack.alertmanager.externalAuthority = "alertmanager.cmn.{{ network.dns.external }}"' -i $c yq4 eval ".spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack.vmselect.vmselectSpec.externalUrl = \"https://{{ kubernetes.services['cray-sysmgmt-health']['victoria-metrics-k8s-stack'].vmselect.vmselectSpec.externalAuthority }}/\"" -i $c yq4 eval ".spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack.alertmanager.externalUrl = \"https://{{ kubernetes.services['cray-sysmgmt-health']['victoria-metrics-k8s-stack'].alertmanager.externalAuthority }}/\"" -i $c yq4 'del(.spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack.alertmanager.alertmanagerSpec)' -i $c From 544eee31dd5c0366e10c4ceb09fa12a1e081a654 Mon Sep 17 00:00:00 2001 From: Joel Landsteiner <76180635+jsl-hpe@users.noreply.github.com> Date: Tue, 16 Jul 2024 14:04:50 -0500 Subject: [PATCH 27/37] CASMCMS-8894 Include new image management workflow instructions for updating image labels (#5222) * Include new image management workflow instructions for updating image labels * Remove trailing spaces * Separate command from expected output * add more spacing by tripleback ticks. * Apply suggestions from code review `s/bash/toml/` Signed-off-by: Russell Bunch * Update operations/image_management/Image_Management_Workflows.md Signed-off-by: Nathan Rockershousen --------- Signed-off-by: Russell Bunch Signed-off-by: Nathan Rockershousen Co-authored-by: Russell Bunch Co-authored-by: Nathan Rockershousen --- .../Image_Management_Workflows.md | 64 ++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/operations/image_management/Image_Management_Workflows.md b/operations/image_management/Image_Management_Workflows.md index b43f42fa3c4b..56fc10c2f065 100644 --- a/operations/image_management/Image_Management_Workflows.md +++ b/operations/image_management/Image_Management_Workflows.md @@ -1,6 +1,6 @@ # Image Management Workflows -Overview of how to create an image and how to customize and image. +Overview of how to create an image and how to customize an image. The following workflows are intended to be high-level overviews of image management tasks. These workflows depict how services interact with each other during image management and help to provide a quicker and deeper understanding of how the system functions. @@ -9,6 +9,7 @@ The workflows in this section include: * [Create a new image](#create-a-new-image) * [Customize an image](#customize-an-image) +* [Manage image labels](#manage-image-labels) ## Create a new image @@ -172,3 +173,64 @@ Mentioned in this workflow: 1. Upload the new image artifacts to Ceph S3. The new image artifacts are uploaded to Ceph S3. + +## Manage Image Labels + +**Use Case:** The system administrator would like to apply user supplied information about IMS images or remove metadata that has been previously set. + +**Components:** This workflow is based on the interaction of the Image Management Service \(IMS\) with other services after the image build process completes. The information added or removed can be used by separate +APIs or processes for whatever specific purposes they implement for, as it involves specific images. 
Generally, downstream APIs define specific keys and values that can be associated with an image, then perform specific +actions against those image records in a way that is consistent with their API's behavior. Typically, this allows administrators to attach general purpose information about IMS images that will help them manage the lifecycle +of images that IMS maintains. + +**Workflow Overview:** The following sequence of steps occurs during this workflow. + +1. (`ncn-mw#`) Administrator identifies the image to add metadata information to. + + Administrators may already know the image ID in question to label. If not, examining the existing images may be of help. + + ```bash + cray ims images list + ``` + +1. (`ncn-mw#`) Administrators may set a new label for an existing IMS image. + + One label may be changed (added or removed) during each update operation. Existing values for the provided key may be overwritten if already part of the image record. + + ```bash + cray ims images update a506a6f6-54d9-4e5a-9e8d-1fc052d62504 --metadata-operation set --metadata-value value --metadata-key key + ``` + +1. (`ncn-mw#`) Administrators and downstream APIs may obtain the active record for a given image. + + Image metadata information is also available via the `list` command for all images. + + ```bash + cray ims images describe a506a6f6-54d9-4e5a-9e8d-1fc052d62504 + ``` + + Expected output: + + ```toml + arch = "x86_64" + created = "2024-06-27T15:41:22.467177" + id = "a506a6f6-54d9-4e5a-9e8d-1fc052d62504" + [metadata] + key = "value" + ``` + +1. (`ncn-mw#`) Administrators may remove previously set image metadata. + + Downstream APIs and Administrators using the CLI may affect these changes. During `--metadata-operation remove`, users may omit `--metadata-value` command line arguments. 
+ + ```bash + cray ims images update a506a6f6-54d9-4e5a-9e8d-1fc052d62504 --metadata-operation remove --metadata-key key + ``` + + Expected output: + + ```toml + arch = "x86_64" + created = "2024-06-27T15:41:22.467177" + id = "a506a6f6-54d9-4e5a-9e8d-1fc052d62504" + ``` From e613c67056e4d7036d384038b3add168f1c139df Mon Sep 17 00:00:00 2001 From: Chris Spiller <86013738+spillerc-hpe@users.noreply.github.com> Date: Tue, 16 Jul 2024 20:45:52 +0100 Subject: [PATCH 28/37] CASMINST-5556 - Remove outdated and unreferenced NTP procedure (#5223) * CASMINST-5556 - Remove outdated and unreferenced NTP procedure * Fix table of contents --- operations/README.md | 1 - operations/resiliency/NTP_Resiliency.md | 31 ------------------------- 2 files changed, 32 deletions(-) delete mode 100644 operations/resiliency/NTP_Resiliency.md diff --git a/operations/README.md b/operations/README.md index 8aacc77939ba..abc54b6625bb 100644 --- a/operations/README.md +++ b/operations/README.md @@ -385,7 +385,6 @@ HPE Cray EX systems are designed so that system management services \(SMS\) are - [Resilience of System Management Services](resiliency/Resilience_of_System_Management_Services.md) - [Restore System Functionality if a Kubernetes Worker Node is Down](resiliency/Restore_System_Functionality_if_a_Kubernetes_Worker_Node_is_Down.md) - [Recreate `StatefulSet` Pods on Another Node](resiliency/Recreate_StatefulSet_Pods_on_Another_Node.md) -- [NTP Resiliency](resiliency/NTP_Resiliency.md) - [Resiliency Testing Procedure](resiliency/Resiliency_Testing_Procedure.md) ## ConMan diff --git a/operations/resiliency/NTP_Resiliency.md b/operations/resiliency/NTP_Resiliency.md deleted file mode 100644 index 4ab7f0c4a17c..000000000000 --- a/operations/resiliency/NTP_Resiliency.md +++ /dev/null @@ -1,31 +0,0 @@ -# NTP Resiliency - -Synchronize the time on all non-compute nodes \(NCNs\) via Network Time Protocol \(NTP\). Avoid a single point of failure for NTP when testing system resiliency. - -## Prerequisites - -This procedure requires administrative privileges. - -## Procedure - -1. (`ncn#`) Set the date manually if the time on NCNs is off by more than an a few hours. - - For example: - - ```bash - timedatectl set-time "2021-02-19 15:04:00" - ``` - -1. (`pit#`) Configure NTP on the Pre-install Toolkit \(PIT\). - - ```bash - /root/bin/configure-ntp.sh - ``` - -1. (`ncn#`) Sync NTP on all other nodes. - - If more than nine NCNs are in use on the system, update the loop in the following command accordingly. 
- - ```bash - for i in ncn-{w,s}00{1..3} ncn-m00{2..3}; do echo "------$i--------"; ssh $i '/srv/cray/scripts/common/chrony/csm_ntp.py'; done - ``` From 0c98b0382d080ea8c0683edc5a347f2312b31f7b Mon Sep 17 00:00:00 2001 From: Jenkins Date: Wed, 17 Jul 2024 17:25:52 +0000 Subject: [PATCH 29/37] Automated API docs swagger to md conversion (https://jenkins.algol60.net/job/Cray-HPE/job/csm/job/v1.6.0-alpha.58/1/) --- api/sls.md | 56 ------------------------------------------------------ 1 file changed, 56 deletions(-) diff --git a/api/sls.md b/api/sls.md index f7c2332c8177..cd3931b65fa3 100644 --- a/api/sls.md +++ b/api/sls.md @@ -329,14 +329,6 @@ Status Code **200** *xor* -|Name|Type|Required|Restrictions|Description| -|---|---|---|---|---| -|»» *anonymous*|[hardware_comptype_virtual_node](#schemahardware_comptype_virtual_node)|false|none|none| -|»»» NodeType|string|true|none|The role type assigned to this node.| -|»»» nid|integer|false|none|none| - -*xor* - |Name|Type|Required|Restrictions|Description| |---|---|---|---|---| |»» *anonymous*|[hardware_ip_and_creds_optional](#schemahardware_ip_and_creds_optional)|false|none|none| @@ -357,7 +349,6 @@ Status Code **200** |NodeType|Application| |NodeType|Storage| |NodeType|Management| -|NodeType|Management|