From 1092ca90978ae6e0adcc02dd217a7cb56d74b0c5 Mon Sep 17 00:00:00 2001 From: Jenkins Date: Wed, 12 Jun 2024 17:51:59 +0000 Subject: [PATCH 01/37] Automated API docs swagger to md conversion (https://jenkins.algol60.net/job/Cray-HPE/job/csm/job/v1.6.0-alpha.52/1/) --- api/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/api/README.md b/api/README.md index aefbb758e306..587b67ef92b5 100644 --- a/api/README.md +++ b/api/README.md @@ -14,4 +14,3 @@ * [Hardware State Manager API v2](./smd.md) * [Cray STS Token Generator v1](./sts.md) * [TAPMS Tenant Status API v1](./tapms-operator.md) - * [User Access Service v1](./uas-mgr.md) From 72af01d682ab5c51a7dfabe2feba5b96d294f8d0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 14 Jun 2024 14:09:15 -0500 Subject: [PATCH 02/37] CASMINST-6879: Add install-goss-tests.sh (#5148) (cherry picked from commit 3a56fb1d344a9a444de6a1405ef91f73ac6dc7a8) Co-authored-by: Mitch Harding (the weird one) --- install/scripts/install-goss-tests.sh | 235 ++++++++++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100755 install/scripts/install-goss-tests.sh diff --git a/install/scripts/install-goss-tests.sh b/install/scripts/install-goss-tests.sh new file mode 100755 index 000000000000..c21eeb2cccad --- /dev/null +++ b/install/scripts/install-goss-tests.sh @@ -0,0 +1,235 @@ +#!/bin/bash +# +# MIT License +# +# (C) Copyright 2021-2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +# This script is a replacement for the script of the same name in the lib +# directory of the CSM tarballs and repository. + +# This script is run twice during CSM installs. It does different things depending +# on when it is run. + +# It is run the first time from the PIT node, after the other NCNs have been deployed. In this +# case, it does the following: +# 1) Finds the latest versions of select RPMs in the expanded CSM tarball +# 2) Copies the RPMs into a prep subdirectory +# 3) Installs the RPMs onto the other NCNs and the PIT node +# +# It is run for the second time from ncn-m001, after PIT redeploy. 
In this case, it does +# the following: +# 1) Installs the RPMs in the prep subdirectory (populated from the first time the script was +# executed) onto ncn-m001 +# 2) Enables and restarts goss-servers on ncn-m001 + +# Globally disable warning about globbing and word splitting +# shellcheck disable=SC2086 + +set -e + +PITFILE="/etc/pit-release" + +function find_latest_rpm { + # $1 - RPM name prefix (e.g. csm-testing, goss-servers, etc) + local name vpattern rpm_regex1 rpm_regex2 filepath + name="$1" + # The first part of the version will be three .-separated numbers + vpattern="[0-9][0-9]*[.][0-9][0-9]*[.][0-9][0-9]*" + + # After the name and version, there are two ways our RPM may be named: + # * It could have a -, followed by characters we do not care about, ending in .rpm + rpm_regex1="${name}-${vpattern}-[^/]*[.]rpm" + # * Or it could just have .rpm after the name and version + rpm_regex2="${name}-${vpattern}[.]rpm" + + # List all RPM files in the rpm directory + filepath=$(find "$RPMDIR" -type f -name \*.rpm \ + | + # Select only names fitting one of our patterns + grep -E "/(${rpm_regex1}|${rpm_regex2})$" \ + | + # Change each line so first it shows just the RPM filename, followed by a blank space, + # followed by the original full path and filename + sed -e "s#^${RPMDIR}.*/\(${rpm_regex1}\)\$#\1 \0#" -e "s#^${RPMDIR}.*/\(${rpm_regex2}\)\$#\1 \0#" \ + | + # Sort the first field (the RPM filename without path) by version + sort -k1V \ + | + # Choose the last one listed (the one with the highest version) + tail -1 \ + | + # Change the line, removing the RPM filename and space, leaving only the full path and filename + sed 's/^[^ ]* //') + if [[ -z ${filepath} ]]; then + echo "The ${name} RPM was not found at the expected location. Ensure this RPM exists under the '$RPMDIR' directory" >&2 + return 1 + fi + echo "${filepath}" + return 0 +} + +function paths_to_basenames { + local rpm_name_list + while [[ $# -gt 0 ]]; do + rpm_name_list="${rpm_name_list} ${1##*/}" + shift + done + echo "${rpm_name_list}" + return 0 +} + +function err_exit { + while [[ $# -gt 0 ]]; do + echo "$1" >&2 + shift + done + exit 1 +} + +function run_on_pit { + [[ -n ${CSM_RELEASE} || -n ${CSM_PATH} ]] || err_exit 'Please set and export $CSM_PATH or $CSM_RELEASE and try again' + + local MTOKEN STOKEN WTOKEN PREPDIR STORAGE_NCNS K8S_NCNS PREP_RPM_DIR ncn + local STORAGE_RPM_PATHS K8S_RPM_PATHS STORAGE_RPM_BASENAMES K8S_RPM_BASENAMES + local HPE_GOSS_RPM CMSTOOLS_RPM CANU_RPM CSM_TESTING_RPM GOSS_SERVERS_RPM PLATFORM_UTILS_RPM IUF_CLI_RPM + + MTOKEN='ncn-m\w+' + STOKEN='ncn-s\w+' + WTOKEN='ncn-w\w+' + + PITDATA=${PITDATA:-/var/www/ephemeral} + CSM_DIRNAME=${CSM_DIRNAME:-${PITDATA}} + CSM_PATH=${CSM_PATH:-${CSM_DIRNAME}/csm-${CSM_RELEASE}} + RPMDIR=${RPMDIR:-${CSM_PATH}/rpm} + PREPDIR="${PITDATA}/prep" + PREP_RPM_DIR="${PREPDIR}/rpms" + + [[ -d ${CSM_PATH} ]] \ + || err_exit "The csm-${CSM_RELEASE} directory was not found at the expected location." \ + "Please set \$CSM_DIRNAME to the absolute path containing the csm-$CSM_RELEASE directory" + + [[ -d ${RPMDIR} ]] \ + || err_exit "The 'rpm' directory was not found in the base directory of the expanded CSM tarball: ${CSM_PATH}" \ + "Please set \$CSM_PATH to the path of the base directory of the expanded CSM tarball, and verify that it contains the 'rpm' directory." 
+ + [[ -d ${PREPDIR} ]] || err_exit "The 'prep' directory was not found in its expected location: '${PREPDIR}'" + + # It's okay if our RPM prep subdirectory already exists (we'll just delete and recreate it), but if it exists + # and isn't a directory, then that means something other than this script created it, so we should be + # cautious and not automatically delete it. + [[ ! -e ${PREP_RPM_DIR} || -d ${PREP_RPM_DIR} ]] \ + || err_exit "ERROR: '${PREP_RPM_DIR}' already exists but it is not a directory. Move, rename, or delete it and then re-run this script" + + STORAGE_NCNS=$(grep -oE "${STOKEN}" /etc/dnsmasq.d/statics.conf | grep -v m001 | sort -u) + K8S_NCNS=$(grep -oE "(${MTOKEN}|${WTOKEN})" /etc/dnsmasq.d/statics.conf | grep -v m001 | sort -u) + + CANU_RPM=$(find_latest_rpm canu) + CSM_TESTING_RPM=$(find_latest_rpm csm-testing) + GOSS_SERVERS_RPM=$(find_latest_rpm goss-servers) + IUF_CLI_RPM=$(find_latest_rpm iuf-cli) + PLATFORM_UTILS_RPM=$(find_latest_rpm platform-utils) + HPE_GOSS_RPM=$(find_latest_rpm hpe-csm-goss-package) + CMSTOOLS_RPM=$(find_latest_rpm cray-cmstools-crayctldeploy) + + # cmstools RPM is not installed on storage nodes + STORAGE_RPM_PATHS="${HPE_GOSS_RPM} ${CANU_RPM} ${CSM_TESTING_RPM} ${GOSS_SERVERS_RPM} ${IUF_CLI_RPM} ${PLATFORM_UTILS_RPM}" + K8S_RPM_PATHS="${STORAGE_RPM_PATHS} ${CMSTOOLS_RPM}" + + # If the RPM prep subdirectory already exists, remove it and its contents + if [[ -d ${PREP_RPM_DIR} ]]; then + echo "Deleting existing directory: '${PREP_RPM_DIR}'" + rm -rf "${PREP_RPM_DIR}" + [[ ! -e ${PREP_RPM_DIR} ]] || err_exit "ERROR: Still exists even after deleting it: '${PREP_RPM_DIR}'" + fi + + # Create prep subdirectory + echo "Creating directory: '${PREP_RPM_DIR}'" + mkdir -v "${PREP_RPM_DIR}" + + # Copy test RPMs into it + cp -v ${K8S_RPM_PATHS} "${PREP_RPM_DIR}" + + STORAGE_RPM_BASENAMES=$(paths_to_basenames ${STORAGE_RPM_PATHS}) + K8S_RPM_BASENAMES=$(paths_to_basenames ${K8S_RPM_PATHS}) + + # Install the RPMs onto the other NCNs + for ncn in ${STORAGE_NCNS}; do + echo "Installing RPMs on ${ncn}" + scp ${STORAGE_RPM_PATHS} ${ncn}:/tmp/ + # CASMINST-6779: Use rpm instead of zypper to avoid problems caused by inaccessible Zypper repos, since we are + # installing from local files anyway. + # shellcheck disable=SC2029 + ssh ${ncn} "cd /tmp && rpm -Uvh --force ${STORAGE_RPM_BASENAMES} && systemctl enable goss-servers && systemctl restart goss-servers && systemctl daemon-reload && echo systemctl daemon-reload has been run && rm -f ${STORAGE_RPM_BASENAMES}" + done + + for ncn in ${K8S_NCNS}; do + echo "Installing RPMs on ${ncn}" + scp ${K8S_RPM_PATHS} ${ncn}:/tmp/ + # CASMINST-6779: Use rpm instead of zypper to avoid problems caused by inaccessible Zypper repos, since we are + # installing from local files anyway. + # shellcheck disable=SC2029 + ssh ${ncn} "cd /tmp && rpm -Uvh --force ${K8S_RPM_BASENAMES} && systemctl enable goss-servers && systemctl restart goss-servers && systemctl daemon-reload && echo systemctl daemon-reload has been run && rm -f ${K8S_RPM_BASENAMES}" + done + + # The RPMs should have been installed on the PIT at the same time csi was installed. Trust, but verify: + echo "Installing RPMs on PIT if needed" + # CASMINST-6779: Use rpm instead of zypper to avoid problems caused by inaccessible Zypper repos, since we are + # installing from local files anyway. 
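+  # Each "rpm -q <name>" below exits nonzero when that package is not yet installed,
+  # so the "|| rpm -Uvh --force ..." alternative only runs for packages that are missing.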
+ rpm -q canu || rpm -Uvh --force ${CANU_RPM} + rpm -q hpe-csm-goss-package || rpm -Uvh --force ${HPE_GOSS_RPM} + rpm -q csm-testing || rpm -Uvh --force ${CSM_TESTING_RPM} + rpm -q goss-servers || (rpm -Uvh --force ${GOSS_SERVERS_RPM} && systemctl enable goss-servers && systemctl restart goss-servers) + rpm -q platform-utils || rpm -Uvh --force ${PLATFORM_UTILS_RPM} + rpm -q iuf-cli || rpm -Uvh --force ${IUF_CLI_RPM} + systemctl daemon-reload && echo "systemctl daemon-reload has been run" +} + +function run_on_m001 { + local PREP_RPM_DIR + + PREP_RPM_DIR=/metal/bootstrap/prep/rpms + [[ -d ${PREP_RPM_DIR} ]] || err_exit "ERROR: Directory does not exist: '${PREP_RPM_DIR}'" + + echo "Installing RPMs from '${PREP_RPM_DIR}':" + rpm -Uvh --force "${PREP_RPM_DIR}/"*.rpm + + echo "Enabling goss-servers" + systemctl enable goss-servers + + echo "Restarting goss-servers" + systemctl restart goss-servers + + echo "Reloading daemons" + systemctl daemon-reload && echo "systemctl daemon-reload has been run" +} + +if [[ -f ${PITFILE} ]]; then + echo "${PITFILE} exists -- running on PIT node" + run_on_pit +elif [[ ${HOSTNAME} == ncn-m001 ]]; then + echo "Running on ncn-m001 (non-PIT)" + run_on_m001 +else + err_exit "ERROR: This script should only be run from the PIT node or ncn-m001" +fi + +echo PASSED From 27a8dac3b9ab091b4e33bc28a4e36f9214e1368f Mon Sep 17 00:00:00 2001 From: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> Date: Sat, 15 Jun 2024 00:43:27 +0530 Subject: [PATCH 03/37] CASMTRIAGE-7069 Update Reboot_NCNs.md (#5149) Signed-off-by: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> --- operations/node_management/Reboot_NCNs.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operations/node_management/Reboot_NCNs.md b/operations/node_management/Reboot_NCNs.md index bf7709bf24ae..bfdf70d56cca 100644 --- a/operations/node_management/Reboot_NCNs.md +++ b/operations/node_management/Reboot_NCNs.md @@ -349,7 +349,7 @@ Before rebooting NCNs: 1. (`ncn-mw#`) Cordon and drain the node. ```bash - kubectl drain --ignore-daemonsets=true --delete-local-data=true + kubectl drain --ignore-daemonsets=true --delete-emptydir-data ``` There may be pods that cannot be gracefully evicted because of Pod Disruption Budgets (PDB). This will result in messages like the following: @@ -370,7 +370,7 @@ Before rebooting NCNs: Then rerun the `kubectl drain` command, and it should report that the node is drained. ```bash - kubectl drain --ignore-daemonsets=true --delete-local-data=true + kubectl drain --ignore-daemonsets=true --delete-emptydir-data ``` 1. If booting from disk is desired, then [set the boot order](../../background/ncn_boot_workflow.md#setting-boot-order). 
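
For the `kubectl drain` step above, a sketch of the full cordon/drain/uncordon cycle with the updated flag (the node name `ncn-w001` is only an example; substitute the NCN being rebooted):

```bash
kubectl drain ncn-w001 --ignore-daemonsets=true --delete-emptydir-data

# After the NCN has been rebooted and rejoins the cluster, allow workloads to be scheduled on it again
kubectl uncordon ncn-w001
kubectl get node ncn-w001
```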
From 3f810aa5d7e72999c562ece6ca69351869be2e49 Mon Sep 17 00:00:00 2001 From: Shane Unruh <87081771+shunr-hpe@users.noreply.github.com> Date: Tue, 18 Jun 2024 13:12:28 -0600 Subject: [PATCH 04/37] CASMHMS-6225 Changed the remove node procedure to allow for already removed nodes (#5166) Changed the remove node procedure to allow for already removed nodes CASMHMS-6225 --- .../node_management/remove_standard_rack_node.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/scripts/operations/node_management/remove_standard_rack_node.sh b/scripts/operations/node_management/remove_standard_rack_node.sh index 5fead528c8f1..4d659efeffcb 100755 --- a/scripts/operations/node_management/remove_standard_rack_node.sh +++ b/scripts/operations/node_management/remove_standard_rack_node.sh @@ -64,7 +64,17 @@ echo echo "==================================================" echo "Removing BMC Event subscriptions" echo "==================================================" -/usr/share/doc/csm/scripts/operations/node_management/delete_bmc_subscriptions.py "${BMC_XNAME}" +EXIT_CODE=0 +/usr/share/doc/csm/scripts/operations/node_management/delete_bmc_subscriptions.py "${BMC_XNAME}" || EXIT_CODE=$? +if [[ $EXIT_CODE -ne 0 ]]; then + if [[ -z ${TOKEN+x} ]]; then + # delete_bmc_subscriptions.py failed because the TOKEN was not set + exit $EXIT_CODE + fi + echo "The redfish subscriptions were not removed from ${BMC_XNAME}. Check the messages above for the specific errors." + echo "This could be because the node has already been physically removed." + echo "The subscriptions will need to be cleaned up when the node is added back, if it is added in a new xname location, and is on a system running CSM 1.4 or older." +fi echo echo "==================================================" From 6464e4581faa8e83420ca2660cfb8b4ae928a346 Mon Sep 17 00:00:00 2001 From: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> Date: Wed, 19 Jun 2024 00:43:46 +0530 Subject: [PATCH 05/37] CASMTRIAGE-7078 Update Rebuild_NCNs.md (#5155) * Update Rebuild_NCNs.md Signed-off-by: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> * Update Rebuild_NCNs.md Signed-off-by: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> --------- Signed-off-by: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> --- operations/node_management/Rebuild_NCNs/Rebuild_NCNs.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/operations/node_management/Rebuild_NCNs/Rebuild_NCNs.md b/operations/node_management/Rebuild_NCNs/Rebuild_NCNs.md index dbf178f6971e..3f8498e1657f 100644 --- a/operations/node_management/Rebuild_NCNs/Rebuild_NCNs.md +++ b/operations/node_management/Rebuild_NCNs/Rebuild_NCNs.md @@ -87,7 +87,9 @@ export CSM_ARTI_DIR="/etc/cray/upgrade/csm/csm-${CSM_RELEASE}/tarball/csm-${CSM_ > > - If the `/etc/cray/upgrade/csm/` directory is empty, create an empty directory at the same path. Download and extract CSM tarball to that directory. > - Update the value of `CSM_ARTI_DIR` with the newly created directory above. +> - Download and install/upgrade the **latest** documentation on `ncn-m001` at path `/root/`. See [Check for Latest Documentation](../../../update_product_stream/README.md#check-for-latest-documentation). 
> - Ensure the `/etc/cray/upgrade/csm/` directory is `ceph` mount using the command below (its output should show `ceph` as the type): + ```bash mount | grep /etc/cray/upgrade/csm ``` From cf2d339f5c2d6885e527da6187cc12e3d6353bd7 Mon Sep 17 00:00:00 2001 From: Nick Davidson <86747615+ndavidson-hpe@users.noreply.github.com> Date: Tue, 18 Jun 2024 13:16:11 -0600 Subject: [PATCH 06/37] CASMTRIAGE-7061: Add new known issue for keycloak (#5164) --- troubleshooting/README.md | 1 + .../Keycloak_Error_Cannot_read_properties.md | 23 +++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 troubleshooting/known_issues/Keycloak_Error_Cannot_read_properties.md diff --git a/troubleshooting/README.md b/troubleshooting/README.md index 0d84134427d9..47feb8020f6a 100644 --- a/troubleshooting/README.md +++ b/troubleshooting/README.md @@ -43,6 +43,7 @@ to the exiting problem seen into the existing search. (The example searches for * [Software Management Services health check](known_issues/sms_health_check.md) * [QLogic driver crash](known_issues/qlogic_driver_crash.md) * [Nexus Fails Authentication with Keycloak Users](known_issues/Nexus_Fail_Authentication_with_Keycloak_Users.md) +* [Keycloak Error "Cannot read properties" in Web UI](known_issues/Keycloak_Error_Cannot_read_properties.md) * [Gigabyte BMC Missing Redfish Data](known_issues/Gigabyte_BMC_Missing_Redfish_Data.md) * [`admin*client-auth` Not Found](known_issues/admin_client_auth_not_found.md) * [Ceph OSD latency](known_issues/ceph_osd_latency.md) diff --git a/troubleshooting/known_issues/Keycloak_Error_Cannot_read_properties.md b/troubleshooting/known_issues/Keycloak_Error_Cannot_read_properties.md new file mode 100644 index 000000000000..52b31bb86598 --- /dev/null +++ b/troubleshooting/known_issues/Keycloak_Error_Cannot_read_properties.md @@ -0,0 +1,23 @@ +# Keycloak Error "Cannot read properties" in Web UI + +There is a known error that occurs after upgrading CSM from 1.4 to CSM 1.5.0 and later. This error +is shown when looking at users in Keycloak's web UI. The error occurs due to a change in how the LDAP +configuration is done in earlier versions of Keycloak. This should not occur on fresh installs. The +error occurs when looking at the user lists on Keycloak Web UI, and once looking at the page leaves a +error message on the page stating "Cannot read properties of undefined (reading 0)" + +## Fix + +To recover from this situation, perform the following procedure. + +1. After seeing the error page you will need to refresh the page and ensure you are on the correct realm again + +1. Go to the `User Federation` section + +1. Click on the LDAP configuration page + +1. Click on the switch before `Enabled` to disable the LDAP configuration + +1. Click on `Disable` on the pop-up to disable the configuration + +1. 
Click on the switch again to enable the LDAP configuration From 5416d6e0d41dcaf6fb90188775e69b652269b843 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 18 Jun 2024 14:20:08 -0500 Subject: [PATCH 07/37] CASMHMS-6206 - Procedure for updating vault with Paradise user/passwords (#5170) * Docs for chaning Paradise Password and update to FW Updates (cherry picked from commit f3590ee64d357f8f6d1962227d7abbe27a504f8b) * Spell check / lint updates (cherry picked from commit c8de805219366690d318277be814ecbe5f61a68c) * More Lint / spell updates (cherry picked from commit 98dd3d27a966d25d54376808e0b480e24f366648) * Lint / spell (cherry picked from commit 1dbc07ab412bf23d68c4a5201b8526c58738c869) * Update operations/firmware/FAS_Paradise.md Signed-off-by: Nathan Rockershousen --------- Signed-off-by: Nathan Rockershousen Co-authored-by: Michael Buchmann Co-authored-by: Nathan Rockershousen --- operations/firmware/FAS_Paradise.md | 33 +++++++++++++- .../Replacing_Foxconn_User_Pass.md | 43 +++++++++++++++++++ 2 files changed, 74 insertions(+), 2 deletions(-) create mode 100644 operations/node_management/Replacing_Foxconn_User_Pass.md diff --git a/operations/firmware/FAS_Paradise.md b/operations/firmware/FAS_Paradise.md index 4e3978cf3c1b..d7bd32c35f99 100644 --- a/operations/firmware/FAS_Paradise.md +++ b/operations/firmware/FAS_Paradise.md @@ -27,6 +27,8 @@ The following targets can be updated with FAS on Paradise Nodes: ## Update Paradise `bmc_active` procedure +NOTE: If a reset of the BMC is required, follow [this procedure](#reset-bmc) before and after the update of each node. *Only do this if required!* + The [`FASUpdate.py script`](FASUpdate_Script.md) can be used to update `bmc_active` - use recipe `foxconn_nodeBMC_bmc.json` The BMC will reboot after the update is complete. @@ -95,7 +97,7 @@ To update using a JSON file and the Cray CLI, use this example JSON file and fol To do an AC power cycle, run the following command (`ncn#`). ```bash -ssh $(xname) "ipmitool raw 0x38 0x02" +ssh admin@$(xname) "ipmitool raw 0x38 0x02" ``` The [`FASUpdate.py script`](FASUpdate_Script.md) can be used to update `erot_active` - use recipe `foxconn_nodeBMC_erot.json` @@ -130,7 +132,7 @@ To update using a JSON file and the Cray CLI, use this example JSON file and fol To do an AC power cycle, run the following command (`ncn#`). ```bash -ssh $(xname) "ipmitool raw 0x38 0x02" +ssh admin@$(xname) "ipmitool raw 0x38 0x02" ``` The [`FASUpdate.py script`](FASUpdate_Script.md) can be used to update `fpga_active` - use recipe `foxconn_nodeBMC_fpga.json` @@ -370,3 +372,30 @@ If the firmware file you need is not listed, run the following command to copy t ```bash /usr/share/doc/csm/scripts/operations/firmware/upload_foxconn_images_tftp.py ``` + +## Reset BMC + +This will reset the BMC to factory resets - including resetting the BMC username and password. +*Only do this if required!* + +Before BMC firmware update (`ncn#`): + +The nodes must be **OFF** before updating BMC (when doing a reset) + +```bash +ssh admin@$(xname) 'fw_setenv openbmconce "factory-reset"' +``` + +**Update BMC firmware using one of the methods above** +NOTE: If the password changes after the boot of BMC, FAS will no longer be able to verify the update and will fail after the time limit. 
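+
+One way to check whether the BMC credential stored in Vault still works after the update (a sketch only; `$(xname)` follows the same convention used in the commands above) is a simple query over SSH:
+
+```bash
+ssh admin@$(xname) 'ipmitool mc info'
+```
+
+If this prompts for an unexpected password, follow the steps below to set the BMC password back to the value stored in Vault.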
+ +After firmware update(`ncn#`): + +If the password changed to something other than the what is stored in vault, update the BMC password: + +```bash +ssh admin@$(xname) 'ipmitool user set password 1 "password"' +``` + +Boot the node. + diff --git a/operations/node_management/Replacing_Foxconn_User_Pass.md b/operations/node_management/Replacing_Foxconn_User_Pass.md new file mode 100644 index 000000000000..66852c9fd095 --- /dev/null +++ b/operations/node_management/Replacing_Foxconn_User_Pass.md @@ -0,0 +1,43 @@ +# Replacing `Foxconn` Username and Passwords in Vault + +`Foxconn` (Paradise) nodes may be shipped with a different default username and password then the system password. +Because of the difference in user/password, these nodes will not be able to be discovered. +Vault needs to be updated with the `Foxconn` username and password using the `FoxconnUserPass.py` script or manually. + +## Procedure using the `FoxconnUserPass.py` script + +1. (`ncn-mw#`) Set up API token. + + ```bash + export TOKEN=$(curl -k -s -S -d grant_type=client_credentials -d client_id=admin-client -d client_secret=$(kubectl get secrets admin-client-auth -o jsonpath='{.data.client-secret}' | base64 -d) https://api-gw-service-nmn.local/keycloak/realms/shasta/protocol/openid-connect/token | jq -r '.access_token') + ``` + +1. (`ncn-mw#`) Set helper variable. + + ```bash + DOCS_DIR=/usr/share/doc/csm/scripts + ``` + +1. (`ncn-mw#`) Run the `Foxconn` update script + + ```bash + $DOCS_DIR/hardware_state_manager/FoxconnUserPass.py + ``` + + This will ask for the BMC username and password for the Paradise nodes. + The scirpt will look for undiscovered nodes, if it finds a `Foxconn` node, update vault with correct credentials. + +1. (`ncn-mw#`) Wait 10+ minutes for changes to take affect and nodes to be discovered. To check nodes which have failed to be discovered: + + ```bash + cray hsm inventory redfishEndpoints list --format json | jq '.[] | .[] | select (.DiscoveryInfo.LastDiscoveryStatus!="DiscoverOK")' + ``` + +## Manual procedure to update credentials in vault + +1. 
(`ncn-mw#`) Use the Cray CLI to update vault through HSM (replace `BMC_xname` with the xname of the BMC, `Foxconn_user` with the `Foxconn` default username, and `Foxconn_pass` with the `Foxconn` default password): + NOTE: `BMC_xname` needs to be in the line twice + + ```bash + cray hsm inventory redfishEndpoints update BMC_xname -id BMC_xname --user Foxconn_user --password Foxconn_pass + ``` From f91da084d36a92cda545ce75bfb257dd86d85799 Mon Sep 17 00:00:00 2001 From: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> Date: Wed, 19 Jun 2024 00:52:05 +0530 Subject: [PATCH 08/37] CASMTRIAGE-7081 Update Access_the_Keycloak_User_Management_UI.md (#5160) * Update Access_the_Keycloak_User_Management_UI.md Signed-off-by: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> * Update Access_the_Keycloak_User_Management_UI.md Signed-off-by: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> * Update operations/security_and_authentication/Access_the_Keycloak_User_Management_UI.md Signed-off-by: Nathan Rockershousen --------- Signed-off-by: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> Signed-off-by: Nathan Rockershousen Co-authored-by: Nathan Rockershousen --- .../Access_the_Keycloak_User_Management_UI.md | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/operations/security_and_authentication/Access_the_Keycloak_User_Management_UI.md b/operations/security_and_authentication/Access_the_Keycloak_User_Management_UI.md index c89f2da43b54..b481c837b02b 100644 --- a/operations/security_and_authentication/Access_the_Keycloak_User_Management_UI.md +++ b/operations/security_and_authentication/Access_the_Keycloak_User_Management_UI.md @@ -7,7 +7,7 @@ See [Create Internal User Accounts in the Keycloak Shasta Realm](Create_Internal - This procedure uses `SYSTEM_DOMAIN_NAME` as an example for the DNS name of the non-compute node \(NCN\). Replace this name with the actual NCN's DNS name while executing this procedure. - This procedure assumes that the password for the Keycloak `admin` account is known. The Keycloak password is set during the software installation process. - - (`ncn-mw#`) The password can be obtained with the following command: + (`ncn-mw#`) The password can be obtained with the following command: ```bash kubectl get secret -n services keycloak-master-admin-auth --template={{.data.password}} | base64 --decode @@ -19,14 +19,21 @@ See [Create Internal User Accounts in the Keycloak Shasta Realm](Create_Internal The following is an example URL for a system: `https://auth.cmn.system1.us.cray.com/keycloak/` + The value of `SYSTEM_DOMAIN_NAME` for a given cluster is obtained as shown in the following example: + + ```bash + # echo $SYSTEM_DOMAIN + system1.us.cray.com + ``` + The browser may return an error message similar to the following when `auth.cmn.SYSTEM_DOMAIN_NAME/keycloak` is launched for the first time: - ```text - This Connection Is Not Private + ```text + This Connection Is Not Private - This website may be impersonating "hostname" to steal your personal or financial information. - You should go back to the previous page. - ``` + This website may be impersonating "hostname" to steal your personal or financial information. + You should go back to the previous page. + ``` See [Make HTTPS Requests from Sources Outside the Management Kubernetes Cluster](Make_HTTPS_Requests_from_Sources_Outside_the_Management_Kubernetes_Cluster.md) for more information on getting the Certificate Authority \(CA\) certificate on the system. 
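
As an optional check before opening the browser (a sketch only; the certificate file name is hypothetical, and `SYSTEM_DOMAIN_NAME` must be replaced with the real domain), confirm that the Keycloak endpoint answers over HTTPS once the CA certificate is available:

```bash
# certificate_authority.crt is a placeholder for wherever the CA certificate was saved
curl --cacert certificate_authority.crt -s -o /dev/null -w '%{http_code}\n' \
    "https://auth.cmn.SYSTEM_DOMAIN_NAME/keycloak/"
```

An HTTP status code such as `200` or a `30x` redirect indicates that DNS, ingress, and the CA certificate are working; a `000` result usually means a TLS or connection problem.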
From c27a0aca423f8ace833876506b32566ffc370428 Mon Sep 17 00:00:00 2001 From: Rambabu Bolla Date: Wed, 19 Jun 2024 00:53:21 +0530 Subject: [PATCH 09/37] =?UTF-8?q?CASMMON-394:=20CSM1.5.1:=20"grok-exporter?= =?UTF-8?q?"=20pod=20status=20showing=20as=20"Contain=E2=80=A6=20(#5157)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * CASMMON-394: CSM1.5.1: "grok-exporter" pod status showing as "ContainerStatusUnknown" and "Error" * Update operations/system_management_health/Grok-Exporter_Error.md Signed-off-by: Nathan Rockershousen --------- Signed-off-by: Nathan Rockershousen Co-authored-by: Nathan Rockershousen --- operations/README.md | 2 +- .../Grok-Exporter_Error.md | 25 +++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 operations/system_management_health/Grok-Exporter_Error.md diff --git a/operations/README.md b/operations/README.md index e483f5b965a1..dbaef7073ab7 100644 --- a/operations/README.md +++ b/operations/README.md @@ -36,7 +36,6 @@ The following administrative topics can be found in this guide: - [MetalLB in BGP-mode](#metallb-in-bgp-mode) - [Spire](#spire) - [Update firmware with FAS](#update-firmware-with-fas) -- [User Access Service (UAS)](#user-access-service-uas) - [System Admin Toolkit (SAT)](#system-admin-toolkit-sat) - [Install and Upgrade Framework (IUF)](#install-and-upgrade-framework-iuf) - [Backup and recovery](#backup-and-recovery) @@ -466,6 +465,7 @@ confident that a lack of issues indicates the system is operating normally. - [Grafterm](system_management_health/Grafterm.md) - [Remove Kiali](system_management_health/Remove_Kiali.md) - [`prometheus-kafka-adapter` errors during installation](system_management_health/Prometheus_Kafka_Error.md) +- [`grok-exporter` errors during installation](system_management_health/Grok-Exporter_Error.md) - [Troubleshoot Prometheus Alerts](system_management_health/Troubleshoot_Prometheus_Alerts.md) - [Configure UAN Node Exporter](system_management_health/uan_node_exporter_configs.md) diff --git a/operations/system_management_health/Grok-Exporter_Error.md b/operations/system_management_health/Grok-Exporter_Error.md new file mode 100644 index 000000000000..6e2665f40256 --- /dev/null +++ b/operations/system_management_health/Grok-Exporter_Error.md @@ -0,0 +1,25 @@ +# `grok-exporter` pod status showing as `ContainerStatusUnknown` Error + +## Symptom + +On CSM upgrade, the grok-exporter pod log has errors similar to the following: + +```text +The node was low on resource: ephemeral-storage. Container grok-exporter was using 127200Ki, which exceeds its request of 0. +``` + +## Solution + +This Kafka service does not exist, because the [System Monitoring Application (SMA)](../../glossary.md#system-monitoring-application-sma) +has not been installed yet. This causes the above errors for retry to be logged. Prometheus can operate without SMA Kafka and it will +periodically retry the connection to Kafka. These errors will be logged until SMA is installed. Therefore, if they are seen before SMA is +installed, then disregard them. + +The root file system on master is at more than 80% but keeps hitting the threshold to raise `NodeHasDiskPressure`(85%) which causes the +node to then attempt to reclaim ephemeral-storage. 
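+
+To confirm the condition before cleaning up (a sketch only; run it on the affected master node), check root filesystem usage and whether Kubernetes currently reports disk pressure for the node:
+
+```bash
+df -h /
+kubectl describe node "$(hostname)" | grep -i diskpressure
+```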
+ +Increase/clean the root filesystem and delete the grok exporter pod as follows: + +```bash +kubectl delete pod -l app=grok-exporter -n sysmgmt-health +``` From eec5a34eb69f938c269d33d6e464734af47b7298 Mon Sep 17 00:00:00 2001 From: Srinivas-Anand-HPE <119280543+Srinivas-Anand-HPE@users.noreply.github.com> Date: Wed, 19 Jun 2024 01:15:37 +0530 Subject: [PATCH 10/37] =?UTF-8?q?CASMTRIAGE-6990=20update=20the=20IUF=20ma?= =?UTF-8?q?nual=20configuration=20instructions=20UAS=20and=20Badger=20conf?= =?UTF-8?q?i=E2=80=A6=20(#5142)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit update the IUF manual configuration instructions UAS and Badger configurations to match the network settings for SLURM and/or PBS Pro Co-authored-by: Don Bahls <114519367+don-bahls-hpe@users.noreply.github.com> Signed-off-by: Srinivas-Anand-HPE <119280543+Srinivas-Anand-HPE@users.noreply.github.com> update the IUF manual configuration instructions UAS and Badger configurations to match the network settings for SLURM and/or PBS Pro --- operations/iuf/workflows/configuration.md | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/operations/iuf/workflows/configuration.md b/operations/iuf/workflows/configuration.md index 8b7d7c58ed30..bbaa36baf438 100644 --- a/operations/iuf/workflows/configuration.md +++ b/operations/iuf/workflows/configuration.md @@ -203,11 +203,18 @@ required for initial installation scenarios. - Configure SAT authentication via `sat auth` - Generate SAT S3 credentials - Configure system revision information via `sat setrev` -- UAS - - Configure UAS network settings - - The network settings for UAS must match the WLM to allow job submission from UAIs -- Badger - - Update CSM Diags network attachment definition +- SLURM + - UAS + - Configure UAS network settings + - The network settings for UAS must match the SLURM WLM to allow job submission from UAIs + - CSM Diags + - Update CSM Diags network attachment definition +- PBS Pro + - UAS + - Configure UAS network settings + - The network settings for UAS must match the PBS Pro WLM to allow job submission from UAIs + - CSM Diags + - Update CSM Diags network attachment definition Once this step has completed: From ee3230ac96d66add282dca554b83f826a314e8d2 Mon Sep 17 00:00:00 2001 From: Michael Tupitsyn Date: Tue, 25 Jun 2024 12:02:53 -0700 Subject: [PATCH 11/37] CASMTRIAGE-7092 Workaround for inconsistent skopeo image name shortcuts (#5173) It looks like podman works differently with container image aliases on different systems. The `podman load -i skopeo.tar` creates image named `skopeo:xxx`, which resolves as shortcut to `docker.io/library/skopeo.xxx`. However, command `podman run skopeo.xxx` may try to run `docker.io/library/skopeo.xxx` or `quauy.io/skopeo.xxx`, it appears to be unpredictable. The fix is to capture actual image name from `podman load` output and use it in `podman run`. We actually use this technique in hpc-shastarelm-release (load-vendor-image procedure). 
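
A minimal sketch of the pattern applied below (the paths mirror the prerequisites script; the trailing `--version` invocation is only an example of using the captured name):

```bash
# Capture the image reference podman actually registered instead of guessing the shortcut
SKOPEO_IMAGE=$(podman load -q -i "${CSM_ARTI_DIR}/vendor/skopeo.tar" 2> /dev/null | sed -e 's/^.*: //')
podman run --rm "${SKOPEO_IMAGE}" --version
```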
--- upgrade/scripts/upgrade/prerequisites.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/upgrade/scripts/upgrade/prerequisites.sh b/upgrade/scripts/upgrade/prerequisites.sh index a8f87ccfc163..f35d5172367f 100755 --- a/upgrade/scripts/upgrade/prerequisites.sh +++ b/upgrade/scripts/upgrade/prerequisites.sh @@ -523,14 +523,14 @@ if [[ ${state_recorded} == "0" && $(hostname) == "${PRIMARY_NODE}" ]]; then fi set -e - # Skopeo image is stored as "skopeo:csm-${CSM_RELEASE}" - podman load -i "${CSM_ARTI_DIR}/vendor/skopeo.tar" + # Skopeo image is stored as "skopeo:csm-${CSM_RELEASE}", which may resolve to docker.io/lirary/skopeo or quay.io/skopeo, depending on configured shortcuts + SKOPEO_IMAGE=$(podman load -q -i "${CSM_ARTI_DIR}/vendor/skopeo.tar" 2> /dev/null | sed -e 's/^.*: //') nexus_images=$(yq r -j "${CSM_MANIFESTS_DIR}/platform.yaml" 'spec.charts.(name==cray-precache-images).values.cacheImages' | jq -r '.[] | select( . | contains("nexus"))') worker_nodes=$(grep -oP "(ncn-w\d+)" /etc/hosts | sort -u) while read -r nexus_image; do echo "Uploading $nexus_image into Nexus ..." podman run --rm -v "${CSM_ARTI_DIR}/docker":/images \ - "skopeo:csm-${CSM_RELEASE}" \ + "${SKOPEO_IMAGE}" \ --override-os=linux --override-arch=amd64 \ copy \ --remove-signatures \ From 810c5ae40aada0335623716309d9bf32244ec3f0 Mon Sep 17 00:00:00 2001 From: Shane Unruh <87081771+shunr-hpe@users.noreply.github.com> Date: Tue, 25 Jun 2024 13:03:49 -0600 Subject: [PATCH 12/37] CASMHMS-5864 Changed subscription removal order in remove blade doc (#5176) In the documentation on how to remove a liquid cooled blade, this moves the step to clear the BMC subscriptions before the step to disable the redfish endpoint. --- ...ing_a_Liquid-cooled_blade_from_a_System.md | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/operations/node_management/Removing_a_Liquid-cooled_blade_from_a_System.md b/operations/node_management/Removing_a_Liquid-cooled_blade_from_a_System.md index cb225137f844..5013657299c3 100644 --- a/operations/node_management/Removing_a_Liquid-cooled_blade_from_a_System.md +++ b/operations/node_management/Removing_a_Liquid-cooled_blade_from_a_System.md @@ -36,16 +36,7 @@ This procedure will remove a liquid-cooled blades from an HPE Cray EX system. cray bos v2 sessions create --template-name $BOS_TEMPLATE --operation shutdown --limit x9000c3s0b0n0,x9000c3s0b0n1,x9000c3s0b1n0,x9000c3s0b1n1 ``` -### 2. Disable the Redfish endpoints for the nodes - -1. (`ncn-mw#`) Temporarily disable the Redfish endpoints for `NodeBMCs` present in the blade. - - ```bash - cray hsm inventory redfishEndpoints update --enabled false x9000c3s0b0 --id x9000c3s0b0 - cray hsm inventory redfishEndpoints update --enabled false x9000c3s0b1 --id x9000c3s0b1 - ``` - -### 3. Clear Redfish event subscriptions from BMCs on the blade +### 2. Clear Redfish event subscriptions from BMCs on the blade 1. (`ncn-mw#`) Set the environment variable `SLOT` to the blade's location. @@ -76,6 +67,15 @@ This procedure will remove a liquid-cooled blades from an HPE Cray EX system. Successfully deleted https://x3000c0s9b0/redfish/v1/EventService/Subscriptions/1 ``` +### 3. Disable the Redfish endpoints for the nodes + +1. (`ncn-mw#`) Temporarily disable the Redfish endpoints for `NodeBMCs` present in the blade. 
+ + ```bash + cray hsm inventory redfishEndpoints update --enabled false x9000c3s0b0 --id x9000c3s0b0 + cray hsm inventory redfishEndpoints update --enabled false x9000c3s0b1 --id x9000c3s0b1 + ``` + ### 4. Clear the node controller settings 1. (`ncn-mw#`) Remove the system-specific settings from each node controller on the blade. From f1f41879aa5a727afbb8bb65bd2cf48ac26e2638 Mon Sep 17 00:00:00 2001 From: David Laine <77020169+dlaine-hpe@users.noreply.github.com> Date: Tue, 25 Jun 2024 14:04:47 -0500 Subject: [PATCH 13/37] CASMCMS-9028 - clarify rbd instructions for an IMS remote build node. (#5177) --- .../Configure_a_Remote_Build_Node.md | 38 ++++++++++++------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/operations/image_management/Configure_a_Remote_Build_Node.md b/operations/image_management/Configure_a_Remote_Build_Node.md index f720adc4d58c..af4dcbcbc580 100644 --- a/operations/image_management/Configure_a_Remote_Build_Node.md +++ b/operations/image_management/Configure_a_Remote_Build_Node.md @@ -40,6 +40,8 @@ the K8S pods. There are two primary reasons to choose to run jobs on a remote bu run on the native architecture of the remote node. Running `aarch64` image builds on an `aarch64` remote node can see over a 10 fold performance increase versus running the same job under emulation. +Multiple remote build nodes may be created in any mix of architectures. + Any job with an architecture matching a defined remote build node will be run remotely with no other changes needed. If there are multiple remote build nodes with the same architecture, there is a basic load balancing algorithm in place to spread the workload between all active remote build nodes. @@ -47,8 +49,8 @@ algorithm in place to spread the workload between all active remote build nodes. When a new IMS job is created, the defined remote build nodes are checked to ensure SSH access is available and the required software is present on the node. If either of these checks fail, the node will not be used for the new job. If all matching remote nodes fail this check, the job will be created to run within the -K8S environment as a standard local job. There is output in the `cray-ims` pod that will indicate why defined -remote nodes are not being used if these checks fail. +K8S environment as a standard local job. There is output in the `cray-ims` pod log that will indicate why +defined remote nodes are not being used if these checks fail. See [Troubleshoot Remote Build Node](Troubleshoot_Remote_Build_Node.md) for issues running remote jobs. @@ -128,7 +130,7 @@ used to work with images, or if it can still run compute jobs while building ima ### Create a barebones IMS builder image If there is no existing compute image to boot a node with, one can be created based on the barebones -image that is installed with CSM. +image that is installed with CSM. This image may be used to boot multiple remote build nodes. 1. (`ncn-mw#`) Find the latest CSM install on the system. @@ -213,7 +215,7 @@ image that is installed with CSM. Expected output will be something similar to: - ```json + ```json { "last_updated": "2024-04-23T16:44:55Z", "layers": [ @@ -348,16 +350,16 @@ image that is installed with CSM. 
{ "boot_sets": { "compute": { - "arch": "X86", - "etag": "9bbdebd4e51f32a2db8f8dd3e6124166", - "kernel_parameters": "ip=dhcp quiet spire_join_token=${SPIRE_JOIN_TOKEN} root=live:s3://boot-images/f6d9cfc7-9291-4c46-8350-c252b919d396/rootfs nmd_data=url=s3://boot-images/f6d9cfc7-9291-4c46-8350-c252b919d396/rootfs,etag=9bbdebd4e51f32a2db8f8dd3e6124166", - "node_roles_groups": [ - "Compute" - ], - "path": "s3://boot-images/f6d9cfc7-9291-4c46-8350-c252b919d396/manifest.json", - "rootfs_provider": "", - "rootfs_provider_passthrough": "", - "type": "s3" + "arch": "X86", + "etag": "9bbdebd4e51f32a2db8f8dd3e6124166", + "kernel_parameters": "ip=dhcp quiet spire_join_token=${SPIRE_JOIN_TOKEN} root=live:s3://boot-images/f6d9cfc7-9291-4c46-8350-c252b919d396/rootfs nmd_data=url=s3://boot-images/f6d9cfc7-9291-4c46-8350-c252b919d396/rootfs,etag=9bbdebd4e51f32a2db8f8dd3e6124166", + "node_roles_groups": [ + "Compute" + ], + "path": "s3://boot-images/f6d9cfc7-9291-4c46-8350-c252b919d396/manifest.json", + "rootfs_provider": "", + "rootfs_provider_passthrough": "", + "type": "s3" } }, "name": "bos_ims_remote_node", @@ -386,6 +388,14 @@ directly into the IMS builder node. Below is a procedure to provide the IMS builder node with additional storage. +NOTE: The Ceph storage described below has several important characteristics to keep in mind: + +* This RBD device is created globally. +* Each RBD device will still exist after the remote build node is rebooted. +* Each RBD device must have a unique name, but may be re-used after the node is rebooted. +* This type of RBD device may only be mounted on one node - one must be created for each remote build node. +* If the remote build node is rebooted, the RBD device must be manually mounted again. + 1. Set an environment variable for the xname of the remote build node. ```bash From 1520fb697568e82086b0287ccebb732e9fb44dc9 Mon Sep 17 00:00:00 2001 From: Srinivas-Anand-HPE <119280543+Srinivas-Anand-HPE@users.noreply.github.com> Date: Wed, 26 Jun 2024 00:41:16 +0530 Subject: [PATCH 14/37] =?UTF-8?q?CASMTRIAGE-7055:=20Check=20for=20the=20la?= =?UTF-8?q?test=20docs-csm=20before=20starting=20the=20up=E2=80=A6=20(#518?= =?UTF-8?q?0)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * CASMTRIAGE-7055: Check for the latest docs-csm before starting the upgrade * Style check Signed-off-by: Russell Bunch * Fix indentation Signed-off-by: Russell Bunch * Spellcheck Signed-off-by: Russell Bunch --------- Signed-off-by: Russell Bunch Co-authored-by: Russell Bunch --- operations/iuf/workflows/preparation.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/operations/iuf/workflows/preparation.md b/operations/iuf/workflows/preparation.md index 90411a380639..272e46cc548e 100644 --- a/operations/iuf/workflows/preparation.md +++ b/operations/iuf/workflows/preparation.md @@ -31,9 +31,15 @@ This section defines environment variables and directory content that is used th mkdir -p "${ACTIVITY_DIR}" "${MEDIA_DIR}" "${ADMIN_DIR}" ``` -Once this step has completed: + Once this step has completed: -- Environment variables have been set and required IUF directories have been created + - Environment variables have been set and required IUF directories have been created + +1. Ensure that the + [latest version of `docs-csm`](https://github.com/Cray-HPE/docs-csm/blob/release/1.6/update_product_stream/README.md#check-for-latest-documentation) + is installed for the target CSM version being installed or upgraded. 
+ + For example: when upgrading from CSM version 1.5.0 to version 1.5.1, install `docs-csm-1.5.1.noarch` ## 2. Use of `iuf activity` From 8b7fdd68f397a8971b39cc9fa2724f0ae92830d5 Mon Sep 17 00:00:00 2001 From: Mitch Harding Date: Wed, 26 Jun 2024 15:27:24 -0400 Subject: [PATCH 15/37] CASMINST-6902: Improve/automate PIT data backup (#5182) --- install/deploy_final_non-compute_node.md | 78 ++++--------- install/scripts/backup-pit-data.sh | 139 +++++++++++++++++++++++ 2 files changed, 159 insertions(+), 58 deletions(-) create mode 100755 install/scripts/backup-pit-data.sh diff --git a/install/deploy_final_non-compute_node.md b/install/deploy_final_non-compute_node.md index e88d458fe7be..0d96d4007d06 100644 --- a/install/deploy_final_non-compute_node.md +++ b/install/deploy_final_non-compute_node.md @@ -190,70 +190,34 @@ The steps in this section load hand-off data before a later procedure reboots th It is important to backup some files from `ncn-m001` before it is rebooted. -1. (`pit#`) Set up passwordless SSH **to** the PIT node from `ncn-m002`. - - > The `ssh` command below may prompt for the NCN root password. - - ```bash - ssh ncn-m002 cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys && - chmod 600 /root/.ssh/authorized_keys - ``` - 1. (`pit#`) Stop the typescript session. ```bash exit ``` -1. (`pit#`) Preserve logs and configuration files if desired. +1. (`pit#`) Create PIT backup and copy it off. - The following commands create a `tar` archive of select files on the PIT node. This archive is located - in a directory that will be backed up in the next steps. + This script creates a backup of select files on the PIT node, copying them to both + another master NCN and to S3. + + > The script below may prompt for the NCN root password. ```bash - mkdir -pv "${PITDATA}"/prep/logs && - ls -d \ - /etc/dnsmasq.d \ - /etc/os-release \ - /etc/sysconfig/network \ - /opt/cray/tests/cmsdev.log \ - /opt/cray/tests/install/logs \ - /opt/cray/tests/logs \ - /root/.canu \ - /root/.config/cray/logs \ - /root/csm*.{log,txt} \ - /tmp/*.log \ - /usr/share/doc/csm/install/scripts/csm_services/yapl.log \ - /var/log/conman \ - /var/log/zypper.log 2>/dev/null | - sed 's_^/__' | - xargs tar -C / -czvf "${PITDATA}/prep/logs/pit-backup-$(date +%Y-%m-%d_%H-%M-%S).tgz" + /usr/share/doc/csm/install/scripts/backup-pit-data.sh ``` -1. (`pit#`) Copy some of the installation files to `ncn-m002`. - - These files will be copied back to `ncn-m001` after the PIT node is rebooted. + Ensure that the script output ends with `COMPLETED`, indicating that the procedure was successful. - ```bash - ssh ncn-m002 \ - "mkdir -pv /metal/bootstrap - rsync -e 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null' -rltD -P --delete pit.nmn:'${PITDATA}'/prep /metal/bootstrap/ - rsync -e 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null' -rltD -P --delete pit.nmn:'${CSM_PATH}'/images/pre-install-toolkit/pre-install-toolkit*.iso /metal/bootstrap/" - ``` +1. In the output of the script run in the previous step, note the value it reports for the `first-master-hostname`. + This will be needed in a later step. -1. (`pit#`) Upload install files to S3 in the cluster. 
+ Example output excerpt: - ```bash - PITBackupDateTime=$(date +%Y-%m-%d_%H-%M-%S) - tar -czvf "${PITDATA}/PitPrepIsoConfigsBackup-${PITBackupDateTime}.tgz" "${PITDATA}/prep" "${PITDATA}/configs" "${CSM_PATH}/images/pre-install-toolkit/pre-install-toolkit"*.iso && - cray artifacts create config-data \ - "PitPrepIsoConfigsBackup-${PITBackupDateTime}.tgz" \ - "${PITDATA}/PitPrepIsoConfigsBackup-${PITBackupDateTime}.tgz" && - rm -v "${PITDATA}/PitPrepIsoConfigsBackup-${PITBackupDateTime}.tgz" && echo COMPLETED + ```text + first-master-hostname: ncn-m002 ``` - Ensure that the previous command chain output ends with `COMPLETED`, indicating that the procedure was successful. - ## 4. Reboot 1. (`external#`) Open a serial console to the PIT node, if one is not already open. @@ -327,13 +291,15 @@ It is important to backup some files from `ncn-m001` before it is rebooted. 1. (`ncn-m001#`) Restore and verify the site link. Restore networking files from the manual backup taken during the - [Backup](#33-backup) step. + [Backup](#33-backup) step. Set the `FM` variable to the `first-master-hostname` + value noted in that section. > **`NOTE`** Do NOT change any default NCN hostname; otherwise, unexpected deployment or upgrade errors may happen. ```bash SYSTEM_NAME=eniac - rsync "ncn-m002:/metal/bootstrap/prep/${SYSTEM_NAME}/pit-files/ifcfg-lan0" /etc/sysconfig/network/ && \ + FM=ncn-m002 + rsync "${FM}:/metal/bootstrap/prep/${SYSTEM_NAME}/pit-files/ifcfg-lan0" /etc/sysconfig/network/ && \ wicked ifreload lan0 && \ wicked ifstatus lan0 ``` @@ -378,19 +344,15 @@ It is important to backup some files from `ncn-m001` before it is rebooted. exit ``` - 1. (`ncn-m002#`) Copy install files back to `ncn-m001`. + 1. If `ncn-m002` is not the `first-master-hostname` noted in the [Backup](#33-backup) step, then SSH to that node. - ```bash - rsync -rltDv -P /metal/bootstrap ncn-m001:/metal/ && rm -rfv /metal/bootstrap - ``` - - 1. (`ncn-m002#`) Log out of `ncn-m002`. + 1. (`first-master-hostname#`) Copy install files back to `ncn-m001`. ```bash - exit + rsync -rltDv -P /metal/bootstrap ncn-m001:/metal/ && rm -rfv /metal/bootstrap ``` - 1. Log in to `ncn-m001`. + 1. Log out of the other nodes and log in to `ncn-m001`. SSH back into `ncn-m001` or log in at the console. diff --git a/install/scripts/backup-pit-data.sh b/install/scripts/backup-pit-data.sh new file mode 100755 index 000000000000..cf97d8d6518c --- /dev/null +++ b/install/scripts/backup-pit-data.sh @@ -0,0 +1,139 @@ +#!/bin/bash +# +# MIT License +# +# (C) Copyright 2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +set -euo pipefail + +# This means that something like /tmp/*.log will evaluate to an empty string if no files fit the pattern +shopt -s nullglob + +# This script is a replacement for the steps that were previously done manually +# during the "Deploy Final NCN" step of CSM installs. + +function err_exit { + echo "ERROR: $*" >&2 + exit 1 +} + +function dir_exists { + [[ -e $1 ]] || err_exit "Directory '$1' does not exist" + [[ -d $1 ]] || err_exit "'$1' exists but is not a directory" +} + +function run_cmd { + echo "# $*" + "$@" || err_exit "Command failed with exit code $?: $*" +} + +# Ensure that PITDATA and CSM_PATH variables are set +[[ -v PITDATA && -n ${PITDATA} ]] || err_exit "PITDATA variable must be set" +[[ -v CSM_PATH && -n ${CSM_PATH} ]] || err_exit "CSM_PATH variable must be set" + +# Make sure that expected directories exist and are actually directories +for DIR in "${PITDATA}" "${PITDATA}/prep" "${PITDATA}/configs" "${CSM_PATH}" \ + "${CSM_PATH}/images" "${CSM_PATH}/images/pre-install-toolkit"; do + + dir_exists "${DIR}" + +done + +PIT_ISO_DIR="${CSM_PATH}/images/pre-install-toolkit" + +# Make sure that expected PIT iso file can be found +compgen -G "${PIT_ISO_DIR}/pre-install-toolkit*.iso" > /dev/null 2>&1 || err_exit "PIT ISO file (${PIT_ISO_DIR}/pre-install-toolkit*.iso) not found" + +# Make sure we can figure out the first master node +DATA_JSON="${PITDATA}/configs/data.json" +[[ -e ${DATA_JSON} ]] || err_exit "File does not exist: '${DATA_JSON}'" +[[ -f ${DATA_JSON} ]] || err_exit "Exists but is not a regular file: '${DATA_JSON}'" +[[ -s ${DATA_JSON} ]] || err_exit "File exists but is empty: '${DATA_JSON}'" + +FM=$(jq -r '."Global"."meta-data"."first-master-hostname"' < "${DATA_JSON}") || err_exit "Error getting first-master-hostname from '${DATA_JSON}'" +[[ -n ${FM} ]] || err_exit "No first-master-hostname found in '${DATA_JSON}'" +echo "first-master-hostname: $FM" + +# Set up passwordless SSH **to** the PIT node from the first-master node +echo "If prompted, enter the $(whoami) password for ${FM}" +ssh "${FM}" cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys || err_exit "Unable to read ${FM}:/root/.ssh/id_rsa.pub and/or write to /root/.ssh/authorized_keys" +run_cmd chmod 600 /root/.ssh/authorized_keys + +# Okay, everything seems good +run_cmd mkdir -pv "${PITDATA}"/prep/logs + +# Because some of these files are log files that are changing during this procedure, any call to directly +# tar them may result in the tar command failing. 
Thus, we first copy all of these files into a temporary +# directory, and from there we create the tar archive + +TEMPDIR=$(mktemp -d) || err_exit "Command failed: mktemp -d" + +echo "Copying selected files to temporary directory" + +for BACKUP_TARGET in \ + /etc/conman.conf \ + /etc/dnsmasq.d \ + /etc/os-release \ + /etc/sysconfig/network \ + /opt/cray/tests/cmsdev.log \ + /opt/cray/tests/install/logs \ + /opt/cray/tests/logs \ + /root/.bash_history \ + /root/.canu \ + /root/.config/cray/logs \ + /root/csm*.{log,txt} \ + /tmp/*.log \ + /usr/share/doc/csm/install/scripts/csm_services/yapl.log \ + /var/log; do + + [[ -e ${BACKUP_TARGET} ]] || continue + DIRNAME=$(dirname "${BACKUP_TARGET}") + TARG_DIR="${TEMPDIR}${DIRNAME}" + run_cmd mkdir -pv "${TARG_DIR}" + run_cmd cp -pr "${BACKUP_TARGET}" "${TARG_DIR}" + +done + +echo "Creating PIT backup tarfile" + +pushd "${TEMPDIR}" +run_cmd tar -czvf "${PITDATA}/prep/logs/pit-backup-$(date +%Y-%m-%d_%H-%M-%S).tgz" --remove-files * +popd +run_cmd rmdir -v "${TEMPDIR}" + +echo "Copying files to ${FM}" +ssh "${FM}" \ + "mkdir -pv /metal/bootstrap && + rsync -e 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null' -rltD -P --delete pit.nmn:'${PITDATA}'/prep /metal/bootstrap/ && + rsync -e 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null' -rltD -P --delete pit.nmn:'${PIT_ISO_DIR}'/pre-install-toolkit*.iso /metal/bootstrap/" + +PITBackupDateTime=$(date +%Y-%m-%d_%H-%M-%S) +run_cmd tar -czvf "${PITDATA}/PitPrepIsoConfigsBackup-${PITBackupDateTime}.tgz" "${PITDATA}/prep" "${PITDATA}/configs" "${PIT_ISO_DIR}/pre-install-toolkit"*.iso +run_cmd cray artifacts create config-data \ + "PitPrepIsoConfigsBackup-${PITBackupDateTime}.tgz" \ + "${PITDATA}/PitPrepIsoConfigsBackup-${PITBackupDateTime}.tgz" +run_cmd rm -v "${PITDATA}/PitPrepIsoConfigsBackup-${PITBackupDateTime}.tgz" + +# Since the installer needs to take note of this value, we will display it again here at the end of the script +echo "first-master-hostname: $FM" + +echo COMPLETED From 941238e86ac144eb82727084f2c8dc8c20fa90de Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 2 Jul 2024 14:02:09 -0500 Subject: [PATCH 16/37] CASMINST-6718: Create VCS import/export scripts (#5192) * CASMINST-6718: Create VCS import/export scripts (cherry picked from commit 2173fe18af9fdeb294d338035fd0507712a64c59) (cherry picked from commit a2e78bac4aa7d6241a24ac3de488fbae136277e3) * Placate modified linter whims (cherry picked from commit ae797d98643e2fd41030fcd1cae9bce665930d30) (cherry picked from commit 9bfc2ca78e5211158e92a01d03528234757a5ca9) * CASMINST-6718: Allow user to specify working directory location for VCS backups (cherry picked from commit 030d3f871d00668292c52338d7dd262b53195a50) * CASMINST-6718: setup_cms_minio_mount: Allow user to specify mount point (cherry picked from commit 945ac0bffa1a19e68e51b4cdfbb3549c10b7a22e) * CASMINST-6718: setup_cms_minio_mount: Add option to create cms bucket without creating s3fs mount (cherry picked from commit 6920179c604c49bd5bee887737465f410464d59a) --------- Co-authored-by: Mitch Harding (the weird one) --- .../Version_Control_Service_VCS.md | 74 ++++++- .../operations/configuration/backup_vcs.sh | 133 ++++++++++++ .../operations/configuration/bash_lib/vcs.sh | 45 ++++ .../operations/configuration/restore_vcs.sh | 203 ++++++++++++++++++ .../system_recovery/setup_cms_minio_mount.sh | 128 +++++++++++ 5 files changed, 575 insertions(+), 8 deletions(-) create mode 100755 
scripts/operations/configuration/backup_vcs.sh create mode 100644 scripts/operations/configuration/bash_lib/vcs.sh create mode 100755 scripts/operations/configuration/restore_vcs.sh create mode 100755 scripts/operations/system_recovery/setup_cms_minio_mount.sh diff --git a/operations/configuration_management/Version_Control_Service_VCS.md b/operations/configuration_management/Version_Control_Service_VCS.md index afbd75e950f4..f227cd14c1a9 100644 --- a/operations/configuration_management/Version_Control_Service_VCS.md +++ b/operations/configuration_management/Version_Control_Service_VCS.md @@ -6,10 +6,14 @@ * [Change VCS administrative user password](#change-vcs-administrative-user-password) * [Access the `cray` Gitea organization](#access-the-cray-gitea-organization) * [Backup and restore data](#backup-and-restore-data) - * [Backup Postgres data](#backup-postgres-data) - * [Backup PVC data](#backup-pvc-data) - * [Restore Postgres data](#restore-postgres-data) - * [Restore PVC data](#restore-pvc-data) + * [Automated backup and restore](#automated-backup-and-restore) + * [Automated backup](#automated-backup) + * [Automated restore](#automated-restore) + * [Manual backup and restore](#manual-backup-and-restore) + * [Manually backup Postgres data](#manually-backup-postgres-data) + * [Manually backup PVC data](#manually-backup-pvc-data) + * [Manually restore Postgres data](#manually-restore-postgres-data) + * [Manually restore PVC data](#manually-restore-pvc-data) * [Alternative backup/restore strategy](#alternative-backuprestore-strategy) * [Alternative export method](#alternative-export-method) * [Alternative import method](#alternative-import-method) @@ -227,7 +231,58 @@ Select the permissions appropriately, and then navigate to the following URL to Data for Gitea is stored in two places: Git content is stored directly in a PVC, while structural data, such as Gitea users and the list and attributes of repositories, is stored in a Postgres database. Because of this, both sources must be backed up and restored together. -### Backup Postgres data +* [Automated backup and restore](#automated-backup-and-restore) + * [Automated backup](#automated-backup) + * [Automated restore](#automated-restore) +* [Manual backup and restore](#manual-backup-and-restore) + * [Manually backup Postgres data](#manually-backup-postgres-data) + * [Manually backup PVC data](#manually-backup-pvc-data) + * [Manually restore Postgres data](#manually-restore-postgres-data) + * [Manually restore PVC data](#manually-restore-pvc-data) +* [Alternative backup/restore strategy](#alternative-backuprestore-strategy) + * [Alternative export method](#alternative-export-method) + * [Alternative import method](#alternative-import-method) + +### Automated backup and restore + +* [Automated backup](#automated-backup) +* [Automated restore](#automated-restore) + +#### Automated backup + +(`ncn-mw#`) Running the following script creates a tar archive containing both the Postgres and PVC data. + +> The argument to the script is the directory where the resulting archive should be created. + +```bash +/usr/share/doc/csm/scripts/operations/configuration/backup_vcs.sh /root +``` + +The end of the output will include the path to the backup archive. For example: + +```text +Gitea/VCS data successfully backed up to /root/gitea-vcs-20240626192742-dRW95b.tgz +``` + +Be sure to save the resulting archive file to a safe location. 
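+
+For example, one possible way to keep a copy off the node (assuming the `config-data` S3 bucket is available on the system) is to upload the archive with the Cray CLI:
+
+```bash
+cray artifacts create config-data gitea-vcs-20240626192742-dRW95b.tgz /root/gitea-vcs-20240626192742-dRW95b.tgz
+```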
+ +#### Automated restore + +(`ncn-mw#`) The archive generated by the [Automated backup](#automated-backup) script can be used as +input to the following automated restore script. + +```bash +/usr/share/doc/csm/scripts/operations/configuration/restore_vcs.sh /root/gitea-vcs-20240626192742-dRW95b.tgz +``` + +### Manual backup and restore + +* [Manually backup Postgres data](#manually-backup-postgres-data) +* [Manually backup PVC data](#manually-backup-pvc-data) +* [Manually restore Postgres data](#manually-restore-postgres-data) +* [Manually restore PVC data](#manually-restore-pvc-data) + +#### Manually backup Postgres data 1. (`ncn-mw#`) Determine which Postgres member is the leader. @@ -290,7 +345,7 @@ in a Postgres database. Because of this, both sources must be backed up and rest 1. Copy all files to a safe location. -### Backup PVC data +#### Manually backup PVC data (`ncn-mw#`) The VCS Postgres backups should be accompanied by backups of the VCS PVC. The export process can be run at any time while the service is running using the following commands: @@ -303,11 +358,11 @@ kubectl -n services cp ${POD}:/tmp/vcs.tar ./vcs.tar Be sure to save the resulting `tar` file to a safe location. -### Restore Postgres data +#### Manually restore Postgres data See [Restore Postgres for VCS](../../operations/kubernetes/Restore_Postgres.md#restore-postgres-for-vcs). -### Restore PVC data +#### Manually restore PVC data (`ncn-mw#`) When restoring the VCS Postgres database, the PVC should also be restored to the same point in time. The restore process can be run at any time while the service is running using the following commands: @@ -327,6 +382,9 @@ and may need to be recreated manually if the VCS deployment is lost. The following scripts create and use a `vcs-content` directory that contains all Git data. This should be copied to a safe location after export, and moved back to the system before import. +* [Alternative export method](#alternative-export-method) +* [Alternative import method](#alternative-import-method) + #### Alternative export method > **WARNING:** The following example uses the VCS `admin` username and password in plaintext on the command line, meaning it will be stored in the shell history as diff --git a/scripts/operations/configuration/backup_vcs.sh b/scripts/operations/configuration/backup_vcs.sh new file mode 100755 index 000000000000..882a2afe3380 --- /dev/null +++ b/scripts/operations/configuration/backup_vcs.sh @@ -0,0 +1,133 @@ +#!/bin/bash +# +# MIT License +# +# (C) Copyright 2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +locOfScript=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) +# Inform ShellCheck about the file we are sourcing +# shellcheck source=./bash_lib/common.sh +. "${locOfScript}/bash_lib/common.sh" + +# Inform ShellCheck about the file we are sourcing +# shellcheck source=./bash_lib/vcs.sh +. "${locOfScript}/bash_lib/vcs.sh" + +set -uo pipefail + +function backup_postgres { + local leader_pod json secrets secret num_secrets tmpfile field + + sql_outfile="${TMPDIR}/${SQL_BACKUP_NAME}" + sec_outfile="${TMPDIR}/${SEC_BACKUP_NAME}" + + json=$(run_cmd kubectl exec gitea-vcs-postgres-0 -n services -c postgres -it -- patronictl list -f json) || err_exit + leader_pod=$(run_cmd jq -r '.[] | select(.Role == "Leader") | .Member' <<< "${json}") || err_exit + [[ -n ${leader_pod} ]] || err_exit "No gitea-vcs-postgres leader pod found" + echo "Backing up data from gitea-vcs-postgres leader pod ${leader_pod} to ${sql_outfile}" + + run_cmd kubectl exec -it "${leader_pod}" -n services -c postgres -- pg_dumpall --if-exists -c -U postgres > "${sql_outfile}" \ + || err_exit "Error writing to file '${sql_outfile}'" + + echo "Backing up gitea-vcs-postgres Kubernetes secrets to ${sec_outfile}" + + num_secrets=0 + secrets=$(run_cmd kubectl get secrets -n services -l cluster-name=gitea-vcs-postgres -o custom-columns=":metadata.name" --no-headers) || err_exit + tmpfile=$(run_mktemp -p "$TMPDIR") || err_exit + echo "---" > "${sec_outfile}" || err_exit "Error writing to '${sec_outfile}'" + for secret in ${secrets}; do + let num_secrets+=1 + echo "Backing up secret: ${secret}" + run_cmd kubectl get secret "${secret}" -n services -o yaml > "${tmpfile}" || err_exit "Error writing to '${tmpfile}'" + for field in creationTimestamp resourceVersion selfLink uid; do + run_cmd yq d -i "${tmpfile}" "metadata.${field}" + done + run_cmd cat "${tmpfile}" >> "${sec_outfile}" || err_exit "Error appending to '${sec_outfile}'" + echo "---" >> "${sec_outfile}" || err_exit "Error appending to '${sec_outfile}'" + done + run_cmd rm "${tmpfile}" + [[ ${num_secrets} -ge 3 ]] || err_exit "Expected at least 3 secrets, but only found ${num_secrets}" +} + +function backup_pvc { + local pvc_outfile gitea_pod + + pvc_outfile="${TMPDIR}/${PVC_BACKUP_NAME}" + + # Set the gitea_pod variable to the name of the gitea pod + get_gitea_pod + + echo "Backing up PVC data from gitea pod ${gitea_pod}" + run_cmd kubectl -n services exec "${gitea_pod}" -- tar -cf /tmp/vcs.tar /var/lib/gitea/ + echo "Copying backed up data out of the pod to ${pvc_outfile}" + run_cmd kubectl -n services cp "${gitea_pod}":/tmp/vcs.tar "${pvc_outfile}" +} + +function usage { + echo "Usage: backup_vcs.sh [-t workdir_location] [output_directory]" >&2 + echo + echo "If no output directory is specified, one is created under the user's home directory" >&2 + echo "If no working directory is specified, one is created under the user's home directory" >&2 +} + +OUTDIR="" +WORKDIR_BASE="" + +if [[ $# -eq 1 ]] && [[ $1 == "-h" || $1 == "--help" ]]; then + usage + exit 2 +fi + +while [[ $# -gt 0 ]]; do + case "$1" in + "-t") + [[ $# -gt 1 ]] || usage_err_exit "The $1 parameter requires an argument" + [[ -n ${WORKDIR_BASE} ]] && usage_err_exit "The $1 parameter may only be specified once" + shift + [[ -n $1 ]] 
|| usage_err_exit "Work directory may not be blank" + [[ -e $1 ]] || usage_err_exit "Specified work directory ($1) does not exist" + [[ -d $1 ]] || usage_err_exit "Specified work directory ($1) exists but is not a directory" + WORKDIR_BASE="$1" + ;; + *) + [[ $# -eq 1 ]] || usage_err_exit "Too many arguments" + [[ -n $1 ]] || usage_err_exit "Output directory argument may not be blank" + [[ -e $1 ]] || usage_err_exit "Specified output directory ($1) does not exist" + [[ -d $1 ]] || usage_err_exit "Specified output directory ($1) exists but is not a directory" + OUTDIR="$1" + ;; + esac + shift +done + +[[ -n ${OUTDIR} ]] || OUTDIR=~ +[[ -n ${WORKDIR_BASE} ]] || WORKDIR_BASE=~ + +TMPDIR=$(run_mktemp -d "${WORKDIR_BASE}/gitea_vcs_backup.$(date +%Y%m%d%H%M%S).XXX") || err_exit + +echo "Backing up Gitea/VCS data" +backup_postgres +backup_pvc +BACKUP_TARFILE=$(run_mktemp "${OUTDIR}/gitea-vcs-$(date +%Y%m%d%H%M%S)-XXXXXX.tgz") || err_exit +run_cmd tar -C "${TMPDIR}" -czf "${BACKUP_TARFILE}" --remove-files "${SQL_BACKUP_NAME}" "${SEC_BACKUP_NAME}" "${PVC_BACKUP_NAME}" +rmdir "${TMPDIR}" || echo "WARNING: Unable to remove temporary directory '${TMPDIR}'" +echo "Gitea/VCS data successfully backed up to ${BACKUP_TARFILE}" diff --git a/scripts/operations/configuration/bash_lib/vcs.sh b/scripts/operations/configuration/bash_lib/vcs.sh new file mode 100644 index 000000000000..c79add810db8 --- /dev/null +++ b/scripts/operations/configuration/bash_lib/vcs.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# +# MIT License +# +# (C) Copyright 2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +locOfScript=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) +# Inform ShellCheck about the file we are sourcing +# shellcheck source=./common.sh +. 
"${locOfScript}/common.sh" + +# Shared function and variable definitions between VCS backup and restore scripts + +# These variables are not used in this file, but are used by scripts which source this file +#shellcheck disable=SC2034 +SQL_BACKUP_NAME=gitea-vcs-postgres.sql +#shellcheck disable=SC2034 +SEC_BACKUP_NAME=gitea-vcs-postgres.manifest +#shellcheck disable=SC2034 +PVC_BACKUP_NAME=vcs.tar + +function get_gitea_pod { + # Sets $gitea_pod to the name of the gitea pod, or exits if it cannot be found + gitea_pod=$(run_cmd kubectl -n services get pod -l app.kubernetes.io/instance=gitea -o custom-columns=":metadata.name" --no-headers) || err_exit + [[ -n ${gitea_pod} ]] || err_exit "No gitea pod found" +} diff --git a/scripts/operations/configuration/restore_vcs.sh b/scripts/operations/configuration/restore_vcs.sh new file mode 100755 index 000000000000..4b350d572508 --- /dev/null +++ b/scripts/operations/configuration/restore_vcs.sh @@ -0,0 +1,203 @@ +#!/bin/bash +# +# MIT License +# +# (C) Copyright 2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +locOfScript=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) +# Inform ShellCheck about the file we are sourcing +# shellcheck source=./bash_lib/common.sh +. "${locOfScript}/bash_lib/common.sh" + +# Inform ShellCheck about the file we are sourcing +# shellcheck source=./bash_lib/vcs.sh +. "${locOfScript}/bash_lib/vcs.sh" + +set -uo pipefail + +DUMPFILE="" +MANIFEST="" +TARFILE="" + +function wait_for_pods_to_start { + # Usage: wait_for_pods_to_start <# of pods expected> + local num_pods + [[ $# -ne 2 ]] && err_exit "$0: Function requires exactly 2 arguments but received $#. Invalid arguments: $*" + [[ -z $1 ]] && err_exit "$0: First argument may not be blank" + [[ -z $1 ]] && err_exit "$0: Second argument may not be blank" + [[ ! $2 -gt 0 ]] && err_exit "$0: Second argument must be an integer greated than 0. Invalid second argument: $2" + + echo "Wait for $2 pod(s) to be running." + num_pods=$(kubectl get pods -l "$1" -n services | grep Running | wc -l) + while [[ ${num_pods} -lt $2 ]]; do + echo " ${num_pods} running; waiting for $2 pod(s) to be running" + sleep 5 + num_pods=$(kubectl get pods -l "$1" -n services | grep Running | wc -l) + done +} + +function wait_for_pods_to_terminate { + # Usage: wait_for_pods_to_terminate + [[ $# -ne 1 ]] && err_exit "$0: Function requires exactly 1 argument but received $#. 
Invalid arguments: $*" + [[ -z $1 ]] && err_exit "$0: Argument may not be blank" + echo "Wait for pods to terminate ($1)" + while kubectl get pods -n services -l "$1" | grep -qv NAME; do + echo " waiting for pods to terminate" + sleep 5 + done +} + +# It seems that shellcheck doesn't like that we defensively check to make sure the function +# did not accidentally get passed arguments. Sorry not sorry, shellcheck +#shellcheck disable=SC2120 +function wait_for_postgres_cluster_running { + # Takes no arguments + local status + [[ $# -ne 0 ]] && err_exit "$0: Function takes no arguments but received $#. Invalid arguments: $*" + + echo "Wait for the gitea-vcs-postgres Postgres cluster to start running." + while true; do + status=$(kubectl get postgresql gitea-vcs-postgres -n services -o json | jq -r '.status.PostgresClusterStatus') + [[ ${status} == "Running" ]] && return + echo " waiting for postgresql to start running" + sleep 5 + done +} + +function restore_sql_and_secrets { + local tmp_outfile postgres_cr_json postgres_cr_single_json + + echo "Scale VCS service to 0" + run_cmd kubectl scale deployment gitea-vcs -n services --replicas=0 + + wait_for_pods_to_terminate app.kubernetes.io/name=vcs + + echo "Delete VCS Postgres cluster" + + tmp_outfile=$(run_mktemp -p ~) || exit 1 + run_cmd kubectl get postgresql gitea-vcs-postgres -n services -o json > "${tmp_outfile}" || err_exit "Error creating ${tmp_outfile}" + + postgres_cr_json=$(run_mktemp -p ~ postgres-cr.XXX.json) || exit 1 + run_cmd jq 'del(.spec.selector) | del(.spec.template.metadata.labels."controller-uid") | del(.status)' "${tmp_outfile}" > "${postgres_cr_json}" || err_exit "Error creating ${postgres_cr_json}" + + run_cmd kubectl delete -f "${postgres_cr_json}" + + wait_for_pods_to_terminate application=spilo,cluster-name=gitea-vcs-postgres + + echo "Create a new single instance VCS Postgres cluster." + postgres_cr_single_json=$(run_mktemp -p ~ postgres-cr-single.XXX.json) || exit 1 + + run_cmd jq '.spec.numberOfInstances = 1' "${postgres_cr_json}" > "${postgres_cr_single_json}" || err_exit "Error creating ${postgres_cr_single_json}" + + run_cmd kubectl create -f "${postgres_cr_single_json}" + + wait_for_pods_to_start application=spilo,cluster-name=gitea-vcs-postgres 1 + + wait_for_postgres_cluster_running + + echo "Restore the database from ${DUMPFILE}" + run_cmd kubectl exec gitea-vcs-postgres-0 -c postgres -n services -it -- psql -U postgres < "${DUMPFILE}" || err_exit "Error reading from $DUMPFILE" + + echo "Delete the gitea-vcs-postgres secrets" + run_cmd kubectl delete -f "${MANIFEST}" + + echo "Recreate the gitea-vcs-postgres secrets using the manifest (${MANIFEST})" + run_cmd kubectl apply -f "${MANIFEST}" + + echo "Restart the Postgres cluster." + run_cmd kubectl delete pod -n services gitea-vcs-postgres-0 + + wait_for_pods_to_start application=spilo,cluster-name=gitea-vcs-postgres 1 + + echo "Scale the Postgres cluster back to 3 instances." + run_cmd kubectl patch postgresql gitea-vcs-postgres -n services --type=json -p='[{"op" : "replace", "path":"/spec/numberOfInstances", "value" : 3}]' + + wait_for_postgres_cluster_running + + echo "Scale the Gitea service back up." 
+ run_cmd kubectl scale deployment gitea-vcs -n services --replicas=1 + + wait_for_pods_to_start app.kubernetes.io/name=vcs 1 + + rm "${tmp_outfile}" "${postgres_cr_json}" "${postgres_cr_single_json}" > /dev/null 2>&1 +} + +function restore_pvc_data { + local gitea_pod + + # Set the gitea_pod variable to the name of the gitea pod + get_gitea_pod + + echo "Copy PVC data tarfile into pod (${gitea_pod})" + run_cmd kubectl -n services cp "${TARFILE}" "${gitea_pod}":/tmp/vcs.tar + + echo "Expand PVC data tarfile in pod" + run_cmd kubectl -n services exec "${gitea_pod}" -- tar -C / -xf /tmp/vcs.tar +} + +function usage { + echo "Usage: restore_vcs.sh " + echo + echo "This file is the one produced by the backup_vcs.sh script" >&2 +} + +function input_file_exists_nonempty { + [[ $# -eq 1 ]] || err_exit "Programming logic error: $0 function takes exactly 1 argument but received $#: $*" + [[ -n $1 ]] || err_exit "Programming logic error: $0 function argument may not be blank" + [[ -e $1 ]] || usage_err_exit "File does not exist: '$1'" + [[ -f $1 ]] || usage_err_exit "Exists but is not a regular file: '$1'" + [[ -s $1 ]] || usage_err_exit "File is 0 size: '$1'" +} + +[[ $# -eq 0 ]] && usage_err_exit "Missing required arguments" +[[ $# -gt 1 ]] && usage_err_exit "Too many arguments" +[[ -n $1 ]] || usage_err_exit "Argument may not be blank" +input_file_exists_nonempty "$1" + +TMPDIR=$(run_mktemp -d -p ~) || err_exit +run_cmd tar -C "${TMPDIR}" -xvf "$1" + +DUMPFILE="${TMPDIR}/${SQL_BACKUP_NAME}" +MANIFEST="${TMPDIR}/${SEC_BACKUP_NAME}" +TARFILE="${TMPDIR}/${PVC_BACKUP_NAME}" +input_file_exists_nonempty "${DUMPFILE}" +input_file_exists_nonempty "${MANIFEST}" +input_file_exists_nonempty "${TARFILE}" + +# A very quick check just to help catch cases where the completely wrong file is somehow found +grep -q 'PostgreSQL database cluster dump' "${DUMPFILE}" || usage_err_exit "Does not appear to be a SQL database cluster dump: '${DUMPFILE}'" +grep -Eq '^apiVersion:' "${MANIFEST}" || usage_err_exit "Does not appear to be a manifest file: '${MANIFEST}'" +file "${TARFILE}" | grep -q 'tar archive' || usage_err_exit "Does not appear to be a tar archive: '${TARFILE}'" + +restore_sql_and_secrets +restore_pvc_data + +echo "Restart gitea-vcs deployment" +run_cmd kubectl -n services rollout restart deployment gitea-vcs + +echo "Wait for restart to complete" +run_cmd kubectl -n services rollout status deployment gitea-vcs + +rm "${DUMPFILE}" "${MANIFEST}" "${TARFILE}" +rmdir "${TMPDIR}" + +echo "Gitea/VCS restore completed!" diff --git a/scripts/operations/system_recovery/setup_cms_minio_mount.sh b/scripts/operations/system_recovery/setup_cms_minio_mount.sh new file mode 100755 index 000000000000..d5d40360648f --- /dev/null +++ b/scripts/operations/system_recovery/setup_cms_minio_mount.sh @@ -0,0 +1,128 @@ +#!/bin/bash +# +# MIT License +# +# (C) Copyright 2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +DEFAULT_CMS_MINIO_MNT=/etc/cray/minio/cms +AWS_CREDFILE=/root/.aws/credentials + +locOfScript=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) +# Inform ShellCheck about the file we are sourcing +# shellcheck source=../configuration/bash_lib/common.sh +. "${locOfScript}/../configuration/bash_lib/common.sh" + +set -o pipefail + +function usage { + echo "Usage: setup_cms_minio_mount.sh {--rw | --ro} [--init] [mount_point]" >&2 + echo " setup_cms_minio_mount.sh --nomount --init" >&2 + echo >&2 + echo "If --init is specified, the cms bucket will be created, if it does not exist." >&2 + echo "The --rw / --ro arguments govern whether it will be mounted read-write or read-only" >&2 + echo "If mount_point is not specified, it defaults to '${DEFAULT_CMS_MINIO_MNT}'" >&2 + echo >&2 + echo "The --nomount --init option creates the cms bucket (if needed) but does not create a mount" >&2 + echo >&2 +} + +CMS_MINIO_MNT="" +MOUNT_OPT="" +INIT="" +[[ $# -eq 0 ]] && usage_err_exit "At least 1 argument is required" +while [[ $# -gt 0 ]]; do + case "$1" in + "--ro") + [[ ${MOUNT_OPT} == "ro" ]] && usage_err_exit "Argument --$1 may only be specified once" + [[ -n ${MOUNT_OPT} ]] && usage_err_exit "Arguments $1 and --${MOUNT_OPT} are mutually exclusive" + MOUNT_OPT=ro + ;; + "--rw") + [[ ${MOUNT_OPT} == "rw" ]] && usage_err_exit "Argument --$1 may only be specified once" + [[ -n ${MOUNT_OPT} ]] && usage_err_exit "Arguments $1 and --${MOUNT_OPT} are mutually exclusive" + MOUNT_OPT=rw + ;; + "--nomount") + [[ ${MOUNT_OPT} == "nomount" ]] && usage_err_exit "Argument --$1 may only be specified once" + [[ -n ${MOUNT_OPT} ]] && usage_err_exit "Arguments $1 and --${MOUNT_OPT} are mutually exclusive" + MOUNT_OPT=nomount + ;; + "--init") + [[ -n ${INIT} ]] && usage_err_exit "Argument --init may only be specified once" + INIT=Y + ;; + *) + [[ $# -gt 1 ]] && usage_err_exit "Too many arguments" + [[ ${MOUNT_OPT} == "nomount" ]] && usage_err_exit "Invalid to specify a mount point when --nomount specified" + [[ -n $1 ]] || usage_err_exit "Mount point may not be blank" + [[ $1 =~ ^/.* ]] || usage_err_exit "Cannot use relative path for mount point" + CMS_MINIO_MNT="$1" + ;; + esac + shift +done + +[[ -z ${MOUNT_OPT} ]] && usage_err_exit "One of the following options must be specified: --nomount, --ro, --rw" + +[[ ${MOUNT_OPT} == "nomount" && -z ${INIT} ]] && usage_err_exit "Invalid to specify --nomount without --init" + +# Make sure the credentials file exists and is not empty +[[ -e ${AWS_CREDFILE} ]] || err_exit "AWS credentials file (${AWS_CREDFILE}) does not exist" +[[ -f ${AWS_CREDFILE} ]] || err_exit "AWS credentials file (${AWS_CREDFILE}) exists but is not a regular file" +[[ -s ${AWS_CREDFILE} ]] || err_exit "AWS credentials file (${AWS_CREDFILE}) exists but is empty" + +# Check for existence of CMS bucket +if ! 
aws s3api list-buckets --endpoint-url http://ncn-m001.nmn:8000 | jq -r '.Buckets[] | .Name' | grep -Eq '^cms$'; then + [[ -z ${INIT} ]] && err_exit "'cms' bucket does not exist in Minio" + echo "Creating cms bucket" + run_cmd aws s3api create-bucket --bucket cms --endpoint-url http://ncn-m001.nmn:8000 + echo "cms minio bucket created" +else + echo "cms minio bucket already exists" +fi + +[[ ${MOUNT_OPT} == "nomount" ]] && exit 0 + +[[ -n ${CMS_MINIO_MNT} ]] || CMS_MINIO_MNT="${DEFAULT_CMS_MINIO_MNT}" + +# Unmount, if it is currently mounted +umount "${CMS_MINIO_MNT}" > /dev/null 2>&1 + +if [[ ! -d ${CMS_MINIO_MNT} ]]; then + echo "Creating directory '${CMS_MINIO_MNT}'" + run_cmd mkdir -pv "${CMS_MINIO_MNT}" +fi + +credfile=$(run_mktemp /root/.XXXXXX.minio.s3fs) || exit 1 + +AKEY=$(grep '^aws_access_key_id = ' "${AWS_CREDFILE}" | awk '{ print $NF }') || err_exit "Error getting aws_access_key_id from ${AWS_CREDFILE}" +SKEY=$(grep '^aws_secret_access_key = ' "${AWS_CREDFILE}" | awk '{ print $NF }') || err_exit "Error getting aws_secret_access_key from ${AWS_CREDFILE}" + +cat << EOF > "${credfile}" || err_exit "Error writing to '${credfile}'" +${AKEY}:${SKEY} +EOF + +run_cmd chmod 600 "${credfile}" +run_cmd s3fs cms "${CMS_MINIO_MNT}" -o "_netdev,${MOUNT_OPT},allow_other,passwd_file=${credfile},url=http://ncn-m001.nmn:8000,use_path_request_style,use_xattr" + +echo "CMS minio mount (${CMS_MINIO_MNT}) created" +exit 0 From 169b8e257869af34f5e9f4f405ddcc6b05218cb0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 2 Jul 2024 14:06:39 -0500 Subject: [PATCH 17/37] CASMINST-6906: Update RPMs on all NCNs in prerequisites.sh; allow vendor changes in test RPM updater (#5196) * CASMINST-6906: Update RPMs on all NCNs in prerequisites.sh; allow vendor changes in test RPM updater (cherry picked from commit 0af679c695cb57ad24874e20289ac5391e25e269) * Update Validate_CSM_Health_During_Upgrade.md Fix pre-existing issue that makes the linter sad Signed-off-by: Mitch Harding --------- Signed-off-by: Mitch Harding Co-authored-by: Mitch Harding (the weird one) --- upgrade/Validate_CSM_Health_During_Upgrade.md | 10 ++++------ upgrade/scripts/upgrade/prerequisites.sh | 5 +++-- upgrade/scripts/upgrade/util/upgrade-test-rpms.sh | 10 ++++++---- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/upgrade/Validate_CSM_Health_During_Upgrade.md b/upgrade/Validate_CSM_Health_During_Upgrade.md index 34f459a683fc..a5f4100cb9a3 100644 --- a/upgrade/Validate_CSM_Health_During_Upgrade.md +++ b/upgrade/Validate_CSM_Health_During_Upgrade.md @@ -18,15 +18,12 @@ If additional shells are opened during this procedure, then record those with typescripts as well. When resuming a procedure after a break, always be sure that a typescript is running before proceeding. -1. Validate CSM health. +1. (`ncn-m002#`) Validate CSM health. - Run the combined health check script, which runs a variety of health checks that should pass at this stage of the upgrade: - - - Kubernetes health checks - - NCN health checks + Run the combined health check script, which runs a variety of health checks that should pass at this stage of the upgrade. ```bash - /opt/cray/tests/install/ncn/automated/ncn-k8s-combined-healthcheck-post-service-upgrade + /opt/cray/tests/install/ncn/automated/ncn-k8s-combined-healthcheck ``` Review the output and follow the instructions provided to resolve any test failures. 
With the exception of @@ -59,6 +56,7 @@ ```bash cray artifacts create config-data "${TARFILE}" "/root/${TARFILE}" ``` + 1. Update ceph node-exporter config for SNMP counters. > **OPTIONAL:** This is an optional step. diff --git a/upgrade/scripts/upgrade/prerequisites.sh b/upgrade/scripts/upgrade/prerequisites.sh index f35d5172367f..dad2997fec42 100755 --- a/upgrade/scripts/upgrade/prerequisites.sh +++ b/upgrade/scripts/upgrade/prerequisites.sh @@ -1289,8 +1289,9 @@ if [[ ${state_recorded} == "0" ]]; then systemctl enable goss-servers systemctl restart goss-servers - # Install above RPMs and restart goss-servers on ncn-w001 - ssh ncn-w001 "rpm --force -Uvh ${url_list[*]}; systemctl enable goss-servers; systemctl restart goss-servers;" + # Install above RPMs and restart goss-servers on all other NCNs + ncns=$(grep -oP 'ncn-\w\d+' /etc/hosts | sort -u | grep -Ev "^$(hostname -s)$" | tr -t '\n' ',') + pdsh -S -b -w ${ncns} "rpm --force -Uvh ${url_list[*]}; systemctl enable goss-servers; systemctl restart goss-servers;" # get all installed CSM version into a file kubectl get cm -n services cray-product-catalog -o json | jq -r '.data.csm' | yq r - -d '*' -j | jq -r 'keys[]' > /tmp/csm_versions diff --git a/upgrade/scripts/upgrade/util/upgrade-test-rpms.sh b/upgrade/scripts/upgrade/util/upgrade-test-rpms.sh index 6f18da0140aa..58bdae3d4744 100755 --- a/upgrade/scripts/upgrade/util/upgrade-test-rpms.sh +++ b/upgrade/scripts/upgrade/util/upgrade-test-rpms.sh @@ -27,21 +27,23 @@ # If --local is not specified, upgrade the test RPMs on all NCNs # If --local is specified, upgrade the test RPMs just on the system where the script is being executed +RPM_LIST="hpe-csm-goss-package csm-testing goss-servers craycli cray-cmstools-crayctldeploy" + set -euo pipefail if [[ $# -eq 0 ]]; then ncns=$(grep -oP 'ncn-\w\d+' /etc/hosts | sort -u | tr -t '\n' ',') - echo "Installing updated versions of hpe-csm-goss-package csm-testing goss-servers craycli RPMs" - pdsh -S -b -w ${ncns} 'zypper install -y hpe-csm-goss-package csm-testing goss-servers craycli' + echo "Installing updated versions of RPMs on all NCNs: ${RPM_LIST}" + pdsh -S -b -w ${ncns} "zypper install -y --allow-vendor-change ${RPM_LIST}" echo "Enabling and restarting goss-servers" pdsh -S -b -w ${ncns} 'systemctl enable goss-servers && systemctl restart goss-servers' elif [[ $# -eq 1 && $1 == --local ]]; then - echo "Installing updated versions of hpe-csm-goss-package csm-testing goss-servers craycli RPMs" - zypper install -y hpe-csm-goss-package csm-testing goss-servers craycli + echo "Installing updated versions of RPMs: ${RPM_LIST}" + zypper install -y --allow-vendor-change ${RPM_LIST} echo "Enabling and restarting goss-servers" systemctl enable goss-servers && systemctl restart goss-servers From 4d445f1a276111f62e607a7ca49b3622dd9da612 Mon Sep 17 00:00:00 2001 From: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> Date: Wed, 3 Jul 2024 00:40:16 +0530 Subject: [PATCH 18/37] CASMTRIAGE-7110 Update configure_cray_cli.md (#5200) Update configure_cray_cli.md Signed-off-by: ganesh s <124240773+ganeshs-hpe@users.noreply.github.com> --- operations/configure_cray_cli.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/operations/configure_cray_cli.md b/operations/configure_cray_cli.md index f0ca173e8345..fd4ffa62fb1a 100644 --- a/operations/configure_cray_cli.md +++ b/operations/configure_cray_cli.md @@ -73,6 +73,11 @@ install and should be removed when the install is complete. 
As the script leverages Keycloak administrative APIs, the `--keycloakHost` command line option must be set to use the CMN load balancer, as detailed below. +> **`NOTES:`** +> +> - This script creates a `temporary user` that can be used for basic `cray` CLI command only until Keycloak is populated with real users. At which point, the `cray` CLI should be re-initialized with a real user. +> - The `temporary user` that was created is only in Keycloak - it is not a `real` user with login shells and home directories. + ### Procedure for temporary Keycloak user 1. (`ncn-mws#`) Unset the `CRAY_CREDENTIALS` environment variable, if previously set. From c884308e2ddcbc7f06ac615a1b7928e60936d758 Mon Sep 17 00:00:00 2001 From: Ryan Haasken <77809410+haasken-hpe@users.noreply.github.com> Date: Tue, 2 Jul 2024 14:10:39 -0500 Subject: [PATCH 19/37] CASMTRIAGE-7016: Fix update_tags.sh by removing bad `echo` (#5201) The `echo` command inside of `get_latest_tag_for_image` pollutes the output of this function, which is then fed into `update_tags_in_file`, which then results in the incorrect arguments being passed to a `sed` command which results in `sed` failing with a "No such file or directory" error. Remove this `echo` command. It's not clear what its purpose is. Test Description: Removed this echo command from a copy of the script and executed the script and verified that it correctly updated image tags in Argo workflow templates. --- workflows/update_tags.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/update_tags.sh b/workflows/update_tags.sh index b6f1857c6cae..76227afc7b77 100755 --- a/workflows/update_tags.sh +++ b/workflows/update_tags.sh @@ -75,7 +75,6 @@ function get_latest_tag_for_image() { THIS_PREFIX="${THIS_REGISTRY_NAME}/" THIS_IMAGE=$(echo "${THIS_IMAGE}" | sed "s#^${DEFAULT_REGISTRY_REGEX}/##") fi - echo $THIS_PODMAN_TLS $THIS_PREFIX$THIS_IMAGE podman search $THIS_PODMAN_TLS $THIS_PREFIX$THIS_IMAGE --list-tags --format=json | jq -r ' def opt(f): . 
as $in | try f catch $in; From 406e70d9121a363b99365936b63197f40afbfc8c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 9 Jul 2024 14:11:17 -0500 Subject: [PATCH 20/37] CASMINST-6916: Create CPC exporter & combined CMS/CPC export scripts (#5204) * CASMINST-6916: Create script to dump Cray Product Catalog (cherry picked from commit 1cb1a41f8180120a83b14167972d50ecf2d27712) * CASMINST-6916: Create disaster recovery scripts for CMS/CPC exports (cherry picked from commit 9314599f4defca4a0d1ee00e235772605e6f45af) * Minor tweaks (cherry picked from commit c40d78c961bd83d95c33a09d9548281adb26319a) * Unmount s3fs mount before exiting (cherry picked from commit d9203c3d4493e8385547c9478a2cae9d396dd34c) --------- Co-authored-by: Mitch Harding (the weird one) --- scripts/operations/configuration/dump_cpc.sh | 65 +++++++ .../system_recovery/bash_lib/ims.sh | 30 +++ .../cms_minio_export_helper.sh | 106 +++++++++++ .../export_cms_cpc_to_minio.sh | 176 ++++++++++++++++++ 4 files changed, 377 insertions(+) create mode 100755 scripts/operations/configuration/dump_cpc.sh create mode 100644 scripts/operations/system_recovery/bash_lib/ims.sh create mode 100755 scripts/operations/system_recovery/cms_minio_export_helper.sh create mode 100755 scripts/operations/system_recovery/export_cms_cpc_to_minio.sh diff --git a/scripts/operations/configuration/dump_cpc.sh b/scripts/operations/configuration/dump_cpc.sh new file mode 100755 index 000000000000..66bfef7fc29b --- /dev/null +++ b/scripts/operations/configuration/dump_cpc.sh @@ -0,0 +1,65 @@ +#!/bin/bash +# +# MIT License +# +# (C) Copyright 2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +locOfScript=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) +# Inform ShellCheck about the file we are sourcing +# shellcheck source=./bash_lib/common.sh +. "${locOfScript}/bash_lib/common.sh" + +set -uo pipefail + +function usage { + echo "Usage: dump_cpc.sh [output_directory]" >&2 + echo + echo "If no output directory is specified, it defaults to the user's home directory" >&2 +} + +OUTDIR="" + +if [[ $# -eq 1 ]] && [[ $1 == "-h" || $1 == "--help" ]]; then + usage + exit 2 +fi + +if [[ $# -gt 1 ]]; then + usage_err_exit "Too many arguments" +elif [[ $# -eq 0 ]]; then + OUTDIR=~ +elif [[ -z $1 ]]; then + usage_err_exit "Output directory argument may not be blank" +elif [[ ! 
-e $1 ]]; then + usage_err_exit "Specified output directory ($1) does not exist" +elif [[ ! -d $1 ]]; then + usage_err_exit "Specified output directory ($1) exists but is not a directory" +else + OUTDIR="$1" +fi + +OUTFILE=$(run_mktemp "${OUTDIR}/cray-product-catalog-$(date +%Y%m%d%H%M%S)-XXXXXX.yaml") || err_exit +echo "Dumping Cray Product Catalog to '${OUTFILE}'" + +run_cmd kubectl get cm -n services cray-product-catalog -o yaml > "${OUTFILE}" || err_exit "Error writing to '${OUTFILE}'" + +echo "Cray Product Catalog dumped to '${OUTFILE}'" diff --git a/scripts/operations/system_recovery/bash_lib/ims.sh b/scripts/operations/system_recovery/bash_lib/ims.sh new file mode 100644 index 000000000000..de7fcab2f55f --- /dev/null +++ b/scripts/operations/system_recovery/bash_lib/ims.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# +# MIT License +# +# (C) Copyright 2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +# Shared function and variable definitions between IMS backup and restore scripts + +# These variables are not used in this file, but are used by scripts which source this file +#shellcheck disable=SC2034 +IMS_FS_MNT=/opt/cray/pit/ims diff --git a/scripts/operations/system_recovery/cms_minio_export_helper.sh b/scripts/operations/system_recovery/cms_minio_export_helper.sh new file mode 100755 index 000000000000..94952aede0b9 --- /dev/null +++ b/scripts/operations/system_recovery/cms_minio_export_helper.sh @@ -0,0 +1,106 @@ +#!/bin/bash +# +# MIT License +# +# (C) Copyright 2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +locOfScript=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) +CONFIG_SCRIPT_DIR="${locOfScript}/../configuration" +# Inform ShellCheck about the file we are sourcing +# shellcheck source=../configuration/bash_lib/common.sh +. "${CONFIG_SCRIPT_DIR}/bash_lib/common.sh" + +set -uo pipefail + +function usage { + echo "Usage: cms_minio_export_helper.sh " >&2 +} + +if [[ $# -eq 1 ]] && [[ $1 == "-h" || $1 == "--help" ]]; then + usage + exit 2 +fi + +[[ $# -ne 0 ]] || usage_err_exit "Missing required argument" +[[ $# -le 1 ]] || usage_err_exit "Too many arguments" +[[ -n $1 ]] || usage_err_exit "Export area cannot be blank" + +# Set defaults +area="$1" +EXPORT_SCRIPT_ARGS=() +TMPDIR_BASE=~ +BACKUP_EXT=tgz + +case "${area}" in + bos) + EXPORT_SCRIPT_NAME="export_bos_data.sh" + BACKUP_PREFIX=bos-export + ;; + cfs) + EXPORT_SCRIPT_NAME="export_cfs_data.sh" + BACKUP_PREFIX=cfs-export + ;; + cpc) + EXPORT_SCRIPT_NAME="dump_cpc.sh" + BACKUP_PREFIX=cray-product-catalog + BACKUP_EXT=yaml + ;; + ims) + EXPORT_SCRIPT_NAME="export_ims_data.py" + # Unlike the other export scripts, we use an additional argument with the IMS exporter + EXPORT_SCRIPT_ARGS=("--no-tar") + # We don't set backup prefix and ext for IMS, because it is handled differently + + # IMS uses a different temp location for its backup, because of how large it is + # Inform ShellCheck about the file we are sourcing + # shellcheck source=./bash_lib/ims.sh + . "${locOfScript}/bash_lib/ims.sh" + + [[ -e ${IMS_FS_MNT} ]] || err_exit "Directory does not exist: '${IMS_FS_MNT}'" + [[ -d ${IMS_FS_MNT} ]] || err_exit "Exists but is not a directory: '${IMS_FS_MNT}'" + TMPDIR_BASE="${IMS_FS_MNT}" + ;; + vcs) + EXPORT_SCRIPT_NAME="backup_vcs.sh" + BACKUP_PREFIX=gitea-vcs + ;; + *) + usage_err_exit "Unknown export area: '${area}'" + ;; +esac + +TMPDIR=$(run_mktemp -d "${TMPDIR_BASE}/export-${area}.XXX") || err_exit +run_cmd "${CONFIG_SCRIPT_DIR}/${EXPORT_SCRIPT_NAME}" "${EXPORT_SCRIPT_ARGS[@]}" "${TMPDIR}" + +# Copying the data over to minio is different for IMS versus the others +if [[ ${area} == ims ]]; then + run_cmd aws s3 sync "${TMPDIR}" s3://cms/ims --endpoint-url http://localhost:8000 + # We want to fail the script if this fails, because it will leave a lot of data on disk otherwise + run_cmd rm -rf "${TMPDIR}" +else + run_cmd aws s3 mv "${TMPDIR}/${BACKUP_PREFIX}"*".${BACKUP_EXT}" s3://cms --endpoint-url http://localhost:8000 + # Non-IMS backups are much smaller, plus we are using s3 mv, so it's not the end of the world if + # we don't clean up the temporary directory + rmdir "${TMPDIR}" || echo "WARNING: Unable to remove directory '${TMPDIR}'" >&2 +fi + +echo "${area} export completed successfully" diff --git a/scripts/operations/system_recovery/export_cms_cpc_to_minio.sh b/scripts/operations/system_recovery/export_cms_cpc_to_minio.sh new file mode 100755 index 000000000000..3788f4ce50f6 --- /dev/null +++ b/scripts/operations/system_recovery/export_cms_cpc_to_minio.sh @@ -0,0 +1,176 @@ +#!/bin/bash +# +# MIT License +# +# (C) Copyright 2024 Hewlett Packard Enterprise Development LP +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the 
Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# + +locOfScript=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) +CONFIG_SCRIPT_DIR="${locOfScript}/../configuration" +# Inform ShellCheck about the file we are sourcing +# shellcheck source=../configuration/bash_lib/common.sh +. "${CONFIG_SCRIPT_DIR}/bash_lib/common.sh" + +CMS_EXPORT_SCRIPT="${locOfScript}/cms_minio_export_helper.sh" + +set -uo pipefail + +function usage { + echo "Usage: export_cms_cpc.sh [bos] [cfs] [cpc] [ims] [vcs]" >&2 + echo >&2 + echo "If no areas are specified, all areas are exported." >&2 + echo "Otherwise, only the specified areas are exported." >&2 +} + +if [[ $# -eq 1 ]] && [[ $1 == "-h" || $1 == "--help" ]]; then + usage + exit 2 +fi + +backup_areas=() +backup_pids=() + +function add_area { + local a IMS_FS_MNT + [[ $1 =~ ^(bos|cfs|cpc|ims|vcs)$ ]] || usage_err_exit "Unrecognized export area '$1'" + for a in "${backup_areas[@]}"; do + # no need to add it if we already have it + [[ $a == "$1" ]] && return + done + backup_areas+=("$1") + [[ $1 == ims ]] || return + # Since we're exporting IMS, make sure /opt/cray/pit/ims exists + # Inform ShellCheck about the file we are sourcing + # shellcheck source=./bash_lib/ims.sh + . "${locOfScript}/bash_lib/ims.sh" + [[ -e ${IMS_FS_MNT} ]] || err_exit "Directory does not exist: '${IMS_FS_MNT}'" + [[ -d ${IMS_FS_MNT} ]] || err_exit "Exists but is not a directory: '${IMS_FS_MNT}'" +} + +if [[ $# -eq 0 ]]; then + add_area bos + add_area cfs + add_area cpc + add_area ims + add_area vcs +else + while [[ $# -gt 0 ]]; do + add_area "$1" + shift + done +fi + +# Create mount point for CMS minio s3fs +CMS_MINIO_MNT=$(run_mktemp -d ~/.export_cms_cpc_minio_mnt.XXX) || err_exit + +echo "Initializing CMS bucket in minio (if needed)" +run_cmd "${locOfScript}/setup_cms_minio_mount.sh" --rw --init "${CMS_MINIO_MNT}" + +LOG_REL_DIR="logs/exports/$(date +%Y%m%d%H%M%S)" +LOG_DIR="${CMS_MINIO_MNT}/${LOG_REL_DIR}" +echo "Create log directory in minio://cms/${LOG_REL_DIR}" +run_cmd mkdir -p "${LOG_DIR}" + +function launch_area_export { + local epid logbase area + area="$1" + logbase="${area}.log" + echo "$(date) Starting ${area} export (log: minio://cms/${LOG_REL_DIR}/${logbase})" + nohup "${CMS_EXPORT_SCRIPT}" "${area}" > "${LOG_DIR}/${logbase}" 2>&1 & + epid=$! 
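+  # Track the PID of the background export so the wait loop below can
+  # monitor it and report success or failure for this area.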
+ echo "${area} export PID is ${epid}" + backup_pids+=("${epid}") +} + +for area in "${backup_areas[@]}"; do + launch_area_export "${area}" +done + +echo "Waiting for exports to complete" + +errors=0 +running=${#backup_pids[@]} +last_print=$SECONDS +need_newline="" +while [[ ${running} -gt 0 ]]; do + sleep 1 + old_running=${running} + running=0 + still_running=() + i=0 + while [[ $i -lt ${#backup_pids[@]} ]]; do + bpid=${backup_pids[$i]} + area=${backup_areas[$i]} + + # If the PID is 0, it means we have previously seen that this + # backup completed and checked it + if [[ ${bpid} == 0 ]]; then + let i+=1 + continue + fi + + # Don't let the scary kill fool you -- with signal 0, this just checks + # if the process is still running -- no killing involved! + if kill -0 "${bpid}" > /dev/null 2>&1; then + let i+=1 + let running+=1 + still_running+=("${area} (${bpid})") + continue + fi + + # The process seems to be done, so let's get its exit code + wait "${bpid}" + rc=$? + # Mark that it is done + backup_pids[$i]=0 + let i+=1 + [[ -n ${need_newline} ]] && echo + last_print=$SECONDS + need_newline="" + if [[ $rc -eq 0 ]]; then + echo "$(date) ${area} export (PID ${bpid}) completed successfully" + else + echo "$(date) ${area} export (PID ${bpid}) FAILED with exit code $rc (logfile: ${LOG_DIR}/${area}.log)" + let errors+=1 + fi + done + if [[ ${running} -gt 0 && ${running} -ne ${old_running} ]]; then + [[ -n ${need_newline} ]] && echo + last_print=$SECONDS + need_newline="" + echo "Still running: ${still_running[*]}" + continue + fi + # Print some progress characters while waiting, occasionally + [[ $((SECONDS - last_print)) -ge 180 ]] || continue + printf . + need_newline=y + last_print=$SECONDS +done + +umount "${CMS_MINIO_MNT}" || echo "WARNING: Unable to unmount '${CMS_MINIO_MNT}'" >&2 + +if [[ $errors -ne 0 ]]; then + err_exit "${errors} of the exports failed. See individual log files for details" + exit 1 +fi + +echo "All exports completed successfully" From aca5aa164405303323468bc2fbf1f417972b3a55 Mon Sep 17 00:00:00 2001 From: Lindsay Eliasen <87664908+leliasen-hpe@users.noreply.github.com> Date: Tue, 9 Jul 2024 14:13:35 -0500 Subject: [PATCH 21/37] CASMPET-6904 edit prerequisites.sh for CSM 1.6 certmanager upgrade (#5206) * CASMPET-6904 edit prerequisites.sh for CSM 1.6 certmanager upgrade * `shfmt` --------- Co-authored-by: Russell Bunch --- upgrade/scripts/upgrade/prerequisites.sh | 49 ++++++++++-------------- 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/upgrade/scripts/upgrade/prerequisites.sh b/upgrade/scripts/upgrade/prerequisites.sh index dad2997fec42..ed2a77f7a448 100755 --- a/upgrade/scripts/upgrade/prerequisites.sh +++ b/upgrade/scripts/upgrade/prerequisites.sh @@ -613,20 +613,25 @@ else echo "====> ${state_name} has been completed" | tee -a "${LOG_FILE}" fi +# upgrade all charts dependent on cray-certmanager chart +# it is neccessary to upgrade these before upgrade +do_upgrade_csm_chart cray-istio platform.yaml +do_upgrade_csm_chart cray-keycloak platform.yaml +do_upgrade_csm_chart cray-oauth2-proxies platform.yaml +do_upgrade_csm_chart spire sysmgmt.yaml +do_upgrade_csm_chart cray-spire sysmgmt.yaml +do_upgrade_csm_chart cray-tapms-crd sysmgmt.yaml +do_upgrade_csm_chart cray-tapms-operator sysmgmt.yaml + # Note for csm 1.5/k8s 1.22 only if ANY chart depends on /v1 cert-manager api # usage it *MUST* come after this or prerequisites will fail on an upgrade. 
# Helper functions for cert-manager upgrade -has_cm_init() { - ns="${1?no namespace provided}" - helm list -n "${ns}" --filter cray-certmanager-init | grep cray-certmanager-init > /dev/null 2>&1 -} - has_craycm() { ns="${1?no namespace provided}" helm list -n "${ns}" --filter 'cray-certmanager$' | grep cray-certmanager > /dev/null 2>&1 } -state_name="UPGRADE_CERTMANAGER_0141_CHART" +state_name="UPGRADE_CERTMANAGER_155_CHART" state_recorded=$(is_state_recorded "${state_name}" "$(hostname)") if [[ ${state_recorded} == "0" && $(hostname) == "${PRIMARY_NODE}" ]]; then echo "====> ${state_name} ..." | tee -a "${LOG_FILE}" @@ -636,7 +641,7 @@ if [[ ${state_recorded} == "0" && $(hostname) == "${PRIMARY_NODE}" ]]; then # work due to helm hooks. Making this work on both isn't really worth the # time so just constrain this block of logic to 0.14.1 where we know its # needed. - gate="0.14.1" + gate="1.5.5" found=$(helm list -n cert-manager --filter 'cray-certmanager$' | awk '/deployed/ {print $10}') needs_upgrade=0 @@ -647,20 +652,19 @@ if [[ ${state_recorded} == "0" && $(hostname) == "${PRIMARY_NODE}" ]]; then else printf "note: cert-manager helm chart version %s\n" "${found}" >&2 - # We might be rerunning from a pre 1.5.x install and there is no + # We might be rerunning from a pre 1.6.x install and there is no # cert-manager installed due to a prior removal if [ "${found}" = "" ]; then printf "note: no helm install appears to exist for cert-manager, likely this state is being run again\n" >&2 ((needs_upgrade += 1)) else - printf "note: no cert-manager upgrade steps needed, cert-manager 0.14.1 is not installed\n" >&2 + printf "note: no cert-manager upgrade steps needed, cert-manager 1.5.5 is not installed\n" >&2 fi fi - # Only run if we need to and detected not 0.14.1 or "" + # Only run if we need to and detected not 1.12.9 or "" if [ "${needs_upgrade}" -gt 0 ]; then cmns="cert-manager" - cminitns="cert-manager-init" backup_secret="cm-restore-data" @@ -671,10 +675,6 @@ if [[ ${state_recorded} == "0" && $(hostname) == "${PRIMARY_NODE}" ]]; then ((needs_backup += 1)) fi - if has_cm_init ${cminitns}; then - ((needs_backup += 1)) - fi - # Ok so the gist of this "backup" is we back up all the cert-manager data as # guided by them. The secret we use for this is only kept around until this # prereq state completes. @@ -688,35 +688,26 @@ if [[ ${state_recorded} == "0" && $(hostname) == "${PRIMARY_NODE}" ]]; then fi fi - # Only remove these charts if installed + # Only remove cray-certmanager if installed if has_craycm ${cmns}; then helm uninstall -n "${cmns}" cray-certmanager fi - if has_cm_init ${cminitns}; then - helm uninstall -n "${cminitns}" cray-certmanager-init - fi - # Note: These should *never* fail as we depend on helm uninstall doing # its job, but if it didn't exit early here as something is amiss. cm=1 - cminit=1 if ! helm list -n "${cmns}" --filter 'cray-certmanager$' | grep cray-certmanager > /dev/null 2>&1; then cm=0 fi - if ! 
helm list -n "${cminitns}" --filter cray-certmanager-init | grep cray-certmanager-init > /dev/null; then - cminit=0 - fi - - if [ "${cm}" = "1" ] || [ "${cminit}" = "1" ]; then - printf "fatal: helm uninstall did not remove expected charts, cert-manager %s cert-manager-init %s\n" "${cm}" "${cminit}" >&2 + if [ "${cm}" = "1" ]; then + printf "fatal: helm uninstall did not remove expected chart cert-manager %s\n" "${cm}" >&2 exit 1 fi # Ensure the cert-manager namespace is deleted in a case of both helm charts - # removed but there might be detritus leftover in the namespace. + # removed but there might be detritous leftover in the namespace. kubectl delete namespace "${cmns}" || : tmp_manifest=/tmp/certmanager-tmp-manifest.yaml @@ -747,7 +738,7 @@ EOF done platform="${CSM_MANIFESTS_DIR}/platform.yaml" - for chart in cray-drydock cray-certmanager cray-certmanager-issuers; do + for chart in cray-certmanager cray-certmanager-issuers; do printf " -\n" >> "${tmp_manifest}" yq r "${platform}" 'spec.charts.(name=='${chart}')' | sed 's/^/ /' >> "${tmp_manifest}" done From 1160121e2b614813c7adf5ad436815367b87121f Mon Sep 17 00:00:00 2001 From: Mitch Harding Date: Tue, 9 Jul 2024 15:17:25 -0400 Subject: [PATCH 22/37] CASMTRIAGE-7122: ncn-upgrade-master-nodes.sh: Add retries of docs-csm RPM install (#5208) --- .../upgrade/ncn-upgrade-master-nodes.sh | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/upgrade/scripts/upgrade/ncn-upgrade-master-nodes.sh b/upgrade/scripts/upgrade/ncn-upgrade-master-nodes.sh index afb872d87bde..65518654abbf 100755 --- a/upgrade/scripts/upgrade/ncn-upgrade-master-nodes.sh +++ b/upgrade/scripts/upgrade/ncn-upgrade-master-nodes.sh @@ -187,9 +187,23 @@ state_recorded=$(is_state_recorded "${state_name}" ${target_ncn}) if [[ $state_recorded == "0" ]]; then echo "====> ${state_name} ..." { - record_state "${state_name}" ${target_ncn} scp -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null /root/docs-csm-latest.noarch.rpm $target_ncn:/root/docs-csm-latest.noarch.rpm - ssh $target_ncn "rpm --force -Uvh /root/docs-csm-latest.noarch.rpm" + # CASMTRIAGE-7122: This RPM install can fail if it happens while CFS is installing RPMs + # Therefore, we retry a limited number of times before giving up + attempt=0 + while [[ true ]]; do + if [[ ${attempt} -gt 0 ]]; then + # Wait briefly before trying again + sleep 5 + fi + if [[ ${attempt} -lt 12 ]]; then + let attempt+=1 + ssh $target_ncn "rpm --force -Uvh /root/docs-csm-latest.noarch.rpm" && break || continue + fi + # Final attempt. 
The lack of the || continue means that this will cause the script to + # fail (since it runs with set -e) if the command fails + ssh $target_ncn "rpm --force -Uvh /root/docs-csm-latest.noarch.rpm" && break + done } >> ${LOG_FILE} 2>&1 record_state "${state_name}" ${target_ncn} else From f923c3f4f78a6a96d441bdc127c235e2629bb0a7 Mon Sep 17 00:00:00 2001 From: Mitch Harding Date: Tue, 9 Jul 2024 15:23:54 -0400 Subject: [PATCH 23/37] CASMTRIAGE-7131: Correct BOS CLI commands (#5216) * CASMTRIAGE-7131: Correct BOS CLI commands; minor linting * Placate altered linter rules --- .../boot_orchestration/BOS_Workflows.md | 97 ++++++++++++------- operations/boot_orchestration/Cheatsheet.md | 16 +-- ...Template_to_Boot_Compute_Nodes_with_CPS.md | 2 +- ...emplate_to_Boot_Compute_Nodes_with_SBPS.md | 2 +- .../Manage_a_BOS_Session.md | 2 +- 5 files changed, 71 insertions(+), 48 deletions(-) diff --git a/operations/boot_orchestration/BOS_Workflows.md b/operations/boot_orchestration/BOS_Workflows.md index 26791deca819..713313e02b1e 100644 --- a/operations/boot_orchestration/BOS_Workflows.md +++ b/operations/boot_orchestration/BOS_Workflows.md @@ -1,25 +1,28 @@ # BOS Workflows The following workflows present a high-level overview of common Boot Orchestration Service \(BOS\) operations. -These workflows depict how services interact with each other when booting, configuring, or shutting down nodes. They also help provide a quicker and deeper understanding of how the system functions. +These workflows depict how services interact with each other when booting, configuring, or shutting down nodes. +They also help provide a quicker and deeper understanding of how the system functions. * [Terminology](#terminology) * [Workflows] - * [Boot nodes](#boot-nodes) - * [Reboot nodes](#reboot-nodes) - * [Power off nodes](#power-off-nodes) + * [Boot nodes](#boot-nodes) + * [Reboot nodes](#reboot-nodes) + * [Power off nodes](#power-off-nodes) ## Terminology The following are mentioned in the workflows: -* Boot Orchestration Service \(BOS\) is responsible for booting, configuring, and shutting down collections of nodes. The Boot Orchestration Service has the following components: - * A BOS session template is a collection of one or more boot sets. A boot set defines a collection of nodes and the information about the boot artifacts and parameters. - Session templates also include information on which [Configuration Framework Service (CFS)](../../glossary.md#configuration-framework-service-cfs) configuration should be applied. - * BOS sessions provide a way to apply a template across a group of nodes and monitor the progress of those nodes as they move toward their desired state. - * BOS operators interact with other services to perform actions on nodes, moving them toward their desired state. -* [Cray Advanced Platform Monitoring and Control (CAPMC)](../../glossary.md#cray-advanced-platform-monitoring-and-control-capmc) service provides system-level power control for nodes in the system. - CAPMC interfaces directly with the Redfish APIs to the controller infrastructure to effect power and environmental changes on the system. +* Boot Orchestration Service \(BOS\) is responsible for booting, configuring, and shutting down collections of nodes. + The Boot Orchestration Service has the following components: + * A BOS session template is a collection of one or more boot sets. A boot set defines a collection of nodes and the information about the boot artifacts and parameters. 
+ Session templates also include information on which [Configuration Framework Service (CFS)](../../glossary.md#configuration-framework-service-cfs) configuration should + be applied. + * BOS sessions provide a way to apply a template across a group of nodes and monitor the progress of those nodes as they move toward their desired state. + * BOS operators interact with other services to perform actions on nodes, moving them toward their desired state. +* [Cray Advanced Platform Monitoring and Control (CAPMC)](../../glossary.md#cray-advanced-platform-monitoring-and-control-capmc) service provides system-level power control + for nodes in the system. CAPMC interfaces directly with the Redfish APIs to the controller infrastructure to effect power and environmental changes on the system. * [Hardware State Manager (HSM)](../../glossary.md#hardware-state-manager-hsm) tracks the state of each node and its group and role associations. * [Boot Script Service (BSS)](../../glossary.md#boot-script-service-bss) stores per-node information about the iPXE boot script. When booting or rebooting, nodes consult BSS for boot artifacts \(kernel, `initrd`, image root\) and boot parameters. @@ -71,28 +74,32 @@ The following workflows are included in this section: 1. **Administrator creates a BOS session template** - A session template is a collection of data specifying a group of nodes, as well as the boot artifacts and configuration that should be applied to them. A session template can be created from a JSON structure. It returns a Session Template ID if successful. + A session template is a collection of data specifying a group of nodes, as well as the boot artifacts and configuration that should be applied to them. + A session template can be created from a JSON structure. It returns a session template ID if successful. - See [Manage a Session Template](Manage_a_Session_Template.md) for more information. + See [Manage a session template](Manage_a_Session_Template.md) for more information. 1. **Administrator creates a session** - Create a session to perform the operation specified in the operation request parameter on the boot set defined in the session template. For this use case, Administrator creates a session with operation as Boot and specifies the session template ID. + Create a session to perform the operation specified in the operation request parameter on the boot set defined in the session template. For this use case, + the administrator creates a session with operation as `boot` and specifies the session template ID. (`ncn-mw#`) ```bash - cray bos v2 sessions create --template-name SESSIONTEMPLATE_NAME --operation Boot + cray bos v2 sessions create --template-name SESSIONTEMPLATE_NAME --operation boot ``` 1. **Session setup operator** The creation of a session causes the session-setup operator to set a desired state on all components listed in the session template. - This includes pulling files from S3 to determine boot artifacts like kernel, `initrd`, and root file system. The session setup operator also enables the relevant components at this time. + This includes pulling files from S3 to determine boot artifacts like kernel, `initrd`, and root file system. The session setup operator also enables the relevant + components at this time. 1. **Status operator (powering-on)** - The status operator will detect the enabled components and assign them a phase. This involves checking the current state of the node, including communicating with HSM to determine the current power status of the node. 
+ The status operator will detect the enabled components and assign them a phase. This involves checking the current state of the node, including communicating with HSM + to determine the current power status of the node. In this example of booting nodes, the first phase is `powering-on`. If queried at this point, the nodes will have a status of `power-on-pending`. For more on component phase and status, see [Component Status](Component_Status.md) @@ -103,7 +110,8 @@ The following workflows are included in this section: If configuration is enabled for the node, the power-on operator will also call CFS to set the desired configuration and disable the node with CFS. The node must be disabled within CFS so that CFS does not try to configure node until it has booted. The power-on operator then calls CAPMC to power-on the node. - Lastly, the power-on operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of `power-on-called`. + Lastly, the power-on operator will update the state of the node in BOS, including setting the last action. If queried at this point, + the nodes will have a status of `power-on-called`. 1. **CAPMC boots nodes** @@ -120,7 +128,8 @@ The following workflows are included in this section: 1. **Status operator (configuring)** The status operator monitors a node's power state until HSM reports that the power state is on. - When the power state for a node is on, the status operator will either set the phase to `configuring` if CFS configuration is required or it will clear the current phase if the node is in its final state. + When the power state for a node is on, the status operator will either set the phase to `configuring` if CFS configuration is required or it will clear the current phase + if the node is in its final state. 1. **CFS applies configuration** @@ -173,28 +182,32 @@ The following workflows are included in this section: 1. **Administrator creates a BOS session template** - A session template is a collection of data specifying a group of nodes, as well as the boot artifacts and configuration that should be applied to them. A session template can be created from a JSON structure. It returns a Session Template ID if successful. + A session template is a collection of data specifying a group of nodes, as well as the boot artifacts and configuration that should be applied to them. + A session template can be created from a JSON structure. It returns a session template ID if successful. - See [Manage a Session Template](Manage_a_Session_Template.md) for more information. + See [Manage a session template](Manage_a_Session_Template.md) for more information. 1. **Administrator creates a session** - Create a session to perform the operation specified in the operation request parameter on the boot set defined in the session template. For this use case, the administrator creates a session with operation as Boot and specifies the session template ID. + Create a session to perform the operation specified in the operation request parameter on the boot set defined in the session template. For this use case, + the administrator creates a session with operation as `reboot` and specifies the session template ID. (`ncn-mw#`) ```bash - cray bos v2 sessions create --template-name SESSIONTEMPLATE_NAME --operation Reboot + cray bos v2 sessions create --template-name SESSIONTEMPLATE_NAME --operation reboot ``` 1. 
**Session setup operator** The creation of a session causes the session-setup operator to set a desired state on all components listed in the session template. - This includes pulling files from S3 to determine boot artifacts like kernel, `initrd`, and root file system. The session setup operator also enables the relevant components at this time. + This includes pulling files from S3 to determine boot artifacts like kernel, `initrd`, and root file system. The session setup operator also enables the relevant + components at this time. 1. **Status operator (powering-off)** - The status operator will detect the enabled components and assign them a phase. This involves checking the current state of the node, including communicating with HSM to determine the current power status of the node. + The status operator will detect the enabled components and assign them a phase. This involves checking the current state of the node, including communicating with + HSM to determine the current power status of the node. In this example of rebooting nodes, the first phase is `powering-off`. If queried at this point, the nodes will have a status of `power-off-pending`. For more on component phase and status, see [Component Status](Component_Status.md) @@ -202,12 +215,14 @@ The following workflows are included in this section: 1. **Graceful-power-off operator** The power-off operator will detect nodes with a `power-off-pending` status, calls CAPMC to power-off the node. - Then, the power-off operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of `power-off-gracefully-called`. + Then, the power-off operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of + `power-off-gracefully-called`. 1. **Forceful-power-off operator** If powering-off is taking too long, the forceful-power-off will take over. It also calls CAPMC to power-off the node, but with the addition of the forceful flag. - Then, the power-off operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of `power-off-forcefully-called`. + Then, the power-off operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of + `power-off-forcefully-called`. 1. **CAPMC powers off nodes** @@ -216,7 +231,8 @@ The following workflows are included in this section: 1. **Status operator (powering-on)** The status operator monitors a node's power state until HSM reports that the power state is off. - When the power state for a node is off, the status operator will set the phase to `powering-on`. If queried at this point, the nodes will have a status of `power-on-pending`. + When the power state for a node is off, the status operator will set the phase to `powering-on`. If queried at this point, the nodes will have a status of + `power-on-pending`. 1. **Power-on operator** @@ -224,7 +240,8 @@ The following workflows are included in this section: If configuration is enabled for the node, the power-on operator will also call CFS to set the desired configuration and disable the node with CFS. The node must be disabled within CFS so that CFS does not try to configure node until it has booted. The power-on operator then calls CAPMC to power-on the node. 
- Lastly, the power-on operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of `power-on-called`. + Lastly, the power-on operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of + `power-on-called`. 1. **CAPMC boots nodes** @@ -241,7 +258,8 @@ The following workflows are included in this section: 1. **Status operator (configuring)** The status operator monitors a node's power state until HSM reports that the power state is on. - When the power state for a node is on, the status operator will either set the phase to `configuring` if CFS configuration is required or it will clear the current phase if the node is in its final state. + When the power state for a node is on, the status operator will either set the phase to `configuring` if CFS configuration is required or it will clear the current + phase if the node is in its final state. 1. **CFS applies configuration** @@ -268,18 +286,20 @@ The following workflows are included in this section: 1. **Administrator creates a BOS session template** - A session template is a collection of data specifying a group of nodes, as well as the boot artifacts and configuration that should be applied to them. A session template can be created from a JSON structure. It returns a Session Template ID if successful. + A session template is a collection of data specifying a group of nodes, as well as the boot artifacts and configuration that should be applied to them. A session + template can be created from a JSON structure. It returns a session template ID if successful. - See [Manage a Session Template](Manage_a_Session_Template.md) for more information. + See [Manage a session template](Manage_a_Session_Template.md) for more information. 1. **Administrator creates a session** - Create a session to perform the operation specified in the operation request parameter on the boot set defined in the session template. For this use case, the administrator creates a session with operation as Boot and specifies the session template ID. + Create a session to perform the operation specified in the operation request parameter on the boot set defined in the session template. For this use case, + the administrator creates a session with operation as `shutdown` and specifies the session template ID. (`ncn-mw#`) ```bash - cray bos v2 sessions create --template-name SESSIONTEMPLATE_NAME --operation Reboot + cray bos v2 sessions create --template-name SESSIONTEMPLATE_NAME --operation shutdown ``` 1. **Session setup operator** @@ -290,7 +310,8 @@ The following workflows are included in this section: 1. **Status operator (powering-off)** - The status operator will detect the enabled components and assign them a phase. This involves checking the current state of the node, including communicating with HSM to determine the current power status of the node. + The status operator will detect the enabled components and assign them a phase. This involves checking the current state of the node, including communicating + with HSM to determine the current power status of the node. In this example of booting nodes, the first phase is `powering-off`. If queried at this point, the nodes will have a status of `power-off-pending`. For more on component phase and status, see [Component Status](Component_Status.md) @@ -298,12 +319,14 @@ The following workflows are included in this section: 1. 
**Graceful-power-off operator** The power-off operator will detect nodes with a `power-off-pending` status, calls CAPMC to power-off the node. - Then, the power-off operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of `power-off-gracefully-called`. + Then, the power-off operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of + `power-off-gracefully-called`. 1. **Forceful-power-off operator** If powering-off is taking too long, the forceful-power-off will take over. It also calls CAPMC to power-off the node, but with the addition of the forceful flag. - Then, the power-off operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of `power-off-forcefully-called`. + Then, the power-off operator will update the state of the node in BOS, including setting the last action. If queried at this point, the nodes will have a status of + `power-off-forcefully-called`. 1. **CAPMC powers off nodes** diff --git a/operations/boot_orchestration/Cheatsheet.md b/operations/boot_orchestration/Cheatsheet.md index 5eb422041523..7a658c429bb5 100644 --- a/operations/boot_orchestration/Cheatsheet.md +++ b/operations/boot_orchestration/Cheatsheet.md @@ -9,25 +9,25 @@ To find the API versions of any commands listed, add `-vvv` to the end of the CL * (`ncn-mw#`) Boot all nodes in a template: ```bash - cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation Boot + cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation boot ``` * (`ncn-mw#`) Reboot all nodes in a template: ```bash - cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation Reboot + cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation reboot ``` * (`ncn-mw#`) Shutdown all nodes in a template: ```bash - cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation Shutdown + cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation shutdown ``` * (`ncn-mw#`) Stage a reboot for all nodes in a template: ```bash - cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation Reboot --staged True + cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation reboot --staged True ``` ## Single node commands @@ -35,25 +35,25 @@ To find the API versions of any commands listed, add `-vvv` to the end of the CL * (`ncn-mw#`) Boot a single node: ```bash - cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation Boot --limit + cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation boot --limit ``` * (`ncn-mw#`) Reboot a single node: ```bash - cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation Reboot --limit + cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation reboot --limit ``` * (`ncn-mw#`) Shutdown a single node: ```bash - cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation Shutdown --limit + cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation shutdown --limit ``` * (`ncn-mw#`) Stage a reboot for a single node: ```bash - cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME --operation Reboot --staged True --limit + cray bos v2 sessions create --template-name SESSION_TEMPLATE_NAME 
--operation reboot --staged True --limit ``` * (`ncn-mw#`) Monitor the overall boot progress of a single node: diff --git a/operations/boot_orchestration/Create_a_Session_Template_to_Boot_Compute_Nodes_with_CPS.md b/operations/boot_orchestration/Create_a_Session_Template_to_Boot_Compute_Nodes_with_CPS.md index 193764c62b28..3721f481d525 100644 --- a/operations/boot_orchestration/Create_a_Session_Template_to_Boot_Compute_Nodes_with_CPS.md +++ b/operations/boot_orchestration/Create_a_Session_Template_to_Boot_Compute_Nodes_with_CPS.md @@ -125,7 +125,7 @@ Refer to [Manage a Session Template](Manage_a_Session_Template.md) for more info (`ncn-mw#`) The new CPS-based session template can be used when creating a BOS session. The following is an example of creating a reboot session using the CLI: ```bash -cray bos v2 sessions create --template-name cps_rootfs_template --operation Reboot +cray bos v2 sessions create --template-name cps_rootfs_template --operation reboot ``` ## Appendix: `root=` kernel parameter diff --git a/operations/boot_orchestration/Create_a_Session_Template_to_Boot_Compute_Nodes_with_SBPS.md b/operations/boot_orchestration/Create_a_Session_Template_to_Boot_Compute_Nodes_with_SBPS.md index fba11720c208..adcb1f2554bb 100644 --- a/operations/boot_orchestration/Create_a_Session_Template_to_Boot_Compute_Nodes_with_SBPS.md +++ b/operations/boot_orchestration/Create_a_Session_Template_to_Boot_Compute_Nodes_with_SBPS.md @@ -165,7 +165,7 @@ Refer to [Manage a Session Template](Manage_a_Session_Template.md) for more info (`ncn-mw#`) The new CPS-based session template can be used when creating a BOS session. The following is an example of creating a reboot session using the CLI: ```bash -cray bos v2 sessions create --template-name cps_rootfs_template --operation Reboot +cray bos v2 sessions create --template-name cps_rootfs_template --operation reboot ``` ## Appendix: `root=` kernel parameter diff --git a/operations/boot_orchestration/Manage_a_BOS_Session.md b/operations/boot_orchestration/Manage_a_BOS_Session.md index 4a027b1be85d..08c43e432377 100644 --- a/operations/boot_orchestration/Manage_a_BOS_Session.md +++ b/operations/boot_orchestration/Manage_a_BOS_Session.md @@ -19,7 +19,7 @@ Creating a new BOS session requires the following command-line options: (`ncn-mw#`): The following is a boot operation: ```bash -cray bos v2 sessions create --template-name --operation Boot --format json +cray bos v2 sessions create --template-name --operation boot --format json ``` Example output: From 9ca86fe927646e5c9e8ddf4a91b3bf98cbebe334 Mon Sep 17 00:00:00 2001 From: Chris Spiller <86013738+spillerc-hpe@users.noreply.github.com> Date: Tue, 9 Jul 2024 20:29:16 +0100 Subject: [PATCH 24/37] CASMNET-2179 - Document kea arm64 and node-specific boot file functionality (#5214) * CASMNET-2179 - Document kea arm64 and node-specific boot file functionality * Update operations/network/dhcp/Customize_boot_file.md Signed-off-by: Nathan Rockershousen * Syntax Highlighting Signed-off-by: Russell Bunch --------- Signed-off-by: Nathan Rockershousen Signed-off-by: Russell Bunch Co-authored-by: Nathan Rockershousen Co-authored-by: Russell Bunch --- operations/README.md | 1 + .../network/dhcp/Customize_boot_file.md | 154 ++++++++++++++++++ 2 files changed, 155 insertions(+) create mode 100644 operations/network/dhcp/Customize_boot_file.md diff --git a/operations/README.md b/operations/README.md index dbaef7073ab7..8aacc77939ba 100644 --- a/operations/README.md +++ b/operations/README.md @@ -654,6 +654,7 @@ The DHCP 
service on the HPE Cray EX system uses the Internet Systems Consortium

 - [DHCP](network/dhcp/DHCP.md)
 - [Troubleshoot DHCP Issues](network/dhcp/Troubleshoot_DHCP_Issues.md)
+- [DHCP boot file customization](network/dhcp/Customize_boot_file.md)

 ### Domain Name Service (DNS)

diff --git a/operations/network/dhcp/Customize_boot_file.md b/operations/network/dhcp/Customize_boot_file.md
new file mode 100644
index 000000000000..eaee15f64bfc
--- /dev/null
+++ b/operations/network/dhcp/Customize_boot_file.md
@@ -0,0 +1,154 @@
+# DHCP boot file customization
+
+* [DHCP boot file customization](#dhcp-boot-file-customization)
+  * [Background](#background)
+  * [Override the boot file name](#override-the-boot-file-name)
+  * [Verify the node DHCP configuration has been updated](#verify-the-node-dhcp-configuration-has-been-updated)
+  * [Reset the boot file name to default](#reset-the-boot-file-name-to-default)
+
+## Background
+
+The `cray-dhcp-kea` service is configured to send a CPU architecture appropriate boot file based on the value received in
+the client system architecture field (option 93) of the incoming DHCP request. By default, `cray-dhcp-kea` will send
+the following in the DHCP boot file name field (option 67) of the DHCP response.
+
+| Option 93 value         | Filename         |
+|-------------------------|------------------|
+| `0x7` - x64 UEFI        | `ipxe.efi`       |
+| `0xb` - ARM 64-bit UEFI | `ipxe.arm64.efi` |
+
+It may be desirable to use a different boot file to the default one for testing or debugging purposes. This document
+describes how the boot file name may be overridden on a per-node basis.
+
+## Override the boot file name
+
+1. (`ncn#`) Determine the HSM `ethernetInterfaces` record for the node.
+
+   ```bash
+   cray hsm inventory ethernetInterfaces list --component-id x3000c0s17b4n0
+   ```
+
+   Example output:
+
+   ```toml
+   [[results]]
+   ID = "b42e99dfec47"
+   Description = ""
+   MACAddress = "b4:2e:99:df:ec:47"
+   LastUpdate = "2024-07-01T11:31:24.942557Z"
+   ComponentID = "x3000c0s17b4n0"
+   Type = "Node"
+     [[results.IPAddresses]]
+     IPAddress = "10.106.0.15"
+   ```
+
+1. (`ncn#`) Set the desired boot file name by adding the `ipxe` option to the `Description` field of the HSM `ethernetInterfaces` record.
+
+   This example will set the boot file name to `ipxe.test`.
+
+   ```bash
+   cray hsm inventory ethernetInterfaces update b42e99dfec47 --description="ipxe=ipxe.test"
+   ```
+
+   Example output:
+
+   ```toml
+   ID = "b42e99dfec47"
+   Description = "ipxe=ipxe.test"
+   MACAddress = "b4:2e:99:df:ec:47"
+   LastUpdate = "2024-04-25T06:28:34.825112Z"
+   ComponentID = "x3000c0s17b4n0"
+   Type = "Node"
+   [[IPAddresses]]
+   IPAddress = "10.106.0.15"
+   ```
+
+## Verify the node DHCP configuration has been updated
+
+1. (`ncn#`) Retrieve a token.
+
+   ```bash
+   export TOKEN=$(curl -s -k -S -d grant_type=client_credentials -d client_id=admin-client \
+   -d client_secret=`kubectl get secrets admin-client-auth -o jsonpath='{.data.client-secret}' | base64 -d` \
+   https://api-gw-service-nmn.local/keycloak/realms/shasta/protocol/openid-connect/token | jq -r '.access_token')
+   ```
+
+2. (`ncn#`) Dump the DHCP server configuration.
+
+   **`NOTE`** It may take up to two minutes for the change to HSM to be reflected in the DHCP server configuration, as the DHCP helper has to run to update the configuration.
+ + ```bash + curl -H "Authorization: Bearer ${TOKEN}" -X POST -H "Content-Type: application/json" \ + -d '{ "command": "config-get", "service": [ "dhcp4" ] }' \ + https://api-gw-service-nmn.local/apis/dhcp-kea | jq + ``` + + The `boot-file-name` field for the node should reflect the desired boot file name. + + Example output: + + ```json + { + "boot-file-name": "ipxe.test", + "client-classes": [], + "hostname": "nid000004", + "hw-address": "b4:2e:99:df:ec:47", + "ip-address": "10.106.0.15", + "next-server": "0.0.0.0", + "option-data": [], + "server-hostname": "" + } + ``` + +When the node boots, it should now boot using the desired boot file. + +Example output: + +```text +2024-06-05 12:33:18 >>Start PXE over IPv4 on MAC: B4-2E-99-DF-EC-47. Press ESC key to abort PXE boot. +2024-06-05 12:33:26 Station IP address is 10.106.0.15 +2024-06-05 12:33:26 +2024-06-05 12:33:26 Server IP address is 10.92.100.60 +2024-06-05 12:33:26 NBP filename is ipxe.test +``` + +## Reset the boot file name to default + +1. (`ncn#`) Remove the `ipxe=` setting from the HSM `ethernetInterfaces` record. + + ```bash + cray hsm inventory ethernetInterfaces update b42e99dfec47 --description="" + ``` + + Example output: + + ```toml + ID = "b42e99dfec47" + Description = "" + MACAddress = "b4:2e:99:df:ec:47" + LastUpdate = "2024-04-25T06:28:34.825112Z" + ComponentID = "x3000c0s17b4n0" + Type = "Node" + [[IPAddresses]] + IPAddress = "10.106.0.15" + ``` + +1. Verify the node configuration. + + Use the [Verify the node DHCP configuration has been updated](#verify-the-node-dhcp-configuration-has-been-updated) procedure to verify the configuration for the node. + The `boot-file-name` field should be empty indicating that the DHCP service will supply the default boot file name. + + Example output: + + ```json + { + "boot-file-name": "", + "client-classes": [], + "hostname": "nid000004", + "hw-address": "b4:2e:99:df:ec:47", + "ip-address": "10.106.0.15", + "next-server": "0.0.0.0", + "option-data": [], + "server-hostname": "" + } + ``` From 3aa3661dfaa01b08c21ca15f86e31ca87b9a8fe7 Mon Sep 17 00:00:00 2001 From: shreni123 <53111642+shreni123@users.noreply.github.com> Date: Thu, 11 Jul 2024 00:40:21 +0530 Subject: [PATCH 25/37] CASMMON-401: Upgrade and fresh install changes for Victoria-metrics (#5207) CASMMON-401: Upgrade and fresh install changes into csm and docs-csm for Victoriametrics Co-authored-by: Rambabu Bolla --- .../upgrade/util/sysmgmt-health-upgrade.sh | 116 +++++------------- .../upgrade/util/update-customizations.sh | 45 +++---- 2 files changed, 42 insertions(+), 119 deletions(-) diff --git a/upgrade/scripts/upgrade/util/sysmgmt-health-upgrade.sh b/upgrade/scripts/upgrade/util/sysmgmt-health-upgrade.sh index a24714050ece..4d53b82e39e9 100755 --- a/upgrade/scripts/upgrade/util/sysmgmt-health-upgrade.sh +++ b/upgrade/scripts/upgrade/util/sysmgmt-health-upgrade.sh @@ -23,100 +23,40 @@ # OTHER DEALINGS IN THE SOFTWARE. # -# Function to check cray-sysmgmt-health chart with app version 9.3.1 for prometheus-operator and retain old PVs data. +# Function to check cray-sysmgmt-health chart with app version 45.1 for kube-prometheus-stack and delete old PVCs. function sysmgmt_health() { echo "Checking for chart version of cray-sysmgmt-health" version="45.1" - if [ ! -z "$(helm ls -o json --namespace sysmgmt-health | jq -r --argjson version $version '.[] | select(.app_version | sub(".[0-9]$";"") | tonumber | . 
< $version).name')" ]; then - prom_pvc="prometheus-cray-sysmgmt-health-promet-prometheus-db-prometheus-cray-sysmgmt-health-promet-prometheus-0" - alert_pvc="alertmanager-cray-sysmgmt-health-promet-alertmanager-db-alertmanager-cray-sysmgmt-health-promet-alertmanager-0" - echo "Get PV for both prometheus and Alertmanager" - prom_pv=$(kubectl get pvc -n sysmgmt-health -o jsonpath='{.spec.volumeName}' $prom_pvc) - alert_pv=$(kubectl get pvc -n sysmgmt-health -o jsonpath='{.spec.volumeName}' $alert_pvc) - prom_pv="${prom_pv//[\",]/}" - alert_pv="${alert_pv//[\",]/}" - echo "Prometheus PV: $prom_pv" - echo "Alertmanager PV: $alert_pv" - - # Patch the PersistenceVolume created/used by the prometheus-operator and alertmanager to Retain claim policy - prom_pv_reclaim=$(kubectl get pv -o jsonpath='{.spec.persistentVolumeReclaimPolicy}' $prom_pv) - alert_pv_reclaim=$(kubectl get pv -o jsonpath='{.spec.persistentVolumeReclaimPolicy}' $alert_pv) - prom_pv_reclaim="${prom_pv_reclaim//[\",]/}" - alert_pv_reclaim="${alert_pv_reclaim//[\",]/}" - if [ "$prom_pv_reclaim" != Retain ] && [ "$alert_pv_reclaim" != Retain ]; then - kubectl patch pv/$prom_pv -p '{"spec":{"persistentVolumeReclaimPolicy":"Retain"}}' - kubectl patch pv/$alert_pv -p '{"spec":{"persistentVolumeReclaimPolicy":"Retain"}}' - else - echo "PVs persistentVolumeReclaimPolicy is already Retain" - fi - - # Uninstall the cray-sysmgmt-health release - helm ls -o json --namespace sysmgmt-health | jq -r --argjson version $version '.[] | select(.app_version | sub(".[0-9]$";"") | tonumber | . < $version).name' | xargs -L1 helm uninstall --namespace sysmgmt-health - - # Delete the existing PersistentVolumeClaim, and verify PV become Released. - prom_pv_reclaim=$(kubectl get pv -o jsonpath='{.spec.persistentVolumeReclaimPolicy}' $prom_pv) - alert_pv_reclaim=$(kubectl get pv -o jsonpath='{.spec.persistentVolumeReclaimPolicy}' $alert_pv) - prom_pv_reclaim="${prom_pv_reclaim//[\",]/}" - alert_pv_reclaim="${alert_pv_reclaim//[\",]/}" - if [ "$prom_pv_reclaim" == Retain ] && [ "$alert_pv_reclaim" == Retain ]; then - kubectl delete pvc/$prom_pvc -n sysmgmt-health - kubectl delete pvc/$alert_pvc -n sysmgmt-health - prom_pv_phase=$(kubectl get pv -o jsonpath='{.status.phase}' $prom_pv) - alert_pv_phase=$(kubectl get pv -o jsonpath='{.status.phase}' $alert_pv) - prom_pv_phase="${prom_pv_phase//[\",]/}" - alert_pv_phase="${alert_pv_phase//[\",]/}" - echo "Verifying whether PVs became Released or not." - sleep 5 - if [ "$alert_pv_phase" == Released ] && [ "$prom_pv_phase" == Released ]; then - echo "Both Prometheus and Alertmanager PVs are Released" - else - echo >&2 "PVs are not Released. Verify if PV exists or not." - echo "Prometheus PV: $prom_pv" - echo "Alertmanager PV: $alert_pv" - exit - fi - - # Remove the cray-sysmgmt-health-promet-kubelet service. - echo "Deleting cray-sysmgmt-health-promet-kubelet service in kube-system namespace." - kubectl delete service/cray-sysmgmt-health-promet-kubelet -n kube-system - - # Remove all the existing CRDs (ServiceMonitors, Podmonitors, etc.) 
- echo "Deleting sysmgmt-health existing CRDs" - for c in $(kubectl get crds -A -o jsonpath='{range .items[?(@.metadata.annotations.controller-gen\.kubebuilder\.io\/version=="v0.2.4")]}{.metadata.name}{"\n"}{end}'); do - kubectl delete crd ${c} - done - else - echo >&2 "PersistenceVolume created/used by the prometheus-operator and alertmanager is not Retain claim policy" - echo >&2 "Exiting" - exit - fi - - # Remove current spec.claimRef values to change the PV's status from Released to Available. - if [ "$alert_pv_phase" == Released ] && [ "$prom_pv_phase" == Released ]; then - kubectl patch pv/$prom_pv --type json -p='[{"op": "remove", "path": "/spec/claimRef"}]' - kubectl patch pv/$alert_pv --type json -p='[{"op": "remove", "path": "/spec/claimRef"}]' - prom_pv_phase=$(kubectl get pv -o jsonpath='{.status.phase}' $prom_pv) - alert_pv_phase=$(kubectl get pv -o jsonpath='{.status.phase}' $alert_pv) - prom_pv_phase="${prom_pv_phase//[\",]/}" - alert_pv_phase="${alert_pv_phase//[\",]/}" - echo "Verifying whether PV became Available or not." - sleep 5 - if [ "$alert_pv_phase" == Available ] && [ "$prom_pv_phase" == Available ]; then - echo "Both Prometheus and Alertmanager PVs are Available. Ready to deploy the latest cray-sysmgmt-chart now." - else - echo >&2 "PVs are not Available. Verify if PV exists or not." - echo "Prometheus PV: $prom_pv" - echo "Alertmanager PV: $alert_pv" - exit - fi - else - echo "PV's status is not Released. Exiting" - exit - fi + if [ "$(helm ls -o json --namespace sysmgmt-health | jq -r --argjson version $version '.[] | select(.app_version | sub(".[0-9]$";"") | tonumber | . = $version).name')" ]; then + prom0_pvc="prometheus-cray-sysmgmt-health-kube-p-prom-db-prometheus-cray-sysmgmt-health-kube-p-prom-0" + prom1_pvc="prometheus-cray-sysmgmt-health-kube-p-prom-db-prometheus-cray-sysmgmt-health-kube-p-prom-1" + prom0_shard_pvc="prometheus-cray-sysmgmt-health-kube-p-prom-db-prometheus-cray-sysmgmt-health-kube-p-prom-shard-1-0" + prom1_shard_pvc="prometheus-cray-sysmgmt-health-kube-p-prom-db-prometheus-cray-sysmgmt-health-kube-p-prom-shard-1-1" + alert_pvc="alertmanager-cray-sysmgmt-health-kube-p-alertmanager-db-alertmanager-cray-sysmgmt-health-kube-p-alertmanager-0" + thanos_ruler_pvc="thanos-ruler-kube-prometheus-stack-thanos-ruler-data-thanos-ruler-kube-prometheus-stack-thanos-ruler-0" + + # Uninstall the cray-sysmgmt-health and delete PVCs + helm ls -o json --namespace sysmgmt-health | jq -r --argjson version $version '.[] | select(.app_version | sub(".[0-9]$";"") | tonumber | . = $version).name' | xargs -L1 helm uninstall --namespace sysmgmt-health + + kubectl delete pvc/$prom0_pvc -n sysmgmt-health + kubectl delete pvc/$prom1_pvc -n sysmgmt-health + kubectl delete pvc/$prom0_shard_pvc -n sysmgmt-health + kubectl delete pvc/$prom1_shard_pvc -n sysmgmt-health + kubectl delete pvc/$alert_pvc -n sysmgmt-health + kubectl delete pvc/$thanos_ruler_pvc -n sysmgmt-health + + # Remove the cray-sysmgmt-health-promet-kubelet service. + echo "Deleting cray-sysmgmt-health-kube-p-kubelet service in kube-system namespace." + kubectl delete service/cray-sysmgmt-health-kube-p-kubelet -n kube-system + + # Remove all the existing CRDs (ServiceMonitors, Podmonitors, etc.) 
+ echo "Deleting sysmgmt-health existing CRDs" + for c in $(kubectl get crds -A -o jsonpath='{range .items[?(@.metadata.annotations.controller-gen\.kubebuilder\.io\/version=="v0.2.4")]}{.metadata.name}{"\n"}{end}'); do + kubectl delete crd ${c} + done fi } # sysmgmt_health function call - sysmgmt_health diff --git a/upgrade/scripts/upgrade/util/update-customizations.sh b/upgrade/scripts/upgrade/util/update-customizations.sh index ff3dd532f574..f9986166f10e 100755 --- a/upgrade/scripts/upgrade/util/update-customizations.sh +++ b/upgrade/scripts/upgrade/util/update-customizations.sh @@ -127,40 +127,23 @@ if [[ -z "$(yq r "$c" "spec.network.netstaticips.nmn_ncn_storage_mons")" ]]; the done yq w -i --style=single "$c" spec.kubernetes.services.cray-sysmgmt-health.cephExporter.endpoints '{{ network.netstaticips.nmn_ncn_storage_mons }}' fi -if [[ "$(yq r "$c" "spec.kubernetes.services.cray-sysmgmt-health.prometheus-snmp-exporter.serviceMonitor.enabled")" ]]; then - idx=0 - temp=1 - mon_node=$(yq r "$c" 'spec.kubernetes.services.cray-sysmgmt-health.prometheus-snmp-exporter.serviceMonitor.params.conf.target' | awk '{print $2}') - for node in ${mon_node}; do - yq w -i "$c" "spec.kubernetes.services.cray-sysmgmt-health.prometheus-snmp-exporter.serviceMonitor.params[${idx}].name" "snmp$temp" - yq w -i "$c" "spec.kubernetes.services.cray-sysmgmt-health.prometheus-snmp-exporter.serviceMonitor.params[${idx}].target" "${node}" - idx=$((idx + 1)) - temp=$((temp + 1)) - done -fi - -# Cray-sysmgmt-health -yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.thanosCompactor.resolutionraw = "15d"' -i $c -yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.thanosCompactor.resolution5m = "15d"' -i $c -yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.thanosCompactor.resolution1h = "15d"' -i $c # Kube-prometheus-stack -if [ "$(yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.prometheus-operator' $c)" != null ]; then - if [ "$(yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack' $c)" != null ]; then - yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.prometheus-operator = (.spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack * .spec.kubernetes.services.cray-sysmgmt-health.prometheus-operator)' -i $c +if [ "$(yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack' $c)" != null ]; then + if [ "$(yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack' $c)" != null ]; then + yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack = (.spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack * .spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack)' -i $c fi - yq4 eval 'del(.spec.proxiedWebAppExternalHostnames.customerManagement.[] | select(. 
== "*prometheus-operator*"))' -i $c - yq4 eval ".spec.proxiedWebAppExternalHostnames.customerManagement += \"{{ kubernetes.services['cray-sysmgmt-health']['kube-prometheus-stack'].prometheus.prometheusSpec.externalAuthority }}\"" -i $c - yq4 eval ".spec.proxiedWebAppExternalHostnames.customerManagement += \"{{ kubernetes.services['cray-sysmgmt-health']['kube-prometheus-stack'].alertmanager.alertmanagerSpec.externalAuthority }}\"" -i $c - yq4 eval ".spec.proxiedWebAppExternalHostnames.customerManagement += \"{{ kubernetes.services['cray-sysmgmt-health']['kube-prometheus-stack'].grafana.externalAuthority }}\"" -i $c - yq4 eval ".spec.proxiedWebAppExternalHostnames.customerManagement += \"{{ kubernetes.services['cray-sysmgmt-health']['kube-prometheus-stack'].thanos.thanosSpec.externalAuthority }}\"" -i $c - yq4 eval ".spec.kubernetes.services.cray-kiali.kiali-operator.cr.spec.external_services.grafana.url = \"https://{{ kubernetes.services['cray-sysmgmt-health']['kube-prometheus-stack'].grafana.externalAuthority }}/\"" -i $c - yq4 eval ".spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack = .spec.kubernetes.services.cray-sysmgmt-health.prometheus-operator | del(.spec.kubernetes.services.cray-sysmgmt-health.prometheus-operator)" -i $c - yq4 eval ".spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack.prometheus.prometheusSpec.externalUrl = \"https://{{ kubernetes.services['cray-sysmgmt-health']['kube-prometheus-stack'].prometheus.prometheusSpec.externalAuthority }}/\"" -i $c - yq4 eval ".spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack.alertmanager.alertmanagerSpec.externalUrl = \"https://{{ kubernetes.services['cray-sysmgmt-health']['kube-prometheus-stack'].alertmanager.alertmanagerSpec.externalAuthority }}/\"" -i $c - yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack.thanos.thanosSpec.externalAuthority = "thanos.cmn.{{ network.dns.external }}"' -i $c - yq4 eval ".spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack.thanos.thanosSpec.externalUrl = \"https://{{ kubernetes.services['cray-sysmgmt-health']['kube-prometheus-stack'].thanos.thanosSpec.externalAuthority }}/\"" -i $c - yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack.thanos.s3_endpoint = "{{network.dns.internal_s3 }}"' -i $c + yq4 eval 'del(.spec.proxiedWebAppExternalHostnames.customerManagement[] | select(. 
== "{{ kubernetes.services['\''cray-sysmgmt-health'\'']['\''kube-prometheus-stack'\''].thanos.thanosSpec.externalAuthority }}"))' -i $c + yq4 ".spec.proxiedWebAppExternalHostnames.customerManagement[3] = \"{{ kubernetes.services['cray-sysmgmt-health']['victoria-metrics-k8s-stack'].vmselect.vmselectSpec.externalAuthority }}\"" -i $c + yq4 ".spec.proxiedWebAppExternalHostnames.customerManagement[4] = \"{{ kubernetes.services['cray-sysmgmt-health']['victoria-metrics-k8s-stack'].alertmanager.externalAuthority }}\"" -i $c + yq4 ".spec.proxiedWebAppExternalHostnames.customerManagement[5] = \"{{ kubernetes.services['cray-sysmgmt-health']['victoria-metrics-k8s-stack'].grafana.externalAuthority }}\"" -i $c + yq4 eval ".spec.kubernetes.services.cray-kiali.kiali-operator.cr.spec.external_services.grafana.url = \"https://{{ kubernetes.services['cray-sysmgmt-health']['victoria-metrics-k8s-stack'].grafana.externalAuthority }}/\"" -i $c + yq4 eval ".spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack = .spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack | del(.spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack)" -i $c + yq4 eval ".spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack.vmselect.vmselectSpec.externalUrl = \"https://{{ kubernetes.services['cray-sysmgmt-health']['victoria-metrics-k8s-stack'].vmselect.vmselectSpec.externalAuthority }}/\"" -i $c + yq4 eval ".spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack.alertmanager.externalUrl = \"https://{{ kubernetes.services['cray-sysmgmt-health']['victoria-metrics-k8s-stack'].alertmanager.externalAuthority }}/\"" -i $c + yq4 'del(.spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack.alertmanager.alertmanagerSpec)' -i $c + yq4 'del(.spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack.prometheus)' -i $c + yq4 'del(.spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack.thanos)' -i $c fi #sma-pcim From 5c5b4ee98f54b15cc2c69aba5fe83fcd71c14bf0 Mon Sep 17 00:00:00 2001 From: shreni123 <53111642+shreni123@users.noreply.github.com> Date: Sat, 13 Jul 2024 01:17:31 +0530 Subject: [PATCH 26/37] CASMMON-412: cray-oauth2-proxies upgrade fix due to victoria-metrics upgrade (#5228) Co-authored-by: Rambabu Bolla --- upgrade/scripts/upgrade/util/update-customizations.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/upgrade/scripts/upgrade/util/update-customizations.sh b/upgrade/scripts/upgrade/util/update-customizations.sh index f9986166f10e..bf6d3f59cbd7 100755 --- a/upgrade/scripts/upgrade/util/update-customizations.sh +++ b/upgrade/scripts/upgrade/util/update-customizations.sh @@ -139,6 +139,8 @@ if [ "$(yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus- yq4 ".spec.proxiedWebAppExternalHostnames.customerManagement[5] = \"{{ kubernetes.services['cray-sysmgmt-health']['victoria-metrics-k8s-stack'].grafana.externalAuthority }}\"" -i $c yq4 eval ".spec.kubernetes.services.cray-kiali.kiali-operator.cr.spec.external_services.grafana.url = \"https://{{ kubernetes.services['cray-sysmgmt-health']['victoria-metrics-k8s-stack'].grafana.externalAuthority }}/\"" -i $c yq4 eval ".spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack = .spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack | del(.spec.kubernetes.services.cray-sysmgmt-health.kube-prometheus-stack)" -i $c + yq4 eval 
'.spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack.vmselect.vmselectSpec.externalAuthority = "vmselect.cmn.{{ network.dns.external }}"' -i $c + yq4 eval '.spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack.alertmanager.externalAuthority = "alertmanager.cmn.{{ network.dns.external }}"' -i $c yq4 eval ".spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack.vmselect.vmselectSpec.externalUrl = \"https://{{ kubernetes.services['cray-sysmgmt-health']['victoria-metrics-k8s-stack'].vmselect.vmselectSpec.externalAuthority }}/\"" -i $c yq4 eval ".spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack.alertmanager.externalUrl = \"https://{{ kubernetes.services['cray-sysmgmt-health']['victoria-metrics-k8s-stack'].alertmanager.externalAuthority }}/\"" -i $c yq4 'del(.spec.kubernetes.services.cray-sysmgmt-health.victoria-metrics-k8s-stack.alertmanager.alertmanagerSpec)' -i $c From 544eee31dd5c0366e10c4ceb09fa12a1e081a654 Mon Sep 17 00:00:00 2001 From: Joel Landsteiner <76180635+jsl-hpe@users.noreply.github.com> Date: Tue, 16 Jul 2024 14:04:50 -0500 Subject: [PATCH 27/37] CASMCMS-8894 Include new image management workflow instructions for updating image labels (#5222) * Include new image management workflow instructions for updating image labels * Remove trailing spaces * Separate command from expected output * add more spacing by tripleback ticks. * Apply suggestions from code review `s/bash/toml/` Signed-off-by: Russell Bunch * Update operations/image_management/Image_Management_Workflows.md Signed-off-by: Nathan Rockershousen --------- Signed-off-by: Russell Bunch Signed-off-by: Nathan Rockershousen Co-authored-by: Russell Bunch Co-authored-by: Nathan Rockershousen --- .../Image_Management_Workflows.md | 64 ++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/operations/image_management/Image_Management_Workflows.md b/operations/image_management/Image_Management_Workflows.md index b43f42fa3c4b..56fc10c2f065 100644 --- a/operations/image_management/Image_Management_Workflows.md +++ b/operations/image_management/Image_Management_Workflows.md @@ -1,6 +1,6 @@ # Image Management Workflows -Overview of how to create an image and how to customize and image. +Overview of how to create an image and how to customize an image. The following workflows are intended to be high-level overviews of image management tasks. These workflows depict how services interact with each other during image management and help to provide a quicker and deeper understanding of how the system functions. @@ -9,6 +9,7 @@ The workflows in this section include: * [Create a new image](#create-a-new-image) * [Customize an image](#customize-an-image) +* [Manage image labels](#manage-image-labels) ## Create a new image @@ -172,3 +173,64 @@ Mentioned in this workflow: 1. Upload the new image artifacts to Ceph S3. The new image artifacts are uploaded to Ceph S3. + +## Manage Image Labels + +**Use Case:** The system administrator would like to apply user supplied information about IMS images or remove metadata that has been previously set. + +**Components:** This workflow is based on the interaction of the Image Management Service \(IMS\) with other services after the image build process completes. The information added or removed can be used by separate +APIs or processes for whatever specific purposes they implement for, as it involves specific images. 
Generally, downstream APIs define specific keys and values that can be associated with an image, then perform specific +actions against those image records in a way that is consistent with their API's behavior. Typically, this allows administrators to attach general purpose information about IMS images that will help them manage the lifecycle +of images that IMS maintains. + +**Workflow Overview:** The following sequence of steps occurs during this workflow. + +1. (`ncn-mw#`) Administrator identifies the image to add metadata information to. + + Administrators may already know the image ID in question to label. If not, examining the existing images may be of help. + + ```bash + cray ims images list + ``` + +1. (`ncn-mw#`) Administrators may set a new label for an existing IMS image. + + One label may be changed (added or removed) during each update operation. Existing values for the provided key may be overwritten if already part of the image record. + + ```bash + cray ims images update a506a6f6-54d9-4e5a-9e8d-1fc052d62504 --metadata-operation set --metadata-value value --metadata-key key + ``` + +1. (`ncn-mw#`) Administrators and downstream APIs may obtain the active record for a given image. + + Image metadata information is also available via the `list` command for all images. + + ```bash + cray ims images describe a506a6f6-54d9-4e5a-9e8d-1fc052d62504 + ``` + + Expected output: + + ```toml + arch = "x86_64" + created = "2024-06-27T15:41:22.467177" + id = "a506a6f6-54d9-4e5a-9e8d-1fc052d62504" + [metadata] + key = "value" + ``` + +1. (`ncn-mw#`) Administrators may remove previously set image metadata. + + Downstream APIs and Administrators using the CLI may affect these changes. During `--metadata-operation remove`, users may omit `--metadata-value` command line arguments. 
+ + ```bash + cray ims images update a506a6f6-54d9-4e5a-9e8d-1fc052d62504 --metadata-operation remove --metadata-key key + ``` + + Expected output: + + ```toml + arch = "x86_64" + created = "2024-06-27T15:41:22.467177" + id = "a506a6f6-54d9-4e5a-9e8d-1fc052d62504" + ``` From e613c67056e4d7036d384038b3add168f1c139df Mon Sep 17 00:00:00 2001 From: Chris Spiller <86013738+spillerc-hpe@users.noreply.github.com> Date: Tue, 16 Jul 2024 20:45:52 +0100 Subject: [PATCH 28/37] CASMINST-5556 - Remove outdated and unreferenced NTP procedure (#5223) * CASMINST-5556 - Remove outdated and unreferenced NTP procedure * Fix table of contents --- operations/README.md | 1 - operations/resiliency/NTP_Resiliency.md | 31 ------------------------- 2 files changed, 32 deletions(-) delete mode 100644 operations/resiliency/NTP_Resiliency.md diff --git a/operations/README.md b/operations/README.md index 8aacc77939ba..abc54b6625bb 100644 --- a/operations/README.md +++ b/operations/README.md @@ -385,7 +385,6 @@ HPE Cray EX systems are designed so that system management services \(SMS\) are - [Resilience of System Management Services](resiliency/Resilience_of_System_Management_Services.md) - [Restore System Functionality if a Kubernetes Worker Node is Down](resiliency/Restore_System_Functionality_if_a_Kubernetes_Worker_Node_is_Down.md) - [Recreate `StatefulSet` Pods on Another Node](resiliency/Recreate_StatefulSet_Pods_on_Another_Node.md) -- [NTP Resiliency](resiliency/NTP_Resiliency.md) - [Resiliency Testing Procedure](resiliency/Resiliency_Testing_Procedure.md) ## ConMan diff --git a/operations/resiliency/NTP_Resiliency.md b/operations/resiliency/NTP_Resiliency.md deleted file mode 100644 index 4ab7f0c4a17c..000000000000 --- a/operations/resiliency/NTP_Resiliency.md +++ /dev/null @@ -1,31 +0,0 @@ -# NTP Resiliency - -Synchronize the time on all non-compute nodes \(NCNs\) via Network Time Protocol \(NTP\). Avoid a single point of failure for NTP when testing system resiliency. - -## Prerequisites - -This procedure requires administrative privileges. - -## Procedure - -1. (`ncn#`) Set the date manually if the time on NCNs is off by more than an a few hours. - - For example: - - ```bash - timedatectl set-time "2021-02-19 15:04:00" - ``` - -1. (`pit#`) Configure NTP on the Pre-install Toolkit \(PIT\). - - ```bash - /root/bin/configure-ntp.sh - ``` - -1. (`ncn#`) Sync NTP on all other nodes. - - If more than nine NCNs are in use on the system, update the loop in the following command accordingly. 
- - ```bash - for i in ncn-{w,s}00{1..3} ncn-m00{2..3}; do echo "------$i--------"; ssh $i '/srv/cray/scripts/common/chrony/csm_ntp.py'; done - ``` From 0c98b0382d080ea8c0683edc5a347f2312b31f7b Mon Sep 17 00:00:00 2001 From: Jenkins Date: Wed, 17 Jul 2024 17:25:52 +0000 Subject: [PATCH 29/37] Automated API docs swagger to md conversion (https://jenkins.algol60.net/job/Cray-HPE/job/csm/job/v1.6.0-alpha.58/1/) --- api/sls.md | 56 ------------------------------------------------------ 1 file changed, 56 deletions(-) diff --git a/api/sls.md b/api/sls.md index f7c2332c8177..cd3931b65fa3 100644 --- a/api/sls.md +++ b/api/sls.md @@ -329,14 +329,6 @@ Status Code **200** *xor* -|Name|Type|Required|Restrictions|Description| -|---|---|---|---|---| -|»» *anonymous*|[hardware_comptype_virtual_node](#schemahardware_comptype_virtual_node)|false|none|none| -|»»» NodeType|string|true|none|The role type assigned to this node.| -|»»» nid|integer|false|none|none| - -*xor* - |Name|Type|Required|Restrictions|Description| |---|---|---|---|---| |»» *anonymous*|[hardware_ip_and_creds_optional](#schemahardware_ip_and_creds_optional)|false|none|none| @@ -357,7 +349,6 @@ Status Code **200** |NodeType|Application| |NodeType|Storage| |NodeType|Management| -|NodeType|Management|