diff --git a/install/README.md b/install/README.md index 629243b205b5..4daeb20fd8b2 100644 --- a/install/README.md +++ b/install/README.md @@ -40,39 +40,35 @@ describes how to install additional HPE Cray EX software products using the The topics in this chapter need to be done as part of an ordered procedure so are shown here with numbered topics. -- [Cray System Management Install](#cray-system-management-install) - - [Bifurcated CAN notice](#bifurcated-can-notice) - - [High-level overview of CSM install](#high-level-overview-of-csm-install) - - [Topics](#topics) - - [Pre-installation](#pre-installation) - - [1. Preparing for a re-installation](#1-preparing-for-a-re-installation) - - [2. Boot installation environment](#2-boot-installation-environment) - - [3. Download the CSM tarball](#3-download-the-csm-tarball) - - [4. Create system configuration](#4-create-system-configuration) - - [5. Import the CSM tarball](#5-import-the-csm-tarball) - - [6. Validate the LiveCD](#6-validate-the-livecd) - - [7. Configure management network switches](#7-configure-management-network-switches) - - [Ensure SNMP is configured on the management network switches](#ensure-snmp-is-configured-on-the-management-network-switches) - - [When the management network is already configured](#when-the-management-network-is-already-configured) - - [When the management network has not been configured](#when-the-management-network-has-not-been-configured) - - [Configure the management network with CANU](#configure-the-management-network-with-canu) - - [Installation](#installation) - - [1. Deploy management nodes](#1-deploy-management-nodes) - - [2. Install CSM services](#2-install-csm-services) - - [3. Validate CSM health before final NCN deployment](#3-validate-csm-health-before-final-ncn-deployment) - - [4. Deploy final NCN](#4-deploy-final-ncn) - - [5. Configure administrative access](#5-configure-administrative-access) - - [6. Validate CSM health](#6-validate-csm-health) - - [7. 
Configure Prometheus alert notifications](#7-configure-prometheus-alert-notifications) - - [8. Update ceph node-exporter config for SNMP counters](#8-update-ceph-node-exporter-config-for-snmp-counters) - - [9. Upload Olympus BMC recovery firmware into TFTP server](#9-upload-olympus-bmc-recovery-firmware-into-tftp-server) - - [10. Update firmware with FAS](#10-update-firmware-with-fas) - - [11. Prepare compute nodes](#11-prepare-compute-nodes) - - [12. Troubleshooting installation problems](#12-troubleshooting-installation-problems) - - [Post-Installation](#post-installation) - - [1. Kubernetes encryption](#1-kubernetes-encryption) - - [2. Export Nexus data](#2-export-nexus-data) - - [Installation of additional HPE Cray EX software products](#installation-of-additional-hpe-cray-ex-software-products) +- [Pre-installation](#pre-installation) + 1. [Preparing for a re-installation](#1-preparing-for-a-re-installation) + 1. [Boot installation environment](#2-boot-installation-environment) + 1. [Download the CSM tarball](#3-download-the-csm-tarball) + 1. [Create system configuration](#4-create-system-configuration) + 1. [Import the CSM tarball](#5-import-the-csm-tarball) + 1. [Validate the LiveCD](#6-validate-the-livecd) + 1. [Configure management network switches](#7-configure-management-network-switches) + - [Ensure SNMP is configured on the management network switches](#ensure-snmp-is-configured-on-the-management-network-switches) + - [When the management network is already configured](#when-the-management-network-is-already-configured) + - [When the management network has not been configured](#when-the-management-network-has-not-been-configured) + - [Configure the management network with CANU](#configure-the-management-network-with-canu) +- [Installation](#installation) + 1. [Deploy management nodes](#1-deploy-management-nodes) + 1. [Install CSM services](#2-install-csm-services) + 1. 
[Validate CSM health before final NCN deployment](#3-validate-csm-health-before-final-ncn-deployment) + 1. [Deploy final NCN](#4-deploy-final-ncn) + 1. [Configure administrative access](#5-configure-administrative-access) + 1. [Validate CSM health](#6-validate-csm-health) + 1. [Configure Prometheus alert notifications](#7-configure-prometheus-alert-notifications) + 1. [Update ceph node-exporter config for SNMP counters](#8-update-ceph-node-exporter-config-for-snmp-counters) + 1. [Upload Olympus BMC recovery firmware into TFTP server](#9-upload-olympus-bmc-recovery-firmware-into-tftp-server) + 1. [Update firmware with FAS](#10-update-firmware-with-fas) + 1. [Prepare compute nodes](#11-prepare-compute-nodes) + 1. [Troubleshooting installation problems](#12-troubleshooting-installation-problems) +- [Post-Installation](#post-installation) + 1. [Kubernetes encryption](#1-kubernetes-encryption) + 1. [Export Nexus data](#2-export-nexus-data) +- [Installation of additional HPE Cray EX software products](#installation-of-additional-hpe-cray-ex-software-products) > **`NOTE`** If problems are encountered during the installation, > [Troubleshooting installation problems](#12-troubleshooting-installation-problems) and diff --git a/install/configure_administrative_access.md b/install/configure_administrative_access.md index 4a85ad90f9d6..2b86a1accec3 100644 --- a/install/configure_administrative_access.md +++ b/install/configure_administrative_access.md @@ -10,19 +10,17 @@ and BMC/controller passwords. ## Topics -- [Configure Administrative Access](#configure-administrative-access) - - [Topics](#topics) - - [1. Configure the Cray and SAT command line interfaces](#1-configure-the-cray-and-sat-command-line-interfaces) - - [Automatic configuration using temporary Keycloak account](#automatic-configuration-using-temporary-keycloak-account) - - [Manual configuration](#manual-configuration) - - [2. 
Set `Management` role on the BMCs of management nodes](#2-set-management-role-on-the-bmcs-of-management-nodes) - - [3. Lock management nodes](#3-lock-management-nodes) - - [4. Configure BMC and controller parameters with SCSD](#4-configure-bmc-and-controller-parameters-with-scsd) - - [5. Set up passwordless SSH](#5-set-up-passwordless-ssh) - - [6. Configure the root password and SSH keys in Vault](#6-configure-the-root-password-and-ssh-keys-in-vault) - - [7. Add switch admin password to Vault](#7-add-switch-admin-password-to-vault) - - [8. Configure management nodes with CFS](#8-configure-management-nodes-with-cfs) - - [9. Proceed to next topic](#9-proceed-to-next-topic) +1. [Configure the Cray and SAT command line interfaces](#1-configure-the-cray-and-sat-command-line-interfaces) + - [Automatic configuration using temporary Keycloak account](#automatic-configuration-using-temporary-keycloak-account) + - [Manual configuration](#manual-configuration) +1. [Set `Management` role on the BMCs of management nodes](#2-set-management-role-on-the-bmcs-of-management-nodes) +1. [Lock management nodes](#3-lock-management-nodes) +1. [Configure BMC and controller parameters with SCSD](#4-configure-bmc-and-controller-parameters-with-scsd) +1. [Set up passwordless SSH](#5-set-up-passwordless-ssh) +1. [Configure the root password and SSH keys in Vault](#6-configure-the-root-password-and-ssh-keys-in-vault) +1. [Add switch admin password to Vault](#7-add-switch-admin-password-to-vault) +1. [Configure management nodes with CFS](#8-configure-management-nodes-with-cfs) +1. [Proceed to next topic](#9-proceed-to-next-topic) > **`NOTE`** The procedures in this section of installation documentation are intended to be done in order, even though the topics are > administrative or operational procedures. The topics themselves do not have navigational links to the next topic in the sequence. 
diff --git a/introduction/deprecated_features/README.md b/introduction/deprecated_features/README.md index e268220c8292..8006abb6bd5a 100644 --- a/introduction/deprecated_features/README.md +++ b/introduction/deprecated_features/README.md @@ -41,7 +41,7 @@ in chronological order. - [Boot Orchestration Service (BOS)](../../glossary.md#boot-orchestration-service-bos) v1 - [Cray Advanced Platform Monitoring and Control (CAPMC)](../../glossary.md#cray-advanced-platform-monitoring-and-control-capmc) - is deprecated, starting in CSM 1.5, and may be removed in the future. It has been + is deprecated, starting in CSM 1.5, and may be removed in the future. It has been replaced with the [Power Control Service (PCS)](../../glossary.md#power-control-service-pcs). Everyone is encouraged to transition to PCS as soon as possible. - User Access Service diff --git a/operations/firmware/FAS_Paradise.md b/operations/firmware/FAS_Paradise.md index 5c6ed9f830ff..d84f473c4af7 100644 --- a/operations/firmware/FAS_Paradise.md +++ b/operations/firmware/FAS_Paradise.md @@ -1,412 +1,412 @@ -# Updating Foxconn Paradise Nodes with FAS - -Use the Firmware Action Service (FAS) to update the firmware on Foxconn Paradise devices. Each procedure includes the prerequisites and example recipes required to update the firmware. - -**NOTE:** Any node that is locked remains in the state `inProgress` with the `stateHelper` message of `"failed to lock"` until the action times out, or the lock is released. -If the action is timed out, these nodes report as `failed` with the `stateHelper` message of `"time expired; could not complete update"`. -This includes NCNs which are manually locked to prevent accidental rebooting and firmware updates. - -Refer to [FAS Filters](FAS_Filters.md) for more information on the content used in the example JSON files. - -The [`FASUpdate.py script`](FASUpdate_Script.md) can be used to perform default updates to firmware and BIOS. 
- -## Prerequisites - -* The Cray command line interface \(CLI\) tool is initialized and configured on the system. -See [Configure the Cray CLI](../configure_cray_cli.md). -* The firmware images are loaded into S3 and to the TFTP server. -See [Upload Paradise images to TFTP server](#upload-paradise-images-to-tftp-server) - -The following targets can be updated with FAS on Paradise Nodes: - -1. [`bmc_active`](#update-paradise-bmc_active-procedure) -1. [`bios_active`](#update-paradise-bios_active-procedure) -1. [`erot_active`](#update-paradise-erot_active-procedure) -1. [`fpga_active`](#update-paradise-fpga_active-procedure) -1. [`pld_active`](#update-paradise-pld_active-procedure) - -## Update Paradise `bmc_active` procedure - -NOTE: If a reset of the BMC is required, follow [this procedure](#reset-bmc) before and after the update of each node. *Only do this if required!* - -The [`FASUpdate.py script`](FASUpdate_Script.md) can be used to update `bmc_active` - use recipe `foxconn_nodeBMC_bmc.json` - -The BMC will reboot after the update is complete. - -To update using a JSON file and the Cray CLI, use this example JSON file and follow the [Updating Paradise Firmware with JSON and the Cray CLI Procedure](#update-paradise-firmware-using-json-file-and-cray-cli) - -```json -{ -"stateComponentFilter": { - "deviceTypes": [ "nodeBMC" ] - }, -"inventoryHardwareFilter": { - "manufacturer": "foxconn" - }, -"targetFilter": { - "targets": [ "bmc_active" ] - }, -"command": { - "version": "latest", - "tag": "default", - "overrideDryrun": false, - "restoreNotPossibleOverride": true, - "timeLimit": 1000, - "description": "Dryrun upgrade of Foxconn bmc_active" - } -} -``` - -**IMPORTANT:** There is a known bug that causes the `hmcollector-poll` service to lose event subscriptions -after BMC firmware is updated. After updating BMC firmware, the `hmcollector-poll` service must be restarted to -work around this issue. 
After the update is complete, and you confirm the BMC has been rebooted, restart -the `hmcollector-poll` service with this command: - -```bash -kubectl -n services rollout restart deployment cray-hms-hmcollector-poll -``` - -## Update Paradise `bios_active` procedure - -The nodes must be **OFF** before updating the BIOS - -**IMPORTANT:** After the update has completed, the nodes must be turned on and **REMAIN ON FOR AT LEAST 6 MINUTES** - -**NOTE:** The version number reported by Redfish will NOT be updated until the node has fully booted. - -The [`FASUpdate.py script`](FASUpdate_Script.md) can be used to update `bios_active` - use recipe `foxconn_nodeBMC_bios.json` - -To update using a JSON file and the Cray CLI, use this example JSON file and follow the [Updating Paradise Firmware with JSON and the Cray CLI Procedure](#update-paradise-firmware-using-json-file-and-cray-cli) - -```json -{ -"stateComponentFilter": { - "deviceTypes": [ "nodeBMC" ] - }, -"inventoryHardwareFilter": { - "manufacturer": "foxconn" - }, -"targetFilter": { - "targets": [ "bios_active" ] - }, -"command": { - "version": "latest", - "tag": "default", - "overrideDryrun": false, - "restoreNotPossibleOverride": true, - "timeLimit": 1000, - "description": "Dryrun upgrade of Foxconn bios_active" - } -} -``` - -## Update Paradise `erot_active` procedure - -**NOTE:** After update of `erot_active` an AC power cycle is required for update to take affect. -To do an AC power cycle, run the following command (`ncn#`). 
- -```bash -ssh admin@$(xname) "ipmitool raw 0x38 0x02" -``` - -The [`FASUpdate.py script`](FASUpdate_Script.md) can be used to update `erot_active` - use recipe `foxconn_nodeBMC_erot.json` - -To update using a JSON file and the Cray CLI, use this example JSON file and follow the [Updating Paradise Firmware with JSON and the Cray CLI Procedure](#update-paradise-firmware-using-json-file-and-cray-cli) - -```json -{ -"stateComponentFilter": { - "deviceTypes": [ "nodeBMC" ] - }, -"inventoryHardwareFilter": { - "manufacturer": "foxconn" - }, -"targetFilter": { - "targets": [ "erot_active" ] - }, -"command": { - "version": "latest", - "tag": "default", - "overrideDryrun": false, - "restoreNotPossibleOverride": true, - "timeLimit": 1000, - "description": "Dryrun upgrade of Foxconn bios_active" - } -} -``` - -## Update Paradise `fpga_active` procedure - -**NOTE:** After update of `fpga_active` an AC power cycle is required for update to take affect. -To do an AC power cycle, run the following command (`ncn#`). - -```bash -ssh admin@$(xname) "ipmitool raw 0x38 0x02" -``` - -The [`FASUpdate.py script`](FASUpdate_Script.md) can be used to update `fpga_active` - use recipe `foxconn_nodeBMC_fpga.json` - -To update using a JSON file and the Cray CLI, use this example JSON file and follow the [Updating Paradise Firmware with JSON and the Cray CLI Procedure](#update-paradise-firmware-using-json-file-and-cray-cli) - -```json -{ -"stateComponentFilter": { - "deviceTypes": [ "nodeBMC" ] - }, -"inventoryHardwareFilter": { - "manufacturer": "foxconn" - }, -"targetFilter": { - "targets": [ - "fpga_active" - ] - }, -"command": { - "version": "latest", - "tag": "default", - "overrideDryrun": false, - "restoreNotPossibleOverride": true, - "timeLimit": 1000, - "description": "Dryrun upgrade of Foxconn bios_active" - } -} -``` - -## Update Paradise `pld_active` procedure - -**IMPORTANT:** The update of the target `pld_active` should only be applied to blade 1 (i.e. 
`x3000c0s3b1`) - applying to other blades at the same time may cause issues. To use the `FASUpdate.py script`, use the `--xnames` flag to specify `b1`. - -The [`FASUpdate.py script`](FASUpdate_Script.md) can be used to update `pld_active` - use recipe `foxconn_nodeBMC_pld.json` - -To update using a JSON file and the Cray CLI, use this example JSON file and follow the [Updating Paradise Firmware with JSON and the Cray CLI Procedure](#update-paradise-firmware-using-json-file-and-cray-cli) - -```json -{ -"stateComponentFilter": { - "xnames": [ "x3000c0s3b1" ], - "deviceTypes": [ "nodeBMC" ] - }, -"inventoryHardwareFilter": { - "manufacturer": "foxconn" - }, -"targetFilter": { - "targets": [ "pld_active" ] - }, -"command": { - "version": "latest", - "tag": "default", - "overrideDryrun": false, - "restoreNotPossibleOverride": true, - "timeLimit": 1000, - "description": "Dryrun upgrade of Foxconn bios_active" - } -} -``` - -## Update Paradise firmware using JSON file and Cray CLI - -**NOTE:** The [`FASUpdate.py script`](FASUpdate_Script.md) can be used to perform default updates to firmware and BIOS. - -1. Create a JSON file using the example recipe. - -1. Initiate a dry-run to verify the firmware that will be updated and the version it will update to. - - 1. (`ncn#`) Create the dry-run session. - - The `overrideDryrun = false` value indicates that the command will do a dry run. - - ```bash - cray fas actions create nodeBMC.json --format toml - ``` - - Example output: - - ```toml - overrideDryrun = false - actionID = "fddd0025-f5ff-4f59-9e73-1ca2ef2a432d" - ``` - - 1. (`ncn#`) Describe the `actionID` for firmware update dry-run job. - - Replace the `actionID` value with the string returned in the previous step. In this example, `"fddd0025-f5ff-4f59-9e73-1ca2ef2a432d"` is used. 
- - ```bash - cray fas actions describe {actionID} --format toml - ``` - - Example output: - - ```toml - blockedBy = [] - state = "completed" - actionID = "fddd0025-f5ff-4f59-9e73-1ca2ef2a432d" - startTime = "2020-08-31 15:49:44.568271843 +0000 UTC" - snapshotID = "00000000-0000-0000-0000-000000000000" - endTime = "2020-08-31 15:51:35.426714612 +0000 UTC" - - [command] - description = "Update Foxconn Node BMCs Dryrun" - tag = "default" - restoreNotPossibleOverride = true - timeLimit = 10000 - version = "latest" - overrideDryrun = false - ``` - - If `state = "completed"`, the dry-run has found and checked all the nodes. Check the following sections for more information: - - * Lists the nodes that have a valid image for updating: - - ```toml - [operationSummary.succeeded] - ``` - - * Lists the nodes that will not be updated because they are already at the correct version: - - ```toml - [operationSummary.noOperation] - ``` - - * Lists the nodes that had an error when attempting to update: - - ```toml - [operationSummary.failed] - ``` - - * Lists the nodes that do not have a valid image for updating: - - ```toml - [operationSummary.noSolution] - ``` - -1. Update the firmware after verifying that the dry-run worked as expected. - - 1. Edit the JSON file and update the values so an actual firmware update can be run. - - The following example is for the `nodeBMC.json` file. Update the following values: - - ```json - "overrideDryrun":true, - "description":"Update Foxconn Node BMCs" - ``` - - 1. (`ncn#`) Run the firmware update. - - The output `overrideDryrun = true` indicates that an actual firmware update job was created. A new `actionID` will also be displayed. - - ```bash - cray fas actions create nodeBMC.json --format toml - ``` - - Example output: - - ```toml - overrideDryrun = true - actionID = "bc40f10a-e50c-4178-9288-8234b336077b" - ``` - - The time it takes for a firmware action to finish varies. It can be a few minutes or over 20 minutes. 
- - The BMC automatically reboots after the BMC firmware has been loaded. - -1. Retrieve the `operationID` and verify that the update is complete. - - ```bash - cray fas actions describe {actionID} --format toml - ``` - - Example output: - - ```toml - [operationSummary.failed] - [[operationSummary.failed.operationKeys]] - stateHelper = "unexpected change detected in firmware version. Expected nc.1.3.10-shasta-release.arm.2020-07-21T23:58:22+00:00.d479f59 got: nc.cronomatic-dev.arm.2019-09-24T13:20:24+00:00.9d0f8280" - fromFirmwareVersion = "nc.cronomatic-dev.arm.2019-09-24T13:20:24+00:00.9d0f8280" - xname = "x1005c6s4b0" - target = "BMC" - operationID = "e910c6ad-db98-44fc-bdc5-90477b23386f" - ``` - -1. (`ncn#`) View more details for an operation using the `operationID` from the previous step. - - Check the list of nodes for the `failed` or `completed` state. - - ```bash - cray fas operations describe {operationID} - ``` - - For example: - - ```bash - cray fas operations describe "e910c6ad-db98-44fc-bdc5-90477b23386f" --format toml - ``` - - Example output: - - ```toml - fromFirmwareVersion = "nc.cronomatic-dev.arm.2019-09-24T13:20:24+00:00.9d0f8280" - fromTag = "" - fromImageURL = "" - endTime = "2020-08-31 16:40:13.464321212 +0000 UTC" - actionID = "bc40f10a-e50c-4178-9288-8234b336077b" - startTime = "2020-08-31 16:28:01.228524446 +0000 UTC" - fromSemanticFirmwareVersion = "" - toImageURL = "" - model = "WNC_REV_B" - operationID = "e910c6ad-db98-44fc-bdc5-90477b23386f" - fromImageID = "00000000-0000-0000-0000-000000000000" - target = "BMC" - toImageID = "39c0e553-281d-4776-b68e-c46a2993485e" - toSemanticFirmwareVersion = "1.3.10" - refreshTime = "2020-08-31 16:40:13.464325422 +0000 UTC" - blockedBy = [] - toTag = "" - state = "failed" - stateHelper = "unexpected change detected in firmware version. 
Expected nc.1.3.10-shasta-release.arm.2020-07-21T23:58:22+00:00.d479f59 got: nc.cronomatic-dev.arm.2019-09-24T13:20:24+00:00.9d0f8280" - deviceType = "NodeBMC" - ``` - - Once the firmware and BIOS are updated, the compute nodes can be powered back on. - - If the nodes have never been powered on in the system before (they are being added during a hardware add procedure), then use the Boot Orchestration Service (BOS) to power them on. - Using BOS will prepare the initial boot artifacts required to boot them. If this is not the first time they have been powered on in this system, then you can use the Power Control Service \(PCS\) to power them on. - -## Upload Paradise images to TFTP server - -(`ncn#`) To check if a firmware is uploaded to the TFTP server: - -```bash -kubectl -n services exec -it `kubectl get pods -n services -l app.kubernetes.io/instance=cms-ipxe -o custom-columns=NS:.metadata.name --no-headers | head -1` -- ls /shared_tftp -``` - -If the firmware file you need is not listed, run the following command to copy the file from S3 to the TFTP server (`ncn#`) - -```bash -/usr/share/doc/csm/scripts/operations/firmware/upload_foxconn_images_tftp.py -``` - -## Reset BMC - -This will reset the BMC to factory resets - including resetting the BMC username and password. -*Only do this if required!* - -Before BMC firmware update (`ncn#`): - -The nodes must be **OFF** before updating BMC (when doing a reset) - -```bash -ssh admin@$(xname) 'fw_setenv openbmconce "factory-reset"' -``` - -**Update BMC firmware using one of the methods above** -NOTE: If the password changes after the boot of BMC, FAS will no longer be able to verify the update and will fail after the time limit. - -After firmware update(`ncn#`): - -If the password changed to something other than the what is stored in vault, update the BMC password: - -```bash -ssh admin@$(xname) 'ipmitool user set password 1 "password"' -``` - -Boot the node. 
+# Updating Foxconn Paradise Nodes with FAS + +Use the Firmware Action Service (FAS) to update the firmware on Foxconn Paradise devices. Each procedure includes the prerequisites and example recipes required to update the firmware. + +**NOTE:** Any node that is locked remains in the state `inProgress` with the `stateHelper` message of `"failed to lock"` until the action times out, or the lock is released. +If the action is timed out, these nodes report as `failed` with the `stateHelper` message of `"time expired; could not complete update"`. +This includes NCNs which are manually locked to prevent accidental rebooting and firmware updates. + +Refer to [FAS Filters](FAS_Filters.md) for more information on the content used in the example JSON files. + +The [`FASUpdate.py script`](FASUpdate_Script.md) can be used to perform default updates to firmware and BIOS. + +## Prerequisites + +* The Cray command line interface \(CLI\) tool is initialized and configured on the system. +See [Configure the Cray CLI](../configure_cray_cli.md). +* The firmware images are loaded into S3 and to the TFTP server. +See [Upload Paradise images to TFTP server](#upload-paradise-images-to-tftp-server) + +The following targets can be updated with FAS on Paradise Nodes: + +* [`bmc_active`](#update-paradise-bmc_active-procedure) +* [`bios_active`](#update-paradise-bios_active-procedure) +* [`erot_active`](#update-paradise-erot_active-procedure) +* [`fpga_active`](#update-paradise-fpga_active-procedure) +* [`pld_active`](#update-paradise-pld_active-procedure) + +## Update Paradise `bmc_active` procedure + +NOTE: If a reset of the BMC is required, follow [this procedure](#reset-bmc) before and after the update of each node. *Only do this if required!* + +The [`FASUpdate.py script`](FASUpdate_Script.md) can be used to update `bmc_active` - use recipe `foxconn_nodeBMC_bmc.json` + +The BMC will reboot after the update is complete. 
+ +To update using a JSON file and the Cray CLI, use this example JSON file and follow the [Updating Paradise Firmware with JSON and the Cray CLI Procedure](#update-paradise-firmware-using-json-file-and-cray-cli) + +```json +{ +"stateComponentFilter": { + "deviceTypes": [ "nodeBMC" ] + }, +"inventoryHardwareFilter": { + "manufacturer": "foxconn" + }, +"targetFilter": { + "targets": [ "bmc_active" ] + }, +"command": { + "version": "latest", + "tag": "default", + "overrideDryrun": false, + "restoreNotPossibleOverride": true, + "timeLimit": 1000, + "description": "Dryrun upgrade of Foxconn bmc_active" + } +} +``` + +**IMPORTANT:** There is a known bug that causes the `hmcollector-poll` service to lose event subscriptions +after BMC firmware is updated. After updating BMC firmware, the `hmcollector-poll` service must be restarted to +work around this issue. After the update is complete, and you confirm the BMC has been rebooted, restart +the `hmcollector-poll` service with this command: + +```bash +kubectl -n services rollout restart deployment cray-hms-hmcollector-poll +``` + +## Update Paradise `bios_active` procedure + +The nodes must be **OFF** before updating the BIOS + +**IMPORTANT:** After the update has completed, the nodes must be turned on and **REMAIN ON FOR AT LEAST 6 MINUTES** + +**NOTE:** The version number reported by Redfish will NOT be updated until the node has fully booted. 
+ +The [`FASUpdate.py script`](FASUpdate_Script.md) can be used to update `bios_active` - use recipe `foxconn_nodeBMC_bios.json` + +To update using a JSON file and the Cray CLI, use this example JSON file and follow the [Updating Paradise Firmware with JSON and the Cray CLI Procedure](#update-paradise-firmware-using-json-file-and-cray-cli) + +```json +{ +"stateComponentFilter": { + "deviceTypes": [ "nodeBMC" ] + }, +"inventoryHardwareFilter": { + "manufacturer": "foxconn" + }, +"targetFilter": { + "targets": [ "bios_active" ] + }, +"command": { + "version": "latest", + "tag": "default", + "overrideDryrun": false, + "restoreNotPossibleOverride": true, + "timeLimit": 1000, + "description": "Dryrun upgrade of Foxconn bios_active" + } +} +``` + +## Update Paradise `erot_active` procedure + +**NOTE:** After update of `erot_active` an AC power cycle is required for the update to take effect. +To do an AC power cycle, run the following command (`ncn#`). + +```bash +ssh admin@$(xname) "ipmitool raw 0x38 0x02" +``` + +The [`FASUpdate.py script`](FASUpdate_Script.md) can be used to update `erot_active` - use recipe `foxconn_nodeBMC_erot.json` + +To update using a JSON file and the Cray CLI, use this example JSON file and follow the [Updating Paradise Firmware with JSON and the Cray CLI Procedure](#update-paradise-firmware-using-json-file-and-cray-cli) + +```json +{ +"stateComponentFilter": { + "deviceTypes": [ "nodeBMC" ] + }, +"inventoryHardwareFilter": { + "manufacturer": "foxconn" + }, +"targetFilter": { + "targets": [ "erot_active" ] + }, +"command": { + "version": "latest", + "tag": "default", + "overrideDryrun": false, + "restoreNotPossibleOverride": true, + "timeLimit": 1000, + "description": "Dryrun upgrade of Foxconn erot_active" + } +} +``` + +## Update Paradise `fpga_active` procedure + +**NOTE:** After update of `fpga_active` an AC power cycle is required for the update to take effect. +To do an AC power cycle, run the following command (`ncn#`).
+ +```bash +ssh admin@$(xname) "ipmitool raw 0x38 0x02" +``` + +The [`FASUpdate.py script`](FASUpdate_Script.md) can be used to update `fpga_active` - use recipe `foxconn_nodeBMC_fpga.json` + +To update using a JSON file and the Cray CLI, use this example JSON file and follow the [Updating Paradise Firmware with JSON and the Cray CLI Procedure](#update-paradise-firmware-using-json-file-and-cray-cli) + +```json +{ +"stateComponentFilter": { + "deviceTypes": [ "nodeBMC" ] + }, +"inventoryHardwareFilter": { + "manufacturer": "foxconn" + }, +"targetFilter": { + "targets": [ + "fpga_active" + ] + }, +"command": { + "version": "latest", + "tag": "default", + "overrideDryrun": false, + "restoreNotPossibleOverride": true, + "timeLimit": 1000, + "description": "Dryrun upgrade of Foxconn fpga_active" + } +} +``` + +## Update Paradise `pld_active` procedure + +**IMPORTANT:** The update of the target `pld_active` should only be applied to blade 1 (for example, `x3000c0s3b1`) - applying to other blades at the same time may cause issues. To use the `FASUpdate.py script`, use the `--xnames` flag to specify `b1`.
+ +The [`FASUpdate.py script`](FASUpdate_Script.md) can be used to update `pld_active` - use recipe `foxconn_nodeBMC_pld.json` + +To update using a JSON file and the Cray CLI, use this example JSON file and follow the [Updating Paradise Firmware with JSON and the Cray CLI Procedure](#update-paradise-firmware-using-json-file-and-cray-cli) + +```json +{ +"stateComponentFilter": { + "xnames": [ "x3000c0s3b1" ], + "deviceTypes": [ "nodeBMC" ] + }, +"inventoryHardwareFilter": { + "manufacturer": "foxconn" + }, +"targetFilter": { + "targets": [ "pld_active" ] + }, +"command": { + "version": "latest", + "tag": "default", + "overrideDryrun": false, + "restoreNotPossibleOverride": true, + "timeLimit": 1000, + "description": "Dryrun upgrade of Foxconn pld_active" + } +} +``` + +## Update Paradise firmware using JSON file and Cray CLI + +**NOTE:** The [`FASUpdate.py script`](FASUpdate_Script.md) can be used to perform default updates to firmware and BIOS. + +1. Create a JSON file using the example recipe. + +1. Initiate a dry-run to verify the firmware that will be updated and the version it will update to. + + 1. (`ncn#`) Create the dry-run session. + + The `overrideDryrun = false` value indicates that the command will do a dry run. + + ```bash + cray fas actions create nodeBMC.json --format toml + ``` + + Example output: + + ```toml + overrideDryrun = false + actionID = "fddd0025-f5ff-4f59-9e73-1ca2ef2a432d" + ``` + + 1. (`ncn#`) Describe the `actionID` for firmware update dry-run job. + + Replace the `actionID` value with the string returned in the previous step. In this example, `"fddd0025-f5ff-4f59-9e73-1ca2ef2a432d"` is used.
+ + ```bash + cray fas actions describe {actionID} --format toml + ``` + + Example output: + + ```toml + blockedBy = [] + state = "completed" + actionID = "fddd0025-f5ff-4f59-9e73-1ca2ef2a432d" + startTime = "2020-08-31 15:49:44.568271843 +0000 UTC" + snapshotID = "00000000-0000-0000-0000-000000000000" + endTime = "2020-08-31 15:51:35.426714612 +0000 UTC" + + [command] + description = "Update Foxconn Node BMCs Dryrun" + tag = "default" + restoreNotPossibleOverride = true + timeLimit = 10000 + version = "latest" + overrideDryrun = false + ``` + + If `state = "completed"`, the dry-run has found and checked all the nodes. Check the following sections for more information: + + * Lists the nodes that have a valid image for updating: + + ```toml + [operationSummary.succeeded] + ``` + + * Lists the nodes that will not be updated because they are already at the correct version: + + ```toml + [operationSummary.noOperation] + ``` + + * Lists the nodes that had an error when attempting to update: + + ```toml + [operationSummary.failed] + ``` + + * Lists the nodes that do not have a valid image for updating: + + ```toml + [operationSummary.noSolution] + ``` + +1. Update the firmware after verifying that the dry-run worked as expected. + + 1. Edit the JSON file and update the values so an actual firmware update can be run. + + The following example is for the `nodeBMC.json` file. Update the following values: + + ```json + "overrideDryrun":true, + "description":"Update Foxconn Node BMCs" + ``` + + 1. (`ncn#`) Run the firmware update. + + The output `overrideDryrun = true` indicates that an actual firmware update job was created. A new `actionID` will also be displayed. + + ```bash + cray fas actions create nodeBMC.json --format toml + ``` + + Example output: + + ```toml + overrideDryrun = true + actionID = "bc40f10a-e50c-4178-9288-8234b336077b" + ``` + + The time it takes for a firmware action to finish varies. It can be a few minutes or over 20 minutes. 
+ + The BMC automatically reboots after the BMC firmware has been loaded. + +1. Retrieve the `operationID` and verify that the update is complete. + + ```bash + cray fas actions describe {actionID} --format toml + ``` + + Example output: + + ```toml + [operationSummary.failed] + [[operationSummary.failed.operationKeys]] + stateHelper = "unexpected change detected in firmware version. Expected nc.1.3.10-shasta-release.arm.2020-07-21T23:58:22+00:00.d479f59 got: nc.cronomatic-dev.arm.2019-09-24T13:20:24+00:00.9d0f8280" + fromFirmwareVersion = "nc.cronomatic-dev.arm.2019-09-24T13:20:24+00:00.9d0f8280" + xname = "x1005c6s4b0" + target = "BMC" + operationID = "e910c6ad-db98-44fc-bdc5-90477b23386f" + ``` + +1. (`ncn#`) View more details for an operation using the `operationID` from the previous step. + + Check the list of nodes for the `failed` or `completed` state. + + ```bash + cray fas operations describe {operationID} + ``` + + For example: + + ```bash + cray fas operations describe "e910c6ad-db98-44fc-bdc5-90477b23386f" --format toml + ``` + + Example output: + + ```toml + fromFirmwareVersion = "nc.cronomatic-dev.arm.2019-09-24T13:20:24+00:00.9d0f8280" + fromTag = "" + fromImageURL = "" + endTime = "2020-08-31 16:40:13.464321212 +0000 UTC" + actionID = "bc40f10a-e50c-4178-9288-8234b336077b" + startTime = "2020-08-31 16:28:01.228524446 +0000 UTC" + fromSemanticFirmwareVersion = "" + toImageURL = "" + model = "WNC_REV_B" + operationID = "e910c6ad-db98-44fc-bdc5-90477b23386f" + fromImageID = "00000000-0000-0000-0000-000000000000" + target = "BMC" + toImageID = "39c0e553-281d-4776-b68e-c46a2993485e" + toSemanticFirmwareVersion = "1.3.10" + refreshTime = "2020-08-31 16:40:13.464325422 +0000 UTC" + blockedBy = [] + toTag = "" + state = "failed" + stateHelper = "unexpected change detected in firmware version. 
Expected nc.1.3.10-shasta-release.arm.2020-07-21T23:58:22+00:00.d479f59 got: nc.cronomatic-dev.arm.2019-09-24T13:20:24+00:00.9d0f8280" + deviceType = "NodeBMC" + ``` + + Once the firmware and BIOS are updated, the compute nodes can be powered back on. + + If the nodes have never been powered on in the system before (they are being added during a hardware add procedure), then use the Boot Orchestration Service (BOS) to power them on. + Using BOS will prepare the initial boot artifacts required to boot them. If this is not the first time they have been powered on in this system, then you can use the Power Control Service \(PCS\) to power them on. + +## Upload Paradise images to TFTP server + +(`ncn#`) To check whether a firmware image has been uploaded to the TFTP server: + +```bash +kubectl -n services exec -it `kubectl get pods -n services -l app.kubernetes.io/instance=cms-ipxe -o custom-columns=NS:.metadata.name --no-headers | head -1` -- ls /shared_tftp +``` + +(`ncn#`) If the firmware file you need is not listed, run the following command to copy the file from S3 to the TFTP server: + +```bash +/usr/share/doc/csm/scripts/operations/firmware/upload_foxconn_images_tftp.py +``` + +## Reset BMC + +This will reset the BMC to factory defaults - including resetting the BMC username and password. +*Only do this if required!* + +Before BMC firmware update (`ncn#`): + +The nodes must be **OFF** before updating BMC (when doing a reset) + +```bash +ssh admin@$(xname) 'fw_setenv openbmconce "factory-reset"' +``` + +**Update BMC firmware using one of the methods above** +NOTE: If the password changes after the boot of BMC, FAS will no longer be able to verify the update and will fail after the time limit. + +After firmware update (`ncn#`): + +If the password changed to something other than what is stored in vault, update the BMC password: + +```bash +ssh admin@$(xname) 'ipmitool user set password 1 "password"' +``` + +Boot the node.
diff --git a/operations/firmware/FAS_Use_Cases.md b/operations/firmware/FAS_Use_Cases.md index 651a1dede63e..87cf42202d6f 100644 --- a/operations/firmware/FAS_Use_Cases.md +++ b/operations/firmware/FAS_Use_Cases.md @@ -19,11 +19,11 @@ See [Configure the Cray CLI](../configure_cray_cli.md). The following procedures are included in this section: -1. [Update liquid-cooled compute node BMC, FPGA, management Ethernet, `AccVBIOS`, `AccUC` and BIOS](#liquid-cooled-nodes-update-procedures) -1. [Update air-cooled compute node BMC, BIOS, iLO 5, iLO 6, and system ROM](#update-air-cooled-compute-node-bmc-bios-ilo-5-ilo-6-and-system-rom) -1. [Update Chassis Management Module (CMM) firmware](#update-chassis-management-module-firmware) -1. [Update NCN BIOS and BMC firmware with FAS](#update-non-compute-node-ncn-bios-and-bmc-firmware) -1. [Compute node BIOS workaround for HPE CRAY EX425](#compute-node-bios-workaround-for-hpe-cray-ex425) +* [Update liquid-cooled compute node BMC, FPGA, management Ethernet, `AccVBIOS`, `AccUC` and BIOS](#liquid-cooled-nodes-update-procedures) +* [Update air-cooled compute node BMC, BIOS, iLO 5, iLO 6, and system ROM](#update-air-cooled-compute-node-bmc-bios-ilo-5-ilo-6-and-system-rom) +* [Update Chassis Management Module (CMM) firmware](#update-chassis-management-module-firmware) +* [Update NCN BIOS and BMC firmware with FAS](#update-non-compute-node-ncn-bios-and-bmc-firmware) +* [Compute node BIOS workaround for HPE CRAY EX425](#compute-node-bios-workaround-for-hpe-cray-ex425) > **NOTE:** To update switch Controllers \(sC\) or `RouterBMCs`, refer to the Rosetta documentation. @@ -200,7 +200,7 @@ If nodes are not off when the update command is issued, it will report as a fail It is recommended that the `Node0/1` BIOS be updated in a separate action, after a BMC update. It is also recommended that the nodes be powered back on after the updates are completed. 
If the nodes have never been powered on in the system before (they are being added during a hardware add procedure), then use the Boot Orchestration Service (BOS) to power them on. -Using BOS will prepare the initial boot artifacts required to boot them. If this is not the first time they have been powered on in this system, then you can use the Power Control Service \(PCS\) to power them on. +Using BOS will prepare the initial boot artifacts required to boot them. If this is not the first time they have been powered on in this system, then you can use the Power Control Service \(PCS\) to power them on. ```json { diff --git a/operations/iuf/workflows/deploy_product.md b/operations/iuf/workflows/deploy_product.md index 50b8fbc73796..c16ca5882d66 100644 --- a/operations/iuf/workflows/deploy_product.md +++ b/operations/iuf/workflows/deploy_product.md @@ -1,8 +1,8 @@ # Deploy product -- [1. Execute the IUF `deploy-product` stage](#1-execute-the-iuf-deploy-product-stage) -- [2. Upgrade Kubernetes](#2-upgrade-kubernetes) -- [3. Next steps](#3-next-steps) +1. [Execute the IUF `deploy-product` stage](#1-execute-the-iuf-deploy-product-stage) +1. [Upgrade Kubernetes](#2-upgrade-kubernetes) +1. [Next steps](#3-next-steps) ## 1. Execute the IUF `deploy-product` stage diff --git a/operations/iuf/workflows/product_delivery.md b/operations/iuf/workflows/product_delivery.md index 59618deeef22..ab46dcde7a04 100644 --- a/operations/iuf/workflows/product_delivery.md +++ b/operations/iuf/workflows/product_delivery.md @@ -2,11 +2,11 @@ This section ensures the product content is loaded onto the system and available for later steps in the workflow. -- [1. Execute the IUF `process-media` and `pre-install-check` stages](#1-execute-the-iuf-process-media-and-pre-install-check-stages) -- [2. Update `customizations.yaml`](#2-update-customizationsyaml) -- [3. Execute the IUF `deliver-product` stage](#3-execute-the-iuf-deliver-product-stage) -- [4. 
Perform manual product delivery operations](#4-perform-manual-product-delivery-operations) -- [5. Next steps](#5-next-steps) +1. [Execute the IUF `process-media` and `pre-install-check` stages](#1-execute-the-iuf-process-media-and-pre-install-check-stages) +1. [Update `customizations.yaml`](#2-update-customizationsyaml) +1. [Execute the IUF `deliver-product` stage](#3-execute-the-iuf-deliver-product-stage) +1. [Perform manual product delivery operations](#4-perform-manual-product-delivery-operations) +1. [Next steps](#5-next-steps) ## 1. Execute the IUF `process-media` and `pre-install-check` stages diff --git a/operations/iuf/workflows/upgrade_csm_iuf_additional_products_with_iuf.md b/operations/iuf/workflows/upgrade_csm_iuf_additional_products_with_iuf.md index 5cee71302688..07eee6985744 100644 --- a/operations/iuf/workflows/upgrade_csm_iuf_additional_products_with_iuf.md +++ b/operations/iuf/workflows/upgrade_csm_iuf_additional_products_with_iuf.md @@ -1,6 +1,6 @@ # Upgrade CSM and additional products with IUF -**Note: From CSM 1.6 , CSM supports upgrade through IUF. All CSM specific steps mentioned in [Upgrade CSM manually and additional products with IUF](upgrade_csm_manual_and_additional_products_with_iuf.md) are now part of CSM Upgrade with IUF.** +**Note: From CSM 1.6, CSM supports upgrade through IUF. All CSM-specific steps mentioned in [Upgrade CSM manually and additional products with IUF](upgrade_csm_manual_and_additional_products_with_iuf.md) are now part of CSM Upgrade with IUF.** All stages of `iuf` are executed in this option. 
All of the new product software provided in the recipe release is deployed and all [management NCNs](../../../glossary.md#management-nodes) and managed diff --git a/operations/multi-tenancy/Create_a_Tenant.md b/operations/multi-tenancy/Create_a_Tenant.md index 474db174564e..e8e497452b45 100644 --- a/operations/multi-tenancy/Create_a_Tenant.md +++ b/operations/multi-tenancy/Create_a_Tenant.md @@ -310,7 +310,7 @@ spec: accessModes: - ReadWriteOnce storage: 512Gi - # Backup daily at 9:10PM (doesn't conflict with other CSM DB backups) + # Backup daily at 9:10PM (does not conflict with other CSM DB backups) schedule: "10 21 * * *" keep: 3 resources: diff --git a/operations/network/management_network/aruba/mstp.md b/operations/network/management_network/aruba/mstp.md index 30239f5a275c..51f42aea7137 100644 --- a/operations/network/management_network/aruba/mstp.md +++ b/operations/network/management_network/aruba/mstp.md @@ -1,26 +1,28 @@ # Multiple Spanning Tree Protocol (MSTP) -MSTP (802.1s) ensures that only one active path exists between any two nodes in a spanning-tree instance. A spanning-tree instance comprises a unique set of VLANs. MSTP instances significantly improve network resource utilization while maintaining a loop-free environment. +MSTP (802.1s) ensures that only one active path exists between any two nodes in a spanning-tree instance. +A spanning-tree instance comprises a unique set of VLANs. MSTP instances significantly improve network +resource utilization while maintaining a loop-free environment. 
-## Configuration Commands +## Configuration commands -Enable MSTP (default mode for spanning-tree): +(`sw#`) Enable MSTP (default mode for spanning-tree): ```text -switch(config)# spanning-tree -switch(config)# spanning-tree config-name -switch(config)# spanning-tree config-revision Configure an MSTP instance and priority -switch(config)# spanning-tree instance VALUE vlan VLANS -switch(config)# spanning-tree instance VALUE priority VALUE +spanning-tree +spanning-tree config-name +spanning-tree config-revision Configure an MSTP instance and priority +spanning-tree instance VALUE vlan VLANS +spanning-tree instance VALUE priority VALUE ``` -Show commands to validate functionality: +(`sw#`) Show commands to validate functionality: ```text show spanning-tree mst detail ``` -## Example Output +## Example output ```text show span @@ -44,7 +46,7 @@ Port Role State Cost Priority Type ------------ -------------- ------------ ------- ---------- ---------- ``` -## Expected Results +## Expected results 1. Spanning-tree mode is configured 2. Spanning-tree is enabled, if loops are detected ports should go blocked state diff --git a/operations/network/management_network/dell/mstp.md b/operations/network/management_network/dell/mstp.md index ee3c80932a55..38a26e811ed7 100644 --- a/operations/network/management_network/dell/mstp.md +++ b/operations/network/management_network/dell/mstp.md @@ -1,12 +1,12 @@ # Configure Multiple Spanning Tree Protocol (MSTP) MSTP (802.1s) ensures that only one active path exists between any two nodes in a spanning-tree instance. -A spanning-tree instance comprises a unique set of VLANs. -MSTP instances significantly improve network resource utilization while maintaining a loop-free environment. +A spanning-tree instance comprises a unique set of VLANs. MSTP instances significantly improve network +resource utilization while maintaining a loop-free environment. 
-## Configuration Commands +## Configuration commands -Enable MSTP (default mode for spanning-tree): +(`sw#`) Enable MSTP (default mode for spanning-tree): ```text spanning-tree mode mst @@ -14,13 +14,13 @@ name my-mstp-region revision 0 ``` -Show commands to validate functionality: +(`sw#`) Show commands to validate functionality: ```text show spanning-tree mst ``` -## Expected Results +## Expected results 1. Spanning-tree mode is configured 2. Spanning-tree is enabled, if loops are detected ports should go blocked state diff --git a/operations/network/management_network/mellanox/mstp.md b/operations/network/management_network/mellanox/mstp.md index 21d6749bd629..a2b6adf98036 100644 --- a/operations/network/management_network/mellanox/mstp.md +++ b/operations/network/management_network/mellanox/mstp.md @@ -1,28 +1,30 @@ # Multiple spanning tree protocol (MSTP) -MSTP (802.1s) ensures that only one active path exists between any two nodes in a spanning-tree instance. A spanning-tree instance comprises a unique set of VLANs. MSTP instances significantly improve network resource utilization while maintaining a loop-free environment. +MSTP (802.1s) ensures that only one active path exists between any two nodes in a spanning-tree instance. +A spanning-tree instance comprises a unique set of VLANs. MSTP instances significantly improve network +resource utilization while maintaining a loop-free environment. 
-Relevant Configuration +## Configuration commands -Enable MSTP (default mode for spanning-tree) +(`sw#`) Enable MSTP (default mode for spanning-tree): -``` -switch(config)# spanning-tree -switch(config)# spanning-tree mode mstp -switch(config)# spanning-tree mst revision 1 -switch(config)# spanning-tree mst name mellanox +```text +spanning-tree +spanning-tree mode mstp +spanning-tree mst revision 1 +spanning-tree mst name mellanox ``` -Show Commands to Validate Functionality +(`sw#`) Show commands to validate functionality: -``` +```text show spanning-tree ``` -Expected Results +## Expected results -* Step 1: Spanning-tree mode is configured -* Step 2: Spanning-tree is enabled, if loops are detected ports should go blocked state. -* Step 3: Spanning-tree splits traffic domain between two DUTs +1. Spanning-tree mode is configured +1. Spanning-tree is enabled, if loops are detected ports should go blocked state. +1. Spanning-tree splits traffic domain between two DUTs [Back to Index](../README.md) diff --git a/operations/node_management/Replacing_Foxconn_User_Pass.md b/operations/node_management/Replacing_Foxconn_User_Pass.md index 6a9ba454d4b2..268f21ffc8dc 100644 --- a/operations/node_management/Replacing_Foxconn_User_Pass.md +++ b/operations/node_management/Replacing_Foxconn_User_Pass.md @@ -1,43 +1,43 @@ -# Replacing Foxconn Username and Passwords in Vault - -Foxconn (Paradise) nodes may be shipped with a different default username and password then the system password. -Because of the difference in user/password, these nodes will not be able to be discovered. -Vault needs to be updated with the Foxconn username and password using the `FoxconnUserPass.py` script or manually. - -## Procedure using the `FoxconnUserPass.py` script - -1. (`ncn-mw#`) Set up API token.
- - ```bash - export TOKEN=$(curl -k -s -S -d grant_type=client_credentials -d client_id=admin-client -d client_secret=$(kubectl get secrets admin-client-auth -o jsonpath='{.data.client-secret}' | base64 -d) https://api-gw-service-nmn.local/keycloak/realms/shasta/protocol/openid-connect/token | jq -r '.access_token') - ``` - -1. (`ncn-mw#`) Set helper variable. - - ```bash - DOCS_DIR=/usr/share/doc/csm/scripts - ``` - -1. (`ncn-mw#`) Run the Foxconn update script - - ```bash - $DOCS_DIR/operations/hardware_state_manager/FoxconnUserPass.py - ``` - - This will ask for the BMC username and password for the Paradise nodes. - The scirpt will look for undiscovered nodes, if it finds a Foxconn node, update vault with correct credentials. - -1. (`ncn-mw#`) Wait 10+ minutes for changes to take affect and nodes to be discovered. To check nodes which have failed to be discovered: - - ```bash - cray hsm inventory redfishEndpoints list --format json | jq '.[] | .[] | select (.DiscoveryInfo.LastDiscoveryStatus!="DiscoverOK")' - ``` - -## Manual procedure to update credentials in vault - -1. (`ncn-mw#`) Use the Cray CLI to update vault through HSM (replace `BMC_xname` with the xname of the BMC, `Foxconn_user` with the Foxconn default username, and `Foxconn_pass` with the Foxconn default password): - NOTE: `BMC_xname` needs to be in the line twice - - ```bash - cray hsm inventory redfishEndpoints update BMC_xname -id BMC_xname --user Foxconn_user --password Foxconn_pass - ``` +# Replacing Foxconn Username and Passwords in Vault + +Foxconn (Paradise) nodes may be shipped with a different default username and password than the system password. +Because of the difference in user/password, these nodes will not be able to be discovered. +Vault needs to be updated with the Foxconn username and password using the `FoxconnUserPass.py` script or manually. + +## Procedure using the `FoxconnUserPass.py` script + +1. (`ncn-mw#`) Set up API token.
+ + ```bash + export TOKEN=$(curl -k -s -S -d grant_type=client_credentials -d client_id=admin-client -d client_secret=$(kubectl get secrets admin-client-auth -o jsonpath='{.data.client-secret}' | base64 -d) https://api-gw-service-nmn.local/keycloak/realms/shasta/protocol/openid-connect/token | jq -r '.access_token') + ``` + +1. (`ncn-mw#`) Set helper variable. + + ```bash + DOCS_DIR=/usr/share/doc/csm/scripts + ``` + +1. (`ncn-mw#`) Run the Foxconn update script + + ```bash + $DOCS_DIR/operations/hardware_state_manager/FoxconnUserPass.py + ``` + + This will ask for the BMC username and password for the Paradise nodes. + The script will look for undiscovered nodes; if it finds a Foxconn node, it will update vault with correct credentials. + +1. (`ncn-mw#`) Wait 10+ minutes for changes to take effect and nodes to be discovered. To check nodes which have failed to be discovered: + + ```bash + cray hsm inventory redfishEndpoints list --format json | jq '.[] | .[] | select (.DiscoveryInfo.LastDiscoveryStatus!="DiscoverOK")' + ``` + +## Manual procedure to update credentials in vault + +1. 
(`ncn-mw#`) Use the Cray CLI to update vault through HSM (replace `BMC_xname` with the xname of the BMC, `Foxconn_user` with the Foxconn default username, and `Foxconn_pass` with the Foxconn default password): + NOTE: `BMC_xname` needs to be in the line twice + + ```bash + cray hsm inventory redfishEndpoints update BMC_xname -id BMC_xname --user Foxconn_user --password Foxconn_pass + ``` diff --git a/operations/power_management/Cray_Advanced_Platform_Monitoring_and_Control_CAPMC.md b/operations/power_management/Cray_Advanced_Platform_Monitoring_and_Control_CAPMC.md index 6647a1cba447..b56355b20965 100644 --- a/operations/power_management/Cray_Advanced_Platform_Monitoring_and_Control_CAPMC.md +++ b/operations/power_management/Cray_Advanced_Platform_Monitoring_and_Control_CAPMC.md @@ -1,6 +1,6 @@ # Cray Advanced Platform Monitoring and Control (CAPMC) -NOTE: CAPMC was deprecated in CSM 1.5 and may be removed in the future. See [Power Control Service (PCS)](../../glossary.md#power-control-service-pcs) for its replacement. +NOTE: CAPMC was deprecated in CSM 1.5 and may be removed in the future. See [Power Control Service (PCS)](../../glossary.md#power-control-service-pcs) for its replacement. The Cray Advanced Platform Monitoring and Control (CAPMC) service enables direct hardware control of nodes, compute blades, router modules, and liquid-cooled diff --git a/operations/power_management/Recover_from_a_Liquid_Cooled_Cabinet_EPO_Event.md b/operations/power_management/Recover_from_a_Liquid_Cooled_Cabinet_EPO_Event.md index 7add04dcb0a8..96e3629f7fc3 100644 --- a/operations/power_management/Recover_from_a_Liquid_Cooled_Cabinet_EPO_Event.md +++ b/operations/power_management/Recover_from_a_Liquid_Cooled_Cabinet_EPO_Event.md @@ -10,7 +10,7 @@ If a Cray EX liquid-cooled cabinet or cooling group experiences an EPO event, th 1. Verify that the EPO event did not damage the system hardware. -2. From `ncn-m001`, check the status of the chassis. +1. 
(`ncn-mw#`) Check the status of the chassis. ```bash cray power status list --xnames x9000c[1,3] --format toml @@ -18,7 +18,7 @@ If a Cray EX liquid-cooled cabinet or cooling group experiences an EPO event, th Example output: - ```text + ```toml [[status]] xname = "x9000c1" powerState = "off" @@ -36,7 +36,7 @@ If a Cray EX liquid-cooled cabinet or cooling group experiences an EPO event, th lastUpdated = "2024-02-04T01:48:48.240138908Z" ``` -3. Check the Chassis Controller Module \(CCM\) log for `Critical` messages and the EPO event. +1. (`ncn#`) Check the Chassis Controller Module \(CCM\) log for `Critical` messages and the EPO event. ```bash ssh x9000c1b0 egrep \"Critical\|= No\" /var/log/messages @@ -51,7 +51,7 @@ If a Cray EX liquid-cooled cabinet or cooling group experiences an EPO event, th Apr 11 04:00:06 x9000c1 user.info redfish-cmmd[4453]: rbe_set_chassis_status: Update Chassis 'Enclosure' Status: UnavailableOffline, Critical ``` -4. Disable the hms-discovery Kubernetes cron job. +1. (`ncn-mw#`) Disable the `hms-discovery` Kubernetes cron job. ```bash kubectl -n services patch cronjobs hms-discovery -p '{"spec" : {"suspend" : true }}' @@ -59,7 +59,7 @@ If a Cray EX liquid-cooled cabinet or cooling group experiences an EPO event, th **CAUTION:** Do not power the system on until it is safe to do so. Determine why the EPO event occurred before clearing the EPO state. -5. **If it is safe to power on the hardware**, clear all chassis in the EPO state in the cooling group. +1. (`ncn-mw#`) **If it is safe to power on the hardware**, clear all chassis in the EPO state in the cooling group. All chassis in cabinets 1000-1003 are forced off in this example. Power off all chassis in a cooling group simultaneously, or the EPO condition may persist. @@ -73,23 +73,26 @@ If a Cray EX liquid-cooled cabinet or cooling group experiences an EPO event, th cray power transition force-off --xnames x9000c[1,3] ``` -6. Restart the hms-discovery cron job. +1. 
(`ncn-mw#`) Restart the `hms-discovery` cron job. ```bash kubectl -n services patch cronjobs hms-discovery -p '{"spec" : {"suspend" : false }}' ``` - About 5 minutes after hms-discovery restarts, the service will power on the chassis enclosures, switches, and compute blades. If components are not being powered back on, then power them on manually. + About 5 minutes after `hms-discovery` restarts, the service will power on the chassis enclosures, switches, and compute blades. + If components are not being powered back on, then power them on manually. ```bash cray power transition on -xnames x[1000-1003]c[0-7]r[0-7],x[1000-1003]c[0-7]s[0-7] --include parents ``` -7. Verify the Slingshot fabric is up and healthy. +1. Verify the Slingshot fabric is up and healthy. + Refer to the following documentation for more information on how to verify the health of the Slingshot Fabric: + * The *Slingshot Administration Guide* PDF for HPE Cray EX systems. * The *Slingshot Troubleshooting Guide* PDF. -8. After the components have powered on, boot the nodes using the Boot Orchestration Services \(BOS\). +1. After the components have powered on, boot the nodes using the Boot Orchestration Services \(BOS\). See [Power On and Boot Managed Nodes](Power_On_and_Boot_Managed_Nodes.md). diff --git a/operations/security_and_authentication/Change_EX_Liquid-Cooled_Cabinet_Global_Default_Password.md b/operations/security_and_authentication/Change_EX_Liquid-Cooled_Cabinet_Global_Default_Password.md index c93356d7040c..2769d6acf393 100644 --- a/operations/security_and_authentication/Change_EX_Liquid-Cooled_Cabinet_Global_Default_Password.md +++ b/operations/security_and_authentication/Change_EX_Liquid-Cooled_Cabinet_Global_Default_Password.md @@ -10,21 +10,21 @@ procedures. - The Cray command line interface (CLI) tool is initialized and configured on the system. See [Configure the Cray Command Line Interface (`cray` CLI)](../configure_cray_cli.md) for more information. 
- Review procedures in [Manage System Passwords](Manage_System_Passwords.md). -### Procedure +## Procedure -1. If necessary, shut down compute nodes in each cabinet. Refer to [Shut Down and Power Off Managed Nodes](../power_management/Shut_Down_and_Power_Off_Managed_Nodes.md). +1. (`ncn-mw#`) If necessary, shut down compute nodes in each cabinet. Refer to [Shut Down and Power Off Managed Nodes](../power_management/Shut_Down_and_Power_Off_Managed_Nodes.md). ```screen sat bootsys shutdown --stage bos-operations --bos-templates COS_SESSION_TEMPLATE ``` -1. Disable the `hms-discovery` Kubernetes cron job. +1. (`ncn-mw#`) Disable the `hms-discovery` Kubernetes cron job. ```screen kubectl -n services patch cronjobs hms-discovery -p '{"spec" : {"suspend" : true }}' ``` -1. Power off all compute slots in the cabinets the passwords are to be changed on. +1. (`ncn-mw#`) Power off all compute slots in the cabinets the passwords are to be changed on. > **`NOTE`**: If a chassis is not fully populated, specify each slot individually. diff --git a/operations/security_and_authentication/Update_Default_ServerTech_PDU_Credentials_used_by_the_Redfish_Translation_Service.md b/operations/security_and_authentication/Update_Default_ServerTech_PDU_Credentials_used_by_the_Redfish_Translation_Service.md index f59e7aaf05c4..3050be32a0b3 100644 --- a/operations/security_and_authentication/Update_Default_ServerTech_PDU_Credentials_used_by_the_Redfish_Translation_Service.md +++ b/operations/security_and_authentication/Update_Default_ServerTech_PDU_Credentials_used_by_the_Redfish_Translation_Service.md @@ -7,8 +7,8 @@ ServerTech PDUs and management network switches which do not natively support Re There are two sets of default credentials that are required for RTS to function: -1. The default credentials to use when new ServerTech PDUs are discovered in the system. -1. The global default credential that RTS uses for its Redfish interface with other CSM services. 
+- The default credentials to use when new ServerTech PDUs are discovered in the system. +- The global default credential that RTS uses for its Redfish interface with other CSM services. ***NOTE*** RTS management network switch Redfish interfaces only use the global default RTS password. The username comes from the SNMP credentials pushed by HMS Discovery. See [Update Default Air-Cooled BMC and Leaf-BMC Switch SNMP Credentials](Update_Default_Air-Cooled_BMC_and_Leaf_BMC_Switch_SNMP_Credentials.md) to manage the SNMP credentials. @@ -20,13 +20,11 @@ credential when getting added to the system. > [Change Credentials on ServerTech PDUs](Change_Credentials_on_ServerTech_PDUs.md) procedure. However, this procedure will update the global default credential that RTS > uses for its Redfish interface to other CSM services. -- [Procedure](#procedure) - - 1. [Update credentials and redeploy RTS](#1-update-credentials-and-redeploy-rts) - 1. [Restart the SNMP-backed RTS to pick up the global RTS credential changes](#2-restart-the-snmp-backed-rts-to-pick-up-the-global-rts-credential-changes) - ## Procedure +1. [Update credentials and redeploy RTS](#1-update-credentials-and-redeploy-rts) +1. [Restart the SNMP-backed RTS to pick up the global RTS credential changes](#2-restart-the-snmp-backed-rts-to-pick-up-the-global-rts-credential-changes) + ### 1. Update credentials and redeploy RTS Follow the [Redeploying a Chart](../CSM_product_management/Redeploying_a_Chart.md) procedure **with the following specifications**: diff --git a/scripts/operations/configuration/copy_ims_data_from_minio.py b/scripts/operations/configuration/copy_ims_data_from_minio.py index 79ca202d679d..b2dd62272fa9 100755 --- a/scripts/operations/configuration/copy_ims_data_from_minio.py +++ b/scripts/operations/configuration/copy_ims_data_from_minio.py @@ -282,7 +282,7 @@ def assign_artifacts(self, all_artifacts: List[Artifact]) -> None: directory. 
Then print a summary of how much has been assigned to each directory. """ - logging.info("Determing download location for each artifact") + logging.info("Determining download location for each artifact") for artifact in sorted(all_artifacts, key=lambda a: a.size_bytes, reverse=True): self.add_artifact(artifact) self.print_artifact_summary() diff --git a/scripts/operations/hardware_state_manager/FoxconnUserPass.py b/scripts/operations/hardware_state_manager/FoxconnUserPass.py index 13ce6edbb77b..235bb24f2906 100755 --- a/scripts/operations/hardware_state_manager/FoxconnUserPass.py +++ b/scripts/operations/hardware_state_manager/FoxconnUserPass.py @@ -31,7 +31,7 @@ token = os.environ.get('TOKEN') if token is None or token == "": - print("Error environment variable TOKEN was not set") + print("ERROR: Environment variable TOKEN was not set") print('Run the following to set the TOKEN:') print('''export TOKEN=$(curl -k -s -S -d grant_type=client_credentials \\ -d client_id=admin-client -d client_secret=`kubectl get secrets admin-client-auth \\ @@ -86,7 +86,7 @@ if "Vendor" in redfish: vendor = redfish["Vendor"] if vendor == "Foxconn": - print("UPDATE VAULT AUTHINCATION") + print("UPDATE VAULT AUTHENTICATION") url = "https://api-gw-service-nmn.local/apis/smd/hsm/v2/Inventory/RedfishEndpoints/" + xname headers = { 'Content-Type': "application/json", diff --git a/troubleshooting/kubernetes/Kubernetes_Kube_apiserver_failing.md b/troubleshooting/kubernetes/Kubernetes_Kube_apiserver_failing.md index fdc35cc1faaa..a100da630ec4 100644 --- a/troubleshooting/kubernetes/Kubernetes_Kube_apiserver_failing.md +++ b/troubleshooting/kubernetes/Kubernetes_Kube_apiserver_failing.md @@ -119,7 +119,7 @@ E0724 19:46:37.872059 1 cacher.go:420] cacher (*core.Secret): unexpected L SRC_NODE=ncn-m002 ``` - 1. Copy `/etc/cray/kubernetes/encryption` files from the `SRC_NODE` to the node where encryption needs to be enaled. + 1. 
Copy `/etc/cray/kubernetes/encryption` files from the `SRC_NODE` to the node where encryption needs to be enabled. ```bash scp ${SRC_NODE}:/etc/cray/kubernetes/encryption/* /etc/cray/kubernetes/encryption/ diff --git a/upgrade/scripts/upgrade/prerequisites.sh b/upgrade/scripts/upgrade/prerequisites.sh index b089f2bcebf0..a1f38d3aca58 100755 --- a/upgrade/scripts/upgrade/prerequisites.sh +++ b/upgrade/scripts/upgrade/prerequisites.sh @@ -801,7 +801,7 @@ if [[ ${state_recorded} == "0" && $(hostname) == "${PRIMARY_NODE}" ]]; then fi # Ensure the cert-manager namespace is deleted in a case of both helm charts - # removed but there might be detritous leftover in the namespace. + # removed but there might be detritus left over in the namespace. kubectl delete namespace "${cmns}" || : tmp_manifest=/tmp/certmanager-tmp-manifest.yaml