diff --git a/scripts/reboot b/scripts/reboot
index 044334af3e..70925e63d3 100755
--- a/scripts/reboot
+++ b/scripts/reboot
@@ -37,10 +37,17 @@ EXIT_NEXT_IMAGE_NOT_EXISTS=4
 EXIT_SONIC_INSTALLER_VERIFY_REBOOT=21
 EXIT_PLATFORM_FW_AU_FAILURE=22
 PLATFORM_FWUTIL_AU_REBOOT_HANDLE="platform_fw_au_reboot_handle"
+PLATFORM_JSON_FILE="platform.json"
+PLATFORM_JSON_PATH="${DEVPATH}/${PLATFORM}/${PLATFORM_JSON_FILE}"
 REBOOT_SCRIPT_NAME=$(basename $0)
 REBOOT_TYPE="${REBOOT_SCRIPT_NAME}"
 TAG_LATEST=no
 REBOOT_FLAGS=""
+FORCE_REBOOT="no"
+SMART_SWITCH="no"
+DPU_MODULE_NAME=""
+REBOOT_DPU="no"
+PRE_SHUTDOWN="no"
 
 function debug()
 {
@@ -128,6 +135,8 @@ function show_help_and_exit()
     echo " "
     echo "    Available options:"
     echo "        -h, -? : getting this help"
+    echo "        -d     : DPU module name on a smart switch, option is invalid when on DPU"
+    echo "        -p     : Pre-shutdown steps on DPU, invalid on NPU"
 
     exit ${EXIT_SUCCESS}
 }
@@ -154,7 +163,7 @@ function reboot_pre_check()
         ${DEVPATH}/${PLATFORM}/${PLATFORM_REBOOT_PRE_CHECK}
         [[ $? -ne 0 ]] && exit $?
     fi
-    
+
     # Verify the next image by sonic-installer
     local message=$(sonic-installer verify-next-image 2>&1)
     if [ $? -ne 0 ]; then
@@ -176,9 +185,128 @@ function check_conflict_boot_in_fw_update()
     fi
 }
 
+# Function to retrieve DPU IP from CONFIG_DB (result is left in the global dpu_ip)
+function get_dpu_ip()
+{
+    local DPU_NAME=$1
+    dpu_ip=$(sonic-db-cli CONFIG_DB HGET "DHCP_SERVER_IPV4_PORT|bridge-midplane|${DPU_NAME}" "ips@")
+    if [ $? -ne 0 ] || [ -z "$dpu_ip" ]; then
+        echo "Error: Failed to retrieve DPU IP address for ${DPU_NAME}"
+        exit ${EXIT_ERROR}
+    fi
+    debug "$DPU_NAME ip: $dpu_ip"
+}
+
+# Function to retrieve GNMI port from CONFIG_DB (result is left in the global port)
+function get_gnmi_port() {
+    local DPU_NAME=$1
+    port=$(sonic-db-cli CONFIG_DB HGET "DPU_PORT|$DPU_NAME" "gnmi")
+    if [ $? -ne 0 ] || [ -z "$port" ]; then
+        echo "Error: Failed to retrieve GNMI port"
+        exit ${EXIT_ERROR}
+    fi
+    debug "$DPU_NAME GNMI port:$port"
+}
+
+# Function to get reboot status from DPU (result is left in the global reboot_status)
+function get_reboot_status()
+{
+    local dpu_ip=$1
+    local port=$2
+    reboot_status=$(docker exec -i gnmi gnoi_client -target ${dpu_ip}:${port} -logtostderr -insecure -rpc RebootStatus)
+    if [ $? -ne 0 ] || [ -z "$reboot_status" ]; then
+        echo "Error: Failed to send reboot status command to DPU ${DPU_NAME}"
+        exit ${EXIT_ERROR}
+    fi
+    debug "$reboot_status"
+}
+
+# Function to retrieve DPU bus info from platform JSON (result is left in the global DPU_BUS_INFO)
+function get_dpu_bus_info() {
+    local DPU_NAME=$1
+    DPU_BUS_INFO=$(jq -r --arg DPU_NAME "${DPU_NAME}" '.DPUS[] | select(has($DPU_NAME)) | .[$DPU_NAME].bus_info' "$PLATFORM_JSON_PATH")
+    if [ -z "$DPU_BUS_INFO" ] || [ "$DPU_BUS_INFO" = "null" ]; then
+        echo "Error: bus_info not found for DPU ${DPU_NAME}"
+        exit ${EXIT_ERROR}
+    fi
+    debug "$DPU_NAME : $DPU_BUS_INFO"
+}
+
+# Function to reboot the platform module; the helper result must be printed to be captured
+function reboot_platform_module() {
+    local DPU_NAME=$1
+    reboot_status=$(python3 -c "import reboot_helper; print(reboot_helper.reboot_module('${DPU_NAME}'))")
+    if [ "$reboot_status" != "True" ]; then
+        echo "Error: Failed to reboot the platform"
+        exit ${EXIT_ERROR}
+    fi
+}
+
+function reboot_dpu_module()
+{
+    local DPU_NAME=$1
+    local DPU_INDEX=${DPU_NAME//[!0-9]/}
+
+    debug "User requested rebooting device ${DPU_NAME} ..."
+
+    # Retrieve DPU IP and GNMI port; called directly because the helpers set globals
+    get_dpu_ip "${DPU_NAME}"
+    get_gnmi_port "${DPU_NAME}"
+
+    if [ -z "$dpu_ip" ] || [ -z "$port" ]; then
+        echo "Error: Failed to retrieve DPU IP or GNMI port for ${DPU_NAME}"
+        exit ${EXIT_ERROR}
+    fi
+
+    # Issue GNOI client command to reboot the DPU
+    docker exec -i gnmi gnoi_client -target ${dpu_ip}:${port} -logtostderr -insecure -rpc Reboot -jsonin '{"method":3}'
+    if [ $? -ne 0 ]; then
+        echo "Error: Failed to send reboot command to DPU ${DPU_NAME}"
+        exit ${EXIT_ERROR}
+    fi
+
+    # Retrieve dpu_halt_services_timeout using jq; a missing key prints "null", so require a number
+    dpu_halt_services_timeout=$(jq -r '.dpu_halt_services_timeout' "$PLATFORM_JSON_PATH" 2>/dev/null)
+    if [ $? -ne 0 ] || ! [[ "$dpu_halt_services_timeout" =~ ^[0-9]+$ ]]; then
+        echo "Error: Failed to retrieve dpu_halt_services_timeout from ${PLATFORM_JSON_PATH}"
+        exit ${EXIT_ERROR}
+    fi
+
+    # Poll on reboot status response with a timeout mechanism
+    poll_interval=5
+    waited_time=0
+    while true; do
+        get_reboot_status "${dpu_ip}" "${port}"
+        debug "GNOI RebootStatus response ${reboot_status}"
+        is_reboot_active=$(echo "$reboot_status" | grep "active" | awk '{print $2}')
+        if [ "$is_reboot_active" == "false" ]; then
+            break
+        fi
+
+        sleep "$poll_interval"
+        waited_time=$((waited_time + poll_interval))
+        if [ $waited_time -ge $dpu_halt_services_timeout ]; then
+            echo "Error: Timeout waiting for DPU ${DPU_NAME} to finish rebooting"
+            exit ${EXIT_ERROR}
+        fi
+    done
+
+    # Check if DPU exists and retrieve bus info (sets the global DPU_BUS_INFO)
+    get_dpu_bus_info "${DPU_NAME}"
+
+    # Update STATE_DB and handle PCIe removal and rescan
+    sonic-db-cli state_db set "PCIE_DETACH_INFO|${DPU_NAME}" '{"dpu_id": "'${DPU_INDEX}'", "dpu_state": "detaching", "bus_info": "'${DPU_BUS_INFO}'"}'
+
+    echo 1 > /sys/bus/pci/devices/${DPU_BUS_INFO}/remove
+    reboot_platform_module "${DPU_NAME}"
+    echo 1 > /sys/bus/pci/rescan
+
+    sonic-db-cli state_db del "PCIE_DETACH_INFO|${DPU_NAME}"
+}
+
 function parse_options()
 {
-    while getopts "h?vf" opt; do
+    while getopts "h?vfpd:" opt; do
         case ${opt} in
             h|\? )
                 show_help_and_exit
@@ -192,6 +320,13 @@ function parse_options()
             f )
                 REBOOT_FLAGS+=" -f"
                 ;;
+            d )
+                REBOOT_DPU="yes"
+                DPU_MODULE_NAME="$OPTARG"
+                ;;
+            p )
+                PRE_SHUTDOWN="yes"
+                ;;
         esac
     done
 }
@@ -215,6 +350,56 @@ function linecard_reboot_notify_supervisor()
     fi
 }
 
+# Function to reboot all DPUs in parallel
+function reboot_all_dpus() {
+    local NUM_DPU=$1
+
+    for (( i=0; i<"$NUM_DPU"; i++ )); do
+        echo "Rebooting DPU module dpu$i"
+        reboot_dpu_module "dpu$i" &
+    done
+    wait
+}
+
+# Function to handle scenarios on smart switch
+function handle_smart_switch() {
+    if [ -f "$PLATFORM_JSON_PATH" ]; then
+        NUM_DPU=$(jq -r '.DPUS | length' "$PLATFORM_JSON_PATH" 2>/dev/null)
+        if [ "${NUM_DPU:-0}" -gt 0 ]; then
+            SMART_SWITCH="yes"
+        fi
+    fi
+
+    if [[ "$REBOOT_DPU" == "yes" ]]; then
+        if [[ "$SMART_SWITCH" == "yes" ]]; then
+            echo "User requested to reboot the device ${DPU_MODULE_NAME}"
+            reboot_dpu_module "$DPU_MODULE_NAME"  # NOTE(review): flow continues to the reboot of all DPUs and the switch below - confirm intended
+        else
+            echo "Invalid '-d' option specified for a non-smart switch"
+            exit ${EXIT_ERROR}
+        fi
+    fi
+
+    is_dpu=$(python3 -c "import reboot_helper; print(reboot_helper.is_dpu())")
+    debug "Is the platform DPU: $is_dpu"
+
+    # Check if system is a DPU and handle -p option accordingly
+    if [[ "$is_dpu" == "True" && "$PRE_SHUTDOWN" != "yes" ]]; then
+        echo "Invalid, '-p' option not specified for a DPU"
+        exit ${EXIT_ERROR}
+    elif [[ "$is_dpu" != "True" && "$PRE_SHUTDOWN" == "yes" ]]; then
+        echo "Invalid '-p' option specified for a non-DPU"
+        exit ${EXIT_ERROR}
+    fi
+
+    if [[ "$SMART_SWITCH" == "yes" ]]; then
+        # If not a DPU, reboot all DPUs in parallel
+        if [[ "$is_dpu" != "True" ]]; then
+            reboot_all_dpus "$NUM_DPU"
+        fi
+    fi
+}
+
 parse_options $@
 
 # Exit if not superuser
@@ -225,6 +410,8 @@ fi
 
 debug "User requested rebooting device ..."
 
+handle_smart_switch
+
 check_conflict_boot_in_fw_update
 
 setup_reboot_variables
@@ -287,6 +474,11 @@ if [ -x ${WATCHDOG_UTIL} ]; then
     ${WATCHDOG_UTIL} arm
 fi
 
+if [[ "${PRE_SHUTDOWN}" == "yes" ]]; then
+    echo "${DPU_MODULE_NAME} pre-shutdown steps are completed"
+    exit ${EXIT_SUCCESS}
+fi
+
 if [ -x ${DEVPATH}/${PLATFORM}/${PLAT_REBOOT} ]; then
     VERBOSE=yes debug "Rebooting with platform ${PLATFORM} specific tool ..."
     ${DEVPATH}/${PLATFORM}/${PLAT_REBOOT} $@
diff --git a/scripts/reboot_helper.py b/scripts/reboot_helper.py
new file mode 100644
index 0000000000..113b7d6c69
--- /dev/null
+++ b/scripts/reboot_helper.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+#
+# reboot_helper.py
+#
+# Utility helper for reboot within SONiC
+
+import os
+import re
+import sys
+import json
+import sonic_platform
+from sonic_py_common import logger, device_info
+from utilities_common.chassis import is_smartswitch
+
+SYSLOG_IDENTIFIER = "reboot_helper"
+
+EXIT_FAIL = -1
+EXIT_SUCCESS = 0
+ERROR_NOT_IMPLEMENTED = 1
+ERROR_EXCEPTION = 2
+
+# Global logger instance
+log = logger.Logger(SYSLOG_IDENTIFIER)
+
+# Global variable for platform chassis
+platform_chassis = None
+
+
+def get_all_dpus():
+    """
+    Retrieve a list of all DPUs (Data Processing Units) in the system.
+    This function checks if the platform is a smartswitch and then loads the platform.json
+    file to extract the DPUs dictionary. It converts the DPU names to uppercase and returns
+    them as a list.
+
+    Returns:
+        list: A list of DPU names in uppercase.
+    """
+    dpu_list = []
+
+    if not is_smartswitch():
+        return dpu_list
+
+    # Load platform.json
+    platform_info = device_info.get_platform_info()
+    platform = platform_info.get('platform')
+    if not platform:
+        log.log_error("Platform does not exist in platform_info")
+        return dpu_list
+    platform_json_path = os.path.join("/usr/share/sonic/device", platform, "platform.json")
+    try:
+        with open(platform_json_path, 'r') as platform_json:
+            config_data = json.load(platform_json)
+
+            # Extract DPUs dictionary
+            dpus = config_data.get("DPUS", {})
+
+            # Convert DPU names to uppercase and append to the list
+            dpu_list = [dpu.upper() for dpu in dpus]
+
+    except FileNotFoundError:
+        log.log_error("platform.json not found")
+    except json.JSONDecodeError:
+        log.log_error("Failed to parse platform.json")
+        sys.exit(ERROR_EXCEPTION)
+
+    return dpu_list
+
+
+def load_platform_chassis():
+    """
+    Load the platform chassis using the SONiC platform API.
+
+    This function attempts to instantiate the platform chassis object.
+    If successful, it sets the global variable `platform_chassis` to the instantiated object.
+
+    Returns:
+        bool: True if the platform chassis is successfully loaded, False otherwise.
+    """
+    global platform_chassis
+
+    # Load new platform API class
+    try:
+        platform_chassis = sonic_platform.platform.Platform().get_chassis()
+    except Exception as e:
+        log.log_error("Failed to instantiate Chassis due to {}".format(repr(e)))
+        sys.exit(ERROR_EXCEPTION)
+
+    if not platform_chassis:
+        log.log_error("Platform chassis is not loaded")
+        sys.exit(EXIT_FAIL)
+
+    return True
+
+
+def reboot_module(module_name):
+    """
+    Reboot the specified module by invoking the platform API.
+
+    Args:
+        module_name (str): The name of the module to reboot.
+
+    Returns:
+        bool: True if the reboot command was successfully sent, False otherwise.
+    """
+
+    # Load the platform chassis if not already loaded
+    load_platform_chassis()
+
+    if not is_smartswitch():
+        log.log_error("Platform is not a smartswitch to reboot module")
+        return False
+
+    # Only check that a reboot API exists here; the reboot is issued once, after validation
+    if not hasattr(platform_chassis, 'reboot'):
+        log.log_error("Reboot method not found in platform chassis")
+        return False
+
+    if module_name.upper() not in get_all_dpus():
+        log.log_error("Module {} not found".format(module_name))
+        return False
+
+    log.log_info("Rebooting module {}...".format(module_name))
+    try:
+        platform_chassis.reboot(module_name)
+        log.log_info("Reboot command sent for module {}".format(module_name))
+    except NotImplementedError:
+        log.log_error("Reboot not implemented on this platform")
+        sys.exit(ERROR_NOT_IMPLEMENTED)
+    except Exception as e:
+        log.log_error("An error occurred while rebooting module {}: {}".format(module_name, e))
+        sys.exit(ERROR_EXCEPTION)
+
+    return True
+
+
+def is_dpu():
+    """Check if script is running on DPU module"""
+
+    # Load the platform chassis if not already loaded
+    load_platform_chassis()
+
+    if not is_smartswitch():
+        return False
+
+    # Load platform.json
+    platform_info = device_info.get_platform_info()
+    platform = platform_info.get('platform')
+    if not platform:
+        log.log_error("Platform does not exist in platform_info")
+        return False
+    platform_json_path = os.path.join("/usr/share/sonic/device", platform, "platform.json")
+    try:
+        with open(platform_json_path, 'r') as platform_json:
+            config_data = json.load(platform_json)
+
+            # Check for any key matching the .DPU pattern
+            for key in config_data.keys():
+                if re.search(r'\.DPU$', key):
+                    return True
+    except FileNotFoundError:
+        log.log_error("platform.json not found")
+    except json.JSONDecodeError:
+        log.log_error("Failed to parse platform.json")
+        sys.exit(ERROR_EXCEPTION)
+
+    return False
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2 or (sys.argv[1] == "reboot" and len(sys.argv) < 3):
+        print("Usage: reboot_helper.py <command> [<module_name>]")
+        sys.exit(EXIT_FAIL)
+
+    command = sys.argv[1]
+    module_name = sys.argv[2] if len(sys.argv) > 2 else None
+
+    if command == "reboot":
+        success = reboot_module(module_name)
+        if not success:
+            sys.exit(EXIT_FAIL)
+        else:
+            print("Reboot command sent for module {}".format(module_name))
+    elif command == "is_dpu":
+        if is_dpu():
+            print("Script is running on DPU module")
+        else:
+            sys.exit(EXIT_FAIL)
+    else:
+        print("Unknown command: {}".format(command))
+        sys.exit(EXIT_FAIL)
diff --git a/setup.py b/setup.py
index dc5fa4a9b4..c1e579857c 100644
--- a/setup.py
+++ b/setup.py
@@ -161,6 +161,7 @@
         'scripts/psushow',
         'scripts/queuestat',
         'scripts/reboot',
+        'scripts/reboot_helper.py',
         'scripts/route_check.py',
         'scripts/route_check_test.sh',
         'scripts/vnet_route_check.py',
diff --git a/tests/test_reboot_helper.py b/tests/test_reboot_helper.py
new file mode 100644
index 0000000000..5eaa7f07a8
--- /dev/null
+++ b/tests/test_reboot_helper.py
@@ -0,0 +1,168 @@
+import os
+import sys
+import unittest
+from unittest.mock import patch, MagicMock, mock_open
+
+test_path = os.path.dirname(os.path.abspath(__file__))
+modules_path = os.path.dirname(test_path)
+sys.path.insert(0, modules_path)
+
+sys.modules['sonic_platform'] = MagicMock()
+import scripts.reboot_helper  # noqa: E402
+
+
+class TestRebootHelper(unittest.TestCase):
+
+    @patch(
+        'scripts.reboot_helper.is_smartswitch',
+        MagicMock(return_value=False)
+    )
+    def test_get_all_dpus_not_smartswitch(self):
+        dpu_list = scripts.reboot_helper.get_all_dpus()
+        self.assertEqual(dpu_list, [])
+
+    @patch(
+        'os.path.join', MagicMock(return_value="/mock/path/platform.json")
+    )
+    @patch(
+        'builtins.open', new_callable=mock_open,
+        read_data='{"DPUS": {"DPU1": {}, "DPU2": {}}}'
+    )
+    @patch(
+        'scripts.reboot_helper.device_info.get_platform_info',
+        MagicMock(return_value={'platform': 'mock_platform'})
+    )
+    @patch(
+        'scripts.reboot_helper.is_smartswitch', MagicMock(return_value=True)
+    )
+    def test_get_all_dpus_valid_json(self, mock_file):
+        dpu_list = scripts.reboot_helper.get_all_dpus()
+        self.assertEqual(dpu_list, ["DPU1", "DPU2"])
+
+    @patch(
+        'scripts.reboot_helper.sonic_platform.platform.Platform.get_chassis'
+    )
+    def test_load_platform_chassis_success(self, mock_get_chassis):
+        mock_get_chassis.return_value = MagicMock()
+        result = scripts.reboot_helper.load_platform_chassis()
+        self.assertTrue(result)
+
+    @patch(
+        'scripts.reboot_helper.load_platform_chassis',
+        MagicMock(return_value=False)
+    )
+    def test_reboot_module_chassis_fail(self):
+        result = scripts.reboot_helper.reboot_module("DPU1")
+        self.assertFalse(result)
+
+    @patch(
+        'scripts.reboot_helper.load_platform_chassis',
+        MagicMock(return_value=True)
+    )
+    @patch(
+        'scripts.reboot_helper.is_smartswitch', MagicMock(return_value=False)
+    )
+    def test_reboot_module_not_smartswitch(self):
+        result = scripts.reboot_helper.reboot_module("DPU1")
+        self.assertFalse(result)
+
+    @patch(
+        'scripts.reboot_helper.get_all_dpus', MagicMock(return_value=["DPU1"])
+    )
+    @patch(
+        'scripts.reboot_helper.load_platform_chassis',
+        MagicMock(return_value=True)
+    )
+    @patch(
+        'scripts.reboot_helper.is_smartswitch', MagicMock(return_value=True)
+    )
+    @patch(
+        'scripts.reboot_helper.platform_chassis', MagicMock()
+    )
+    def test_reboot_module_not_found(self):
+        result = scripts.reboot_helper.reboot_module("DPU2")
+        self.assertFalse(result)
+
+    @patch(
+        'scripts.reboot_helper.get_all_dpus', MagicMock(return_value=["DPU1"])
+    )
+    @patch(
+        'scripts.reboot_helper.load_platform_chassis',
+        MagicMock(return_value=True)
+    )
+    @patch(
+        'scripts.reboot_helper.is_smartswitch', MagicMock(return_value=True)
+    )
+    @patch(
+        'scripts.reboot_helper.platform_chassis',
+        MagicMock()
+    )
+    def test_reboot_module_success(self):
+        result = scripts.reboot_helper.reboot_module("DPU1")
+        self.assertTrue(result)
+
+    @patch(
+        'scripts.reboot_helper.load_platform_chassis',
+        MagicMock(return_value=False)
+    )
+    def test_is_dpu_load_platform_chassis_fail(self):
+        result = scripts.reboot_helper.is_dpu()
+        self.assertFalse(result)
+
+    @patch(
+        'scripts.reboot_helper.load_platform_chassis',
+        MagicMock(return_value=True)
+    )
+    @patch(
+        'scripts.reboot_helper.is_smartswitch', MagicMock(return_value=False)
+    )
+    def test_is_dpu_not_smartswitch(self):
+        result = scripts.reboot_helper.is_dpu()
+        self.assertFalse(result)
+
+    @patch(
+        'os.path.join', MagicMock(return_value="/mock/path/platform.json")
+    )
+    @patch(
+        'builtins.open', new_callable=mock_open,
+        read_data='{".DPU": {}}'
+    )
+    @patch(
+        'scripts.reboot_helper.device_info.get_platform_info',
+        MagicMock(return_value={'platform': 'mock_platform'})
+    )
+    @patch('scripts.reboot_helper.load_platform_chassis')
+    @patch('scripts.reboot_helper.is_smartswitch')
+    def test_is_dpu_found(self, mock_is_smartswitch,
+                          mock_load_platform_chassis,
+                          mock_file):
+        mock_is_smartswitch.return_value = True
+        mock_load_platform_chassis.return_value = True
+        result = scripts.reboot_helper.is_dpu()
+        self.assertTrue(result)
+
+    @patch(
+        'os.path.join', MagicMock(return_value="/mock/path/platform.json")
+    )
+    @patch(
+        'builtins.open', new_callable=mock_open,
+        read_data='{}'
+    )
+    @patch(
+        'scripts.reboot_helper.device_info.get_platform_info',
+        MagicMock(return_value={'platform': 'mock_platform'})
+    )
+    @patch(
+        'scripts.reboot_helper.load_platform_chassis',
+        MagicMock(return_value=True)
+    )
+    @patch(
+        'scripts.reboot_helper.is_smartswitch', MagicMock(return_value=True)
+    )
+    def test_is_dpu_not_found(self, mock_file):
+        result = scripts.reboot_helper.is_dpu()
+        self.assertFalse(result)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/utilities_common/chassis.py b/utilities_common/chassis.py
index 1283bca580..667f2ab155 100644
--- a/utilities_common/chassis.py
+++ b/utilities_common/chassis.py
@@ -16,3 +16,7 @@ def get_chassis_local_interfaces():
                 lst = data[1].split(",")
                 return lst
     return lst
+
+
+def is_smartswitch():
+    return hasattr(device_info, 'is_smartswitch') and device_info.is_smartswitch()