From 3282aaa8de37d50d980995f6c852ad96c2cd9fb5 Mon Sep 17 00:00:00 2001 From: Ori Messinger Date: Tue, 30 Apr 2024 20:12:05 -0400 Subject: [PATCH 01/10] ROCm SMI LIB: Add Ring Hang Event Enums This patch adds 'ring hang' enums to ROCM SMI LIB. This event type name is KFD_SMI_EVENT_RING_HANG. Signed-off-by: Ori Messinger Change-Id: I9b886eb1fc027f03bcca1e5d1a89a2a186b64bf5 --- include/rocm_smi/kfd_ioctl.h | 1 + include/rocm_smi/rocm_smi.h | 3 ++- python_smi_tools/rsmiBindings.py | 16 +++++++++------- python_smi_tools/rsmiBindings.py.in | 11 ++++++----- tests/rocm_smi_test/test_utils.cc | 1 + 5 files changed, 19 insertions(+), 13 deletions(-) diff --git a/include/rocm_smi/kfd_ioctl.h b/include/rocm_smi/kfd_ioctl.h index 3b781ce1..6477f448 100755 --- a/include/rocm_smi/kfd_ioctl.h +++ b/include/rocm_smi/kfd_ioctl.h @@ -553,6 +553,7 @@ enum kfd_smi_event { KFD_SMI_EVENT_THERMAL_THROTTLE = 2, KFD_SMI_EVENT_GPU_PRE_RESET = 3, KFD_SMI_EVENT_GPU_POST_RESET = 4, + KFD_SMI_EVENT_RING_HANG = 5, }; #define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1)) diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index 963d1e0c..0ecfc20a 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -316,8 +316,9 @@ typedef enum { RSMI_EVT_NOTIF_THERMAL_THROTTLE = KFD_SMI_EVENT_THERMAL_THROTTLE, RSMI_EVT_NOTIF_GPU_PRE_RESET = KFD_SMI_EVENT_GPU_PRE_RESET, RSMI_EVT_NOTIF_GPU_POST_RESET = KFD_SMI_EVENT_GPU_POST_RESET, + RSMI_EVT_NOTIF_RING_HANG = KFD_SMI_EVENT_RING_HANG, - RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_GPU_POST_RESET + RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_RING_HANG } rsmi_evt_notification_type_t; /** diff --git a/python_smi_tools/rsmiBindings.py b/python_smi_tools/rsmiBindings.py index 0483dacf..3dfab10f 100644 --- a/python_smi_tools/rsmiBindings.py +++ b/python_smi_tools/rsmiBindings.py @@ -102,16 +102,18 @@ class rsmi_dev_perf_level_t(c_int): RSMI_DEV_PERF_LEVEL_UNKNOWN = 0x100 -notification_type_names = ['VM_FAULT', 'THERMAL_THROTTLE', 
'GPU_RESET'] +notification_type_names = ['VM_FAULT', 'THERMAL_THROTTLE', 'GPU_PRE_RESET', 'GPU_POST_RESET', 'RING_HANG'] class rsmi_evt_notification_type_t(c_int): - RSMI_EVT_NOTIF_VMFAULT = 0 - RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_VMFAULT - RSMI_EVT_NOTIF_THERMAL_THROTTLE = 1 - RSMI_EVT_NOTIF_GPU_PRE_RESET = 2 - RSMI_EVT_NOTIF_GPU_POST_RESET = 3 - RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_GPU_POST_RESET + RSMI_EVT_NOTIF_NONE = 0 + RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_NONE + RSMI_EVT_NOTIF_VMFAULT = 1 + RSMI_EVT_NOTIF_THERMAL_THROTTLE = 2 + RSMI_EVT_NOTIF_GPU_PRE_RESET = 3 + RSMI_EVT_NOTIF_GPU_POST_RESET = 4 + RSMI_EVT_NOTIF_RING_HANG = 5 + RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_RING_HANG class rsmi_voltage_metric_t(c_int): diff --git a/python_smi_tools/rsmiBindings.py.in b/python_smi_tools/rsmiBindings.py.in index d53010f4..b24665cf 100644 --- a/python_smi_tools/rsmiBindings.py.in +++ b/python_smi_tools/rsmiBindings.py.in @@ -138,12 +138,13 @@ notification_type_names = ['VM_FAULT', 'THERMAL_THROTTLE', 'GPU_RESET'] class rsmi_evt_notification_type_t(c_int): - RSMI_EVT_NOTIF_VMFAULT = 0 + RSMI_EVT_NOTIF_VMFAULT = 1 RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_VMFAULT - RSMI_EVT_NOTIF_THERMAL_THROTTLE = 1 - RSMI_EVT_NOTIF_GPU_PRE_RESET = 2 - RSMI_EVT_NOTIF_GPU_POST_RESET = 3 - RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_GPU_POST_RESET + RSMI_EVT_NOTIF_THERMAL_THROTTLE = 2 + RSMI_EVT_NOTIF_GPU_PRE_RESET = 3 + RSMI_EVT_NOTIF_GPU_POST_RESET = 4 + RSMI_EVT_NOTIF_RING_HANG = 5 + RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_RING_HANG class rsmi_voltage_metric_t(c_int): diff --git a/tests/rocm_smi_test/test_utils.cc b/tests/rocm_smi_test/test_utils.cc index d27f185b..c844cd25 100755 --- a/tests/rocm_smi_test/test_utils.cc +++ b/tests/rocm_smi_test/test_utils.cc @@ -85,6 +85,7 @@ static const std::map {RSMI_EVT_NOTIF_THERMAL_THROTTLE, "RSMI_EVT_NOTIF_THERMAL_THROTTLE"}, {RSMI_EVT_NOTIF_GPU_PRE_RESET, "RSMI_EVT_NOTIF_GPU_PRE_RESET"}, {RSMI_EVT_NOTIF_GPU_POST_RESET, "RSMI_EVT_NOTIF_GPU_POST_RESET"}, + 
{RSMI_EVT_NOTIF_RING_HANG, "RSMI_EVT_NOTIF_RING_HANG"}, }; const char * NameFromEvtNotifType(rsmi_evt_notification_type_t evt) { From 48ddd9abd74793826a5541eb9ffdc2b23c61b81a Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Fri, 26 Apr 2024 23:48:15 -0500 Subject: [PATCH 02/10] fix: [SWDEV-458862] [rocm/rocm_smi_lib] Fixes reading pp_od_clk_voltage new variable format and size. Code changes related to the following: * get_od_clk_volt_info() * get_od_clk_volt_curve_regions() * Unit tests * CLI options removed: --showclkvolt, --showvc, --showvoltagerange, --setvc Change-Id: Ieedb845eeadcea2f2e447ec576c253ad2a814176 Signed-off-by: Oliveira, Daniel --- include/rocm_smi/rocm_smi.h | 4 + include/rocm_smi/rocm_smi_utils.h | 304 +++++++++++++++++- python_smi_tools/rocm_smi.py | 148 +-------- src/rocm_smi.cc | 162 ++++------ src/rocm_smi_utils.cc | 9 +- .../functional/mutual_exclusion.cc | 8 +- .../functional/volt_freq_curv_read.cc | 43 +-- 7 files changed, 390 insertions(+), 288 deletions(-) diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index 0ecfc20a..d82a21d3 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -2869,6 +2869,8 @@ rsmi_status_t rsmi_dev_od_clk_info_set(uint32_t dv_ind, rsmi_freq_ind_t level, /** * @brief This function sets 1 of the 3 voltage curve points. * + * @deprecated This function is deprecated due to driver changes. + * * @details Given a device index @p dv_ind, a voltage point @p vpoint * and a voltage value @p voltvalue this function will set voltage curve point * @@ -2894,6 +2896,8 @@ rsmi_status_t rsmi_dev_od_volt_info_set(uint32_t dv_ind, uint32_t vpoint, * @brief This function will retrieve the current valid regions in the * frequency/voltage space. * + * @deprecated This function is deprecated due to driver changes. 
+ * * @details Given a device index @p dv_ind, a pointer to an unsigned integer * @p num_regions and a buffer of ::rsmi_freq_volt_region_t structures, @p * buffer, this function will populate @p buffer with the current diff --git a/include/rocm_smi/rocm_smi_utils.h b/include/rocm_smi/rocm_smi_utils.h index 40f24eca..8196822c 100755 --- a/include/rocm_smi/rocm_smi_utils.h +++ b/include/rocm_smi/rocm_smi_utils.h @@ -45,14 +45,17 @@ #include -#include +#include #include -#include -#include #include -#include -#include +#include +#include #include +#include +#include +#include +#include +#include #include "rocm_smi/rocm_smi_device.h" @@ -123,6 +126,12 @@ std::string print_rsmi_od_volt_freq_regions(uint32_t num_regions, bool is_sudo_user(); rsmi_status_t rsmi_get_gfx_target_version(uint32_t dv_ind, std::string *gfx_version); + +std::string leftTrim(const std::string &s); +std::string rightTrim(const std::string &s); +std::string trim(const std::string &s); +std::string removeNewLines(const std::string &s); + std::string removeString(const std::string origStr, const std::string &removeMe); template @@ -296,6 +305,291 @@ class ScopedAcquire { // In VM environment, the /proc/cpuinfo set hypervisor flag by default bool is_vm_guest(); + +// +enum class TagSplitterPositional_t +{ + kFIRST, + kBETWEEN, + kLAST, + kNONE, +}; + +template +class TagTextContents_t +{ + public: + using TextLines_t = std::vector; + using PrimaryList_t = std::vector; + using SecondaryList_t = std::vector; + using PrimaryKeyTbl_t = std::map; + using SecondaryKeyTbl_t = std::map; + using StructuredKeysTbl_t = std::map>; + + // + TagTextContents_t() = default; + TagTextContents_t(const TagTextContents_t&) = delete; + TagTextContents_t(TagTextContents_t&&) = delete; + TagTextContents_t& operator=(const TagTextContents_t&) = delete; + TagTextContents_t& operator=(TagTextContents_t&&) = delete; + + explicit TagTextContents_t(const TextLines_t& text_content) + : m_text_content(text_content) {} + + 
TagTextContents_t& set_text_content(const TextLines_t& text_content) + { + m_text_content = text_content; + } + + TagTextContents_t& set_title_terminator(const std::string& title_mark, + TagSplitterPositional_t title_mark_position) { + m_title_mark = title_mark; + m_title_mark_position = title_mark_position; + + return *this; + } + + TagTextContents_t& set_key_data_splitter(const std::string& line_splitter_mark, + TagSplitterPositional_t line_mark_position) { + m_line_splitter_mark = line_splitter_mark; + m_line_mark_position = line_mark_position; + + return *this; + } + + TagTextContents_t& structure_content() { + // Sanitizes the content. + if (!m_text_content.empty()) { + std::for_each(m_text_content.begin(), m_text_content.end(), trim); + section_title_lookup(); + section_data_lookup(); + } + + return *this; + } + + decltype(auto) get_title_size() { + return m_primary.size(); + } + + decltype(auto) get_structured_subkeys_size(const PrimaryKeyType& prim_key) { + return m_structured[prim_key].size(); + } + + decltype(auto) contains_title_key(const PrimaryKeyType& key) { + return (m_primary.find(key) != m_primary.end()); + } + + decltype(auto) contains_structured_key(const PrimaryKeyType& prim_key, + const SecondaryKeyType& sec_key) { + if (auto first_key_itr = m_structured.find(prim_key); + first_key_itr != m_structured.end()) { + if (auto sec_key_itr = first_key_itr->second.find(sec_key); + sec_key_itr != first_key_itr->second.end()) { + return true; + } + } + + return false; + } + + decltype(auto) get_structured_value_by_keys(const PrimaryKeyType& prim_key, + const SecondaryKeyType& sec_key, + bool is_value_id = true) { + if (auto first_key_itr = m_structured.find(prim_key); + first_key_itr != m_structured.end()) { + if (auto sec_key_itr = first_key_itr->second.find(sec_key); + sec_key_itr != first_key_itr->second.end()) { + SecondaryDataType key_value{}; + if (is_value_id) { + key_value = SecondaryDataType(sec_key_itr->first) + " "; + } + key_value += 
sec_key_itr->second; + return key_value; + } + } + + return SecondaryDataType{}; + } + + decltype(auto) get_structured_data_subkey_by_position(const PrimaryKeyType& prim_key, + uint32_t key_position) { + auto key_counter = uint32_t(0); + SecondaryKeyType data_key{}; + if (key_position < (get_structured_subkeys_size(prim_key))) { + for (const auto& [sec_key, sec_value] : m_structured[prim_key]) { + if (key_counter == key_position) { + data_key = static_cast(sec_key); + return data_key; + } + ++key_counter; + } + } + + return data_key; + } + + decltype(auto) get_structured_data_subkey_first(const PrimaryKeyType& prim_key) { + return (get_structured_value_by_keys(prim_key, + get_structured_data_subkey_by_position(prim_key, 0))); + } + + decltype(auto) get_structured_data_subkey_last(const PrimaryKeyType& prim_key) { + return (get_structured_value_by_keys(prim_key, get_structured_data_subkey_by_position(prim_key, + (get_structured_subkeys_size(prim_key) - 1)))); + } + + void reset() { + m_text_content.clear(); + m_primary.clear(); + m_structured.clear(); + m_title_mark.clear(); + m_line_splitter_mark.clear(); + m_title_mark_position = TagSplitterPositional_t::kNONE; + m_line_mark_position = TagSplitterPositional_t::kNONE; + } + + decltype(auto) dump_structured_content() { + std::ostringstream ostrstream; + ostrstream << __PRETTY_FUNCTION__ << "| ======= start =======" << "\n"; + ostrstream << "** Primary Table **" << "\n"; + for (const auto& [key, values] : m_primary) { + ostrstream << "key: " << key << " values: " << values.size() << "\n"; + for (const auto& value : values) { + ostrstream << "\t value: " << value << "\n"; + } + } + + ostrstream << "\n ** Structured Table **" << "\n"; + for (const auto& [prim_key, prim_values] : m_structured) { + ostrstream << "key: " << prim_key << "\n"; + for (const auto& [sec_key, sec_value] : prim_values) { + ostrstream << "\t key: " << sec_key << " -> " << sec_value << "\n"; + } + } + ostrstream << "\n\n"; + + return 
ostrstream.str(); + } + + + private: + TextLines_t m_text_content; + PrimaryKeyTbl_t m_primary; + StructuredKeysTbl_t m_structured; + std::string m_title_mark; + std::string m_line_splitter_mark; + TagSplitterPositional_t m_title_mark_position; + TagSplitterPositional_t m_line_mark_position; + + // + // Note: Organizes table with Title as a Key, and a list of values. + // + decltype(auto) section_title_lookup() { + if (m_title_mark.empty() || + m_title_mark_position == TagSplitterPositional_t::kNONE) { + return; + } + + // + // Note: + // - top_title_line: Left pointer for the sliding window + // - bottom_title_line: Right pointer for the sliding window + // + auto top_title_line = uint32_t(std::numeric_limits::max()); + auto bottom_title_line = uint32_t(std::numeric_limits::max()); + auto line_counter = uint32_t(0); + + // + // Note: This whole interval/window where the section/title starts, and where it ends. + // + auto update_primary_tbl = [&](const uint32_t& from_line, const uint32_t& to_line) { + auto key = static_cast(m_text_content[from_line]); + for (auto line_num(from_line + 1); line_num < to_line; ++line_num) { + if ((line_num < m_text_content.size()) && !m_text_content[line_num].empty()) { + m_primary[key].push_back(m_text_content[line_num]); + } + } + }; + + auto adjust_sliding_window = [&](const uint32_t& title_line) { + // First time top_title_line gets adjusted. 
+ if (top_title_line == uint32_t(std::numeric_limits::max())) { + top_title_line = title_line; + bottom_title_line = top_title_line; + return; + } + if (title_line > bottom_title_line) { + bottom_title_line = title_line; + update_primary_tbl(top_title_line, bottom_title_line); + top_title_line = bottom_title_line; + } + }; + + for (const auto& line : m_text_content) { + auto was_title_found{false}; + switch (m_title_mark_position) { + case TagSplitterPositional_t::kFIRST: + // Section/Title Mark was found at the first position + if (line.find_first_of(m_title_mark.c_str()) == 0) { + was_title_found = true; + } + break; + + case TagSplitterPositional_t::kLAST: + // Section/Title Mark was found at the last position + if ((line.find_last_of(m_title_mark.c_str()) + 1) == line.size()) { + was_title_found = true; + } + break; + + default: + break; + } + + if (was_title_found) { + adjust_sliding_window(line_counter); + } + ++line_counter; + } + + // Any remaining elements? + if (line_counter > bottom_title_line) { + update_primary_tbl(bottom_title_line, (line_counter - 1)); + } + } + + decltype(auto) section_data_lookup() { + if (m_line_splitter_mark.empty() || + m_line_mark_position == TagSplitterPositional_t::kNONE) { + return; + } + + // + // Note: Organizes table with Title as a Key, a Key/ID for values and values. + // It takes into consideration the initial constraints were all good and + // that the primary table has been populated. 
+ for (const auto& [prim_key, prim_values] : m_primary) { + for (const auto& value : prim_values) { + if (auto mark_pos = value.find_first_of(m_line_splitter_mark.c_str()); + mark_pos != std::string::npos) { + auto sec_key = trim(value.substr(0, mark_pos + 1)); + auto sec_data = trim(value.substr((mark_pos + 1), value.size())); + if (!sec_key.empty()) { + m_structured[prim_key].insert(std::make_pair(sec_key, sec_data)); + } + } + } + } + } + +}; + +using TextFileTagContents_t = TagTextContents_t; + + } // namespace smi } // namespace amd diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index aed3292d..334fb9be 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -157,7 +157,7 @@ def formatMatrixToJSON(deviceList, matrix, metricName): :param deviceList: List of DRM devices (can be a single-item list) :param metricName: Title of the item to print to the log :param matrix: symmetric matrix full of values of every permutation of DRM devices. - + Matrix example: .. math:: @@ -554,9 +554,9 @@ def getPidList(): def getPower(device): """ Return dictionary of power responses. Response power dictionary: - + .. 
code-block:: python - + { 'power': string wattage response or 'N/A' (for not RSMI_STATUS_SUCCESS), 'power_type': power type string - 'Current Socket' or 'Average', @@ -566,7 +566,7 @@ def getPower(device): :param device: DRM device identifier """ - + power = c_int64(0) power_type = rsmi_power_type_t() power_ret_dict = { @@ -668,7 +668,7 @@ def getPowerLabel(deviceList): return powerLabel device=deviceList[0] power_dict = getPower(device) - if (power_dict['ret'] == rsmi_status_t.RSMI_STATUS_SUCCESS and + if (power_dict['ret'] == rsmi_status_t.RSMI_STATUS_SUCCESS and power_dict['power_type'] == 'CURRENT SOCKET'): powerLabel = rsmi_power_label.CURRENT_SOCKET_POWER return powerLabel @@ -1251,7 +1251,7 @@ def setClockExtremum(deviceList, level, clkType, clkValue, autoRespond): if level == "max": point = 1 try: - int(clkValue) + int(clkValue) except ValueError: printErrLog(None, 'Unable to set %s' % (clkValue)) logging.error('%s is not an integer', clkValue) @@ -1270,34 +1270,6 @@ def setClockExtremum(deviceList, level, clkType, clkValue, autoRespond): printLog(device, 'Setting %s %s clock is not supported for this device.' % (level, clkType), None) -def setVoltageCurve(deviceList, point, clk, volt, autoRespond): - """ Set voltage curve for a point in the PowerPlay table for a list of devices. 
- - :param deviceList: List of DRM devices (can be a single-item list) - :param point: Point on the voltage curve to modify - :param clk: Clock speed specified for this curve point - :param volt: Voltage specified for this curve point - :param autoRespond: Response to automatically provide for all prompts - """ - global RETCODE - value = '%s %s %s' % (point, clk, volt) - try: - any(int(item) for item in value.split()) - except ValueError: - printErrLog(None, 'Unable to set Voltage curve') - printErrLog(None, 'Non-integer characters are present in %s' %value) - RETCODE = 1 - return - confirmOutOfSpecWarning(autoRespond) - for device in deviceList: - ret = rocmsmi.rsmi_dev_od_volt_info_set(device, int(point), int(clk), int(volt)) - if rsmi_ret_ok(ret, device, 'set_voltage_curve'): - printLog(device, 'Successfully set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt), None) - else: - printErrLog(device, 'Unable to set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt)) - RETCODE = 1 - - def setPowerPlayTableLevel(deviceList, clkType, point, clk, volt, autoRespond): """ Set clock frequency and voltage for a level in the PowerPlay table for a list of devices. 
@@ -1972,7 +1944,7 @@ def showAllConcise(deviceList): temp_val += degree_sign + 'C' power_dict = getPower(device) powerVal = 'N/A' - if (power_dict['ret'] == rsmi_status_t.RSMI_STATUS_SUCCESS and + if (power_dict['ret'] == rsmi_status_t.RSMI_STATUS_SUCCESS and power_dict['power_type'] != 'INVALID_POWER_TYPE'): if power_dict['power'] != 0: powerVal = power_dict['power'] + power_dict['unit'] @@ -2001,7 +1973,7 @@ def showAllConcise(deviceList): values['card%s' % (str(device))] = [device, getNodeId(device), str(getDRMDeviceId(device)) + ", ", str(getGUID(device)), - temp_val, powerVal, + temp_val, powerVal, combined_partition_data, sclk, mclk, fan, str(perf).lower(), str(pwrCap), @@ -2371,7 +2343,7 @@ def getCoarseGrainUtil(device, typeName=None): for ut_counter in utilization_counters: printLog(device, utilization_counter_name[ut_counter.type], ut_counter.val) - + :param device: DRM device identifier :param typeName: 'GFX Activity', 'Memory Activity' """ @@ -2695,10 +2667,10 @@ def showPower(deviceList): for device in deviceList: power_dict = getPower(device) power = 'N/A' - if (power_dict['ret'] == rsmi_status_t.RSMI_STATUS_SUCCESS and + if (power_dict['ret'] == rsmi_status_t.RSMI_STATUS_SUCCESS and power_dict['power_type'] != 'INVALID_POWER_TYPE'): power = power_dict['power'] - printLog(device, power_dict['power_type'].title() + ' Graphics Package Power (' + printLog(device, power_dict['power_type'].title() + ' Graphics Package Power (' + power_dict['unit'] + ')', power) elif checkIfSecondaryDie(device): @@ -2711,49 +2683,6 @@ def showPower(deviceList): printLogSpacer() -def showPowerPlayTable(deviceList): - """ Display current GPU Memory clock frequencies and voltages for a list of devices - - :param deviceList: List of DRM devices (can be a single-item list) - """ - global PRINT_JSON - if PRINT_JSON: - return - printLogSpacer(' GPU Memory clock frequencies and voltages ') - odvf = rsmi_od_volt_freq_data_t() - for device in deviceList: - ret = 
rocmsmi.rsmi_dev_od_volt_info_get(device, byref(odvf)) - if rsmi_ret_ok(ret, device, 'get_od_volt'): - # TODO: Make this more dynamic and less hard-coded if possible - printLog(device, 'OD_SCLK:', None) - printLog(device, '0: %sMhz' % (int(odvf.curr_sclk_range.lower_bound / 1000000)), None) - printLog(device, '1: %sMhz' % (int(odvf.curr_sclk_range.upper_bound / 1000000)), None) - printLog(device, 'OD_MCLK:', None) - printLog(device, '1: %sMhz' % (int(odvf.curr_mclk_range.upper_bound / 1000000)), None) - if odvf.num_regions > 0: - printLog(device, 'OD_VDDC_CURVE:', None) - for position in range(3): - printLog(device, '%d: %sMhz %smV' % ( - position, int(list(odvf.curve.vc_points)[position].frequency / 1000000), - int(list(odvf.curve.vc_points)[position].voltage)), None) - if odvf.sclk_freq_limits.lower_bound > 0 or odvf.sclk_freq_limits.upper_bound > 0 \ - or odvf.mclk_freq_limits.lower_bound >0 or odvf.mclk_freq_limits.upper_bound > 0: - printLog(device, 'OD_RANGE:', None) - if odvf.sclk_freq_limits.lower_bound > 0 or odvf.sclk_freq_limits.upper_bound > 0: - printLog(device, 'SCLK: %sMhz %sMhz' % ( - int(odvf.sclk_freq_limits.lower_bound / 1000000), int(odvf.sclk_freq_limits.upper_bound / 1000000)), None) - if odvf.mclk_freq_limits.lower_bound >0 or odvf.mclk_freq_limits.upper_bound > 0: - printLog(device, 'MCLK: %sMhz %sMhz' % ( - int(odvf.mclk_freq_limits.lower_bound / 1000000), int(odvf.mclk_freq_limits.upper_bound / 1000000)), None) - if odvf.num_regions > 0: - for position in range(3): - printLog(device, 'VDDC_CURVE_SCLK[%d]: %sMhz' % ( - position, int(list(odvf.curve.vc_points)[position].frequency / 1000000)), None) - printLog(device, 'VDDC_CURVE_VOLT[%d]: %smV' % ( - position, int(list(odvf.curve.vc_points)[position].voltage)), None) - printLogSpacer() - - def showProduct(deviceList): """ Show the requested product information for a list of devices @@ -2825,7 +2754,7 @@ def showRange(deviceList, rangeType): :param rangeType: [sclk|voltage] Type of range to 
return """ global RETCODE - if rangeType not in {'sclk', 'mclk', 'voltage'}: + if rangeType not in {'sclk', 'mclk'}: printLog(None, 'Invalid range identifier %s' % (rangeType), None) RETCODE = 1 return @@ -2840,21 +2769,6 @@ def showRange(deviceList, rangeType): if rangeType == 'mclk': printLog(device, 'Valid mclk range: %sMhz - %sMhz' % ( int(odvf.curr_mclk_range.lower_bound / 1000000), int(odvf.curr_mclk_range.upper_bound / 1000000)), None) - if rangeType == 'voltage': - if odvf.num_regions == 0: - printErrLog(device, 'Voltage curve regions unsupported.') - continue - num_regions = c_uint32(odvf.num_regions) - regions = (rsmi_freq_volt_region_t * odvf.num_regions)() - ret = rocmsmi.rsmi_dev_od_volt_curve_regions_get(device, byref(num_regions), byref(regions)) - if rsmi_ret_ok(ret, device, 'volt'): - for i in range(num_regions.value): - printLog(device, - 'Region %d: Valid voltage range: %smV - %smV' % (i, regions[i].volt_range.lower_bound, - regions[i].volt_range.upper_bound), - None) - else: - printLog(device, 'Unable to display %s range' % (rangeType), None) printLogSpacer() @@ -3172,25 +3086,6 @@ def showVoltage(deviceList): printLogSpacer() -def showVoltageCurve(deviceList): - """ Show the voltage curve points for the specified devices - - :param deviceList: List of DRM devices (can be a single-item list) - """ - printLogSpacer(' Voltage Curve Points ') - odvf = rsmi_od_volt_freq_data_t() - for device in deviceList: - ret = rocmsmi.rsmi_dev_od_volt_info_get(device, byref(odvf)) - if rsmi_ret_ok(ret, device, 'get_od_volt_info', silent=False) and odvf.num_regions > 0: - for position in range(3): - printLog(device, 'Voltage point %d: %sMhz %smV' % ( - position, int(list(odvf.curve.vc_points)[position].frequency / 1000000), - int(list(odvf.curve.vc_points)[position].voltage)), None) - else: - printErrLog(device, 'Voltage curve Points unsupported.') - printLogSpacer() - - def showXgmiErr(deviceList): """ Display the XGMI Error status @@ -3844,7 +3739,6 @@ def 
isConciseInfoRequested(args): groupDisplayTop.add_argument('--showproductname', help='Show product details', action='store_true') groupDisplayTop.add_argument('--showserial', help='Show GPU\'s Serial Number', action='store_true') groupDisplayTop.add_argument('--showuniqueid', help='Show GPU\'s Unique ID', action='store_true') - groupDisplayTop.add_argument('--showvoltagerange', help='Show voltage range', action='store_true') groupDisplayTop.add_argument('--showbus', help='Show PCI bus number', action='store_true') groupDisplayPages.add_argument('--showpagesinfo', help='Show retired, pending and unreservable pages', action='store_true') @@ -3869,8 +3763,6 @@ def isConciseInfoRequested(args): groupDisplay.add_argument('-o', '--showoverdrive', help='Show current GPU Clock OverDrive level', action='store_true') groupDisplay.add_argument('-p', '--showperflevel', help='Show current DPM Performance Level', action='store_true') - groupDisplay.add_argument('-S', '--showclkvolt', help='Show supported GPU and Memory Clocks and Voltages', - action='store_true') groupDisplay.add_argument('-s', '--showclkfrq', help='Show supported GPU and Memory Clock', action='store_true') groupDisplay.add_argument('--showmeminfo', help='Show Memory usage information for given block(s) TYPE', metavar='TYPE', type=str, nargs='+') @@ -3882,7 +3774,6 @@ def isConciseInfoRequested(args): groupDisplay.add_argument('--showrasinfo', help='Show RAS enablement information and error counts for the specified block(s) (all if no arg given)', nargs='*') - groupDisplay.add_argument('--showvc', help='Show voltage curve', action='store_true') groupDisplay.add_argument('--showxgmierr', help='Show XGMI error information since last read', action='store_true') groupDisplay.add_argument('--showtopo', help='Show hardware topology information', action='store_true') groupDisplay.add_argument('--showtopoaccess', help='Shows the link accessibility between GPUs ', action='store_true') @@ -3922,8 +3813,6 @@ def 
isConciseInfoRequested(args): groupAction.add_argument('--setmlevel', help='Change GPU Memory clock frequency (MHz) and Voltage for (mV) a specific Level', metavar=('MCLKLEVEL', 'MCLK', 'MVOLT'), nargs=3) - groupAction.add_argument('--setvc', help='Change SCLK Voltage Curve (MHz mV) for a specific point', - metavar=('POINT', 'SCLK', 'SVOLT'), nargs=3) groupAction.add_argument('--setsrange', help='Set min and max SCLK speed', metavar=('SCLKMIN', 'SCLKMAX'), nargs=2) groupAction.add_argument('--setextremum', help='Set min/max of SCLK/MCLK speed', metavar=('min|max', "sclk|mclk", 'CLK'), nargs=3) groupAction.add_argument('--setmrange', help='Set min and max MCLK speed', metavar=('MCLKMIN', 'MCLKMAX'), nargs=2) @@ -3993,7 +3882,7 @@ def isConciseInfoRequested(args): or args.resetclocks or args.setprofile or args.resetprofile or args.setoverdrive or args.setmemoverdrive \ or args.setpoweroverdrive or args.resetpoweroverdrive or args.rasenable or args.rasdisable or \ args.rasinject or args.gpureset or args.setperfdeterminism or args.setslevel or args.setmlevel or \ - args.setvc or args.setsrange or args.setextremum or args.setmrange or args.setclock or \ + args.setsrange or args.setextremum or args.setmrange or args.setclock or \ args.setcomputepartition or args.setmemorypartition or args.resetcomputepartition or args.resetmemorypartition: relaunchAsSudo() @@ -4040,7 +3929,6 @@ def isConciseInfoRequested(args): args.showproductname = True args.showserial = True args.showuniqueid = True - args.showvoltagerange = True args.showbus = True args.showpagesinfo = True args.showfan = True @@ -4058,14 +3946,12 @@ def isConciseInfoRequested(args): args.showpids = "summary" args.showpidgpus = [] args.showreplaycount = True - args.showvc = True args.showcomputepartition = True args.showmemorypartition = True if not PRINT_JSON: args.showprofile = True args.showclkfrq = True - args.showclkvolt = True # Don't do reset in combination with any other command if args.gpureset: @@ -4136,8 
+4022,6 @@ def isConciseInfoRequested(args): showPids(args.showpids) if args.showpidgpus or str(args.showpidgpus) == '[]': showGpusByPid(args.showpidgpus) - if args.showclkvolt: - showPowerPlayTable(deviceList) if args.showvoltage: showVoltage(deviceList) if args.showbus: @@ -4181,10 +4065,6 @@ def isConciseInfoRequested(args): showRange(deviceList, 'sclk') if args.showmclkrange: showRange(deviceList, 'mclk') - if args.showvoltagerange: - showRange(deviceList, 'voltage') - if args.showvc: - showVoltageCurve(deviceList) if args.showenergycounter: showEnergy(deviceList) if args.showcomputepartition: @@ -4221,8 +4101,6 @@ def isConciseInfoRequested(args): resetPowerOverDrive(deviceList, args.autorespond) if args.setprofile: setProfile(deviceList, args.setprofile) - if args.setvc: - setVoltageCurve(deviceList, args.setvc[0], args.setvc[1], args.setvc[2], args.autorespond) if args.setextremum: setClockExtremum(deviceList, args.setextremum[0], args.setextremum[1], args.setextremum[2], args.autorespond) if args.setsrange: diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index fb38823d..1b96059c 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -1262,18 +1262,6 @@ For the new format, GFXCLK field will show min and max values(0/1). If the curre frequency in neither min/max but lies within the range, this is indicated by an additional value followed by * at index 1 and max value at index 2. 
*/ -constexpr uint32_t kOD_SCLK_label_array_index = 0; -constexpr uint32_t kOD_MCLK_label_array_index = - kOD_SCLK_label_array_index + 3; - -constexpr uint32_t kOD_VDDC_CURVE_label_array_index = - kOD_MCLK_label_array_index + 2; -constexpr uint32_t kOD_OD_RANGE_label_array_index = - kOD_VDDC_CURVE_label_array_index + 4; -constexpr uint32_t kOD_VDDC_CURVE_start_index = - kOD_OD_RANGE_label_array_index + 3; -// constexpr uint32_t kOD_VDDC_CURVE_num_lines = -// kOD_VDDC_CURVE_start_index + 4; constexpr uint32_t kMIN_VALID_LINES = 2; static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind, @@ -1298,62 +1286,75 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind, return RSMI_STATUS_NOT_YET_IMPLEMENTED; } - assert(val_vec[kOD_SCLK_label_array_index] == "OD_SCLK:" || - val_vec[kOD_SCLK_label_array_index] == "GFXCLK:"); - if ((val_vec[kOD_SCLK_label_array_index] != "OD_SCLK:") && - (val_vec[kOD_SCLK_label_array_index] != "GFXCLK:")) { - return RSMI_STATUS_UNEXPECTED_DATA; - } - - - // find last_item but skip empty lines - int last_item = val_vec.size()-1; - while (val_vec[last_item].empty() || val_vec[last_item][0] == 0) - last_item--; + // + const std::string kTAG_OD_SCLK{"OD_SCLK:"}; + const std::string kTAG_GFXCLK{"GFXCLK:"}; + const std::string KTAG_OD_MCLK{"OD_MCLK:"}; + const std::string KTAG_MCLK{"MCLK:"}; + const std::string KTAG_FIRST_FREQ_IDX{"0:"}; + amd::smi::TextFileTagContents_t txt_power_dev_od_voltage(val_vec); + txt_power_dev_od_voltage + .set_title_terminator(":", amd::smi::TagSplitterPositional_t::kLAST) + .set_key_data_splitter(":", amd::smi::TagSplitterPositional_t::kBETWEEN) + .structure_content(); + + // + // Note: We must have minimum of 'GFXCLK:' && 'MCLK:' OR: + // 'OD_SCLK:' && 'OD_MCLK:' tags. + if (txt_power_dev_od_voltage.get_title_size() < kMIN_VALID_LINES) { + return rsmi_status_t::RSMI_STATUS_NO_DATA; + } + + // Note: For debug builds/purposes only. 
+ assert(txt_power_dev_od_voltage.contains_title_key(kTAG_GFXCLK) || + txt_power_dev_od_voltage.contains_title_key(kTAG_OD_SCLK)); + // Note: For release builds/purposes. + if (!txt_power_dev_od_voltage.contains_title_key(kTAG_GFXCLK) && + !txt_power_dev_od_voltage.contains_title_key(kTAG_OD_SCLK)) { + return rsmi_status_t::RSMI_STATUS_UNEXPECTED_DATA; + } + + // Note: Quick helpers for getting 1st and last elements found + auto build_lower_bound = [&](const std::string& prim_key) { + auto lower_bound_data = txt_power_dev_od_voltage.get_structured_data_subkey_first(prim_key); + return std::vector{lower_bound_data}; + }; - p->curr_sclk_range.lower_bound = freq_string_to_int(val_vec, nullptr, - nullptr, kOD_SCLK_label_array_index + 1); - p->curr_sclk_range.upper_bound = freq_string_to_int(val_vec, nullptr, - nullptr, kOD_SCLK_label_array_index + 2); + auto build_upper_bound = [&](const std::string& prim_key) { + auto upper_bound_data = txt_power_dev_od_voltage.get_structured_data_subkey_last(prim_key); + return std::vector{upper_bound_data}; + }; - if (val_vec.size() < (kOD_MCLK_label_array_index + 1)) { - return RSMI_STATUS_UNEXPECTED_SIZE; + // Validates 'OD_SCLK' is in the structure + if (txt_power_dev_od_voltage.contains_structured_key(kTAG_OD_SCLK, + KTAG_FIRST_FREQ_IDX)) { + p->curr_sclk_range.lower_bound = freq_string_to_int(build_lower_bound(kTAG_OD_SCLK), nullptr, nullptr, 0); + p->curr_sclk_range.upper_bound = freq_string_to_int(build_upper_bound(kTAG_OD_SCLK), nullptr, nullptr, 0); + + // Validates 'OD_MCLK' is in the structure + if (txt_power_dev_od_voltage.contains_structured_key(KTAG_OD_MCLK, + KTAG_FIRST_FREQ_IDX)) { + p->curr_mclk_range.lower_bound = freq_string_to_int(build_lower_bound(KTAG_OD_MCLK), nullptr, nullptr, 0); + p->curr_mclk_range.upper_bound = freq_string_to_int(build_upper_bound(KTAG_OD_MCLK), nullptr, nullptr, 0); + } } - // The condition below checks if it is the old style or new style format. 
- if (val_vec[kOD_MCLK_label_array_index] == "OD_MCLK:") { - p->curr_mclk_range.lower_bound = 0; - p->curr_mclk_range.upper_bound = freq_string_to_int(val_vec, nullptr, - nullptr, kOD_MCLK_label_array_index + 1); - } else if (val_vec[kOD_MCLK_label_array_index] == "MCLK:") { - p->curr_mclk_range.lower_bound = freq_string_to_int(val_vec, nullptr, - nullptr, kOD_MCLK_label_array_index + 1); - // the upper memory frequency is the last - p->curr_mclk_range.upper_bound = freq_string_to_int(val_vec, nullptr, - nullptr, last_item); - return RSMI_STATUS_SUCCESS; - } else { - if (val_vec.size() < (kOD_MCLK_label_array_index + 3)) { - return RSMI_STATUS_UNEXPECTED_SIZE; - } - if (val_vec[kOD_MCLK_label_array_index + 1] == "MCLK:") { - p->curr_sclk_range.upper_bound = freq_string_to_int(val_vec, nullptr, - nullptr, kOD_SCLK_label_array_index + 3); - p->curr_mclk_range.lower_bound = freq_string_to_int(val_vec, nullptr, - nullptr, kOD_MCLK_label_array_index + 2); - // the upper memory frequency is the last - p->curr_mclk_range.upper_bound = freq_string_to_int(val_vec, nullptr, - nullptr, last_item); - return RSMI_STATUS_SUCCESS; - } - return RSMI_STATUS_NOT_YET_IMPLEMENTED; + // Validates 'GFXCLK' is in the structure + else if (txt_power_dev_od_voltage.contains_structured_key(kTAG_GFXCLK, + KTAG_FIRST_FREQ_IDX)) { + p->curr_sclk_range.lower_bound = freq_string_to_int(build_lower_bound(kTAG_GFXCLK), nullptr, nullptr, 0); + p->curr_sclk_range.upper_bound = freq_string_to_int(build_upper_bound(kTAG_GFXCLK), nullptr, nullptr, 0); + + // Validates 'MCLK' is in the structure + if (txt_power_dev_od_voltage.contains_structured_key(KTAG_MCLK, + KTAG_FIRST_FREQ_IDX)) { + p->curr_mclk_range.lower_bound = freq_string_to_int(build_lower_bound(KTAG_MCLK), nullptr, nullptr, 0); + p->curr_mclk_range.upper_bound = freq_string_to_int(build_upper_bound(KTAG_MCLK), nullptr, nullptr, 0); + } } - - if (val_vec.size() < kOD_VDDC_CURVE_label_array_index) { - return RSMI_STATUS_UNEXPECTED_SIZE; + else 
{ + return RSMI_STATUS_NOT_YET_IMPLEMENTED; } - - p->num_regions = - static_cast((val_vec.size()) / 2); + p->num_regions = 0; return RSMI_STATUS_SUCCESS; CATCH @@ -1561,30 +1562,6 @@ rsmi_status_t rsmi_dev_od_volt_info_set(uint32_t dv_ind, uint32_t vpoint, } -static void get_vc_region(uint32_t start_ind, - std::vector *val_vec, rsmi_freq_volt_region_t *p) { - std::ostringstream ss; - ss << __PRETTY_FUNCTION__ << " | ======= start ======="; - LOG_TRACE(ss); - assert(p != nullptr); - assert(val_vec != nullptr); - THROW_IF_NULLPTR_DEREF(p) - THROW_IF_NULLPTR_DEREF(val_vec) - - // There must be at least 1 region to read in - assert(val_vec->size() >= kOD_OD_RANGE_label_array_index + 2); - assert((*val_vec)[kOD_OD_RANGE_label_array_index] == "OD_RANGE:"); - if ((val_vec->size() < kOD_OD_RANGE_label_array_index + 2) || - ((*val_vec)[kOD_OD_RANGE_label_array_index] != "OD_RANGE:") ) { - ss << __PRETTY_FUNCTION__ << " | ======= end ======= | returning " - << getRSMIStatusString(RSMI_STATUS_UNEXPECTED_DATA); - LOG_TRACE(ss); - throw amd::smi::rsmi_exception(RSMI_STATUS_UNEXPECTED_DATA, __FUNCTION__); - } - od_value_pair_str_to_range((*val_vec)[start_ind], &p->freq_range); - od_value_pair_str_to_range((*val_vec)[start_ind + 1], &p->volt_range); -} - /* * num_regions [inout] on calling, the number of regions requested to be read * in. At completion, the number of regions actually read in @@ -1616,23 +1593,20 @@ static rsmi_status_t get_od_clk_volt_curve_regions(uint32_t dv_ind, // This is a work-around to handle systems where kDevPowerODVoltage is not // fully supported yet. 
- if (val_vec.size() < 2) { + if (val_vec.size() < kMIN_VALID_LINES) { ss << __PRETTY_FUNCTION__ - << " | Issue: val_vec.size() < 2" << "; returning " + << " | Issue: val_vec.size() < " << kMIN_VALID_LINES << "; returning " << getRSMIStatusString(RSMI_STATUS_NOT_YET_IMPLEMENTED); LOG_ERROR(ss); return RSMI_STATUS_NOT_YET_IMPLEMENTED; } uint32_t val_vec_size = static_cast(val_vec.size()); - assert((val_vec_size - kOD_VDDC_CURVE_start_index) > 0); - ss << __PRETTY_FUNCTION__ << " | val_vec_size = " << std::dec - << val_vec_size - << " | kOD_VDDC_CURVE_start_index = " << kOD_VDDC_CURVE_start_index; + << val_vec_size; LOG_DEBUG(ss); - *num_regions = std::min((val_vec_size) / 2, *num_regions); + *num_regions = 0; return RSMI_STATUS_SUCCESS; CATCH diff --git a/src/rocm_smi_utils.cc b/src/rocm_smi_utils.cc index 6eeffe28..8f13606e 100755 --- a/src/rocm_smi_utils.cc +++ b/src/rocm_smi_utils.cc @@ -1098,14 +1098,6 @@ std::string print_rsmi_od_volt_freq_data_t(rsmi_od_volt_freq_data_t *odv) { ss << pt_rng_Mhz("\t**Current SCLK frequency range: ", &odv->curr_sclk_range); ss << pt_rng_Mhz("\t**Current MCLK frequency range: ", &odv->curr_mclk_range); - ss << pt_rng_Mhz("\t**Min/Max Possible SCLK frequency range: ", - &odv->sclk_freq_limits); - ss << pt_rng_Mhz("\t**Min/Max Possible MCLK frequency range: ", - &odv->mclk_freq_limits); - - ss << "\t**Current Freq/Volt. curve: " << "\n"; - ss << pt_vddc_curve(&odv->curve); - ss << "\t**Number of Freq./Volt. 
regions: " << odv->num_regions << "\n\n"; return ss.str(); } @@ -1203,5 +1195,6 @@ std::queue getAllDeviceGfxVers() { return deviceGfxVersions; } + } // namespace smi } // namespace amd diff --git a/tests/rocm_smi_test/functional/mutual_exclusion.cc b/tests/rocm_smi_test/functional/mutual_exclusion.cc index 026182eb..348b169d 100755 --- a/tests/rocm_smi_test/functional/mutual_exclusion.cc +++ b/tests/rocm_smi_test/functional/mutual_exclusion.cc @@ -186,10 +186,10 @@ void TestMutualExclusion::Run(void) { int64_t dmy_i64 = 0; char dmy_str[10]; rsmi_dev_perf_level_t dmy_perf_lvl; - rsmi_frequencies_t dmy_freqs; - rsmi_od_volt_freq_data_t dmy_od_volt; - rsmi_freq_volt_region_t dmy_vlt_reg; - rsmi_error_count_t dmy_err_cnt; + rsmi_frequencies_t dmy_freqs{}; + rsmi_od_volt_freq_data_t dmy_od_volt{}; + rsmi_freq_volt_region_t dmy_vlt_reg{}; + rsmi_error_count_t dmy_err_cnt{}; rsmi_ras_err_state_t dmy_ras_err_st; // This can be replaced with ASSERT_EQ() once env. stabilizes diff --git a/tests/rocm_smi_test/functional/volt_freq_curv_read.cc b/tests/rocm_smi_test/functional/volt_freq_curv_read.cc index 50b6ac05..360ad7e8 100755 --- a/tests/rocm_smi_test/functional/volt_freq_curv_read.cc +++ b/tests/rocm_smi_test/functional/volt_freq_curv_read.cc @@ -87,7 +87,7 @@ void TestVoltCurvRead::Close() { void TestVoltCurvRead::Run(void) { rsmi_status_t err, ret; - rsmi_od_volt_freq_data_t odv; + rsmi_od_volt_freq_data_t odv{}; rsmi_dev_perf_level_t pfl; TestBase::Run(); @@ -134,9 +134,6 @@ void TestVoltCurvRead::Run(void) { IF_VERB(STANDARD) { std::cout << "\t**rsmi_dev_od_volt_info_get(i, nullptr): " << amd::smi::getRSMIStatusString(err, false) << "\n"; - // << "\n" - // << amd::smi::print_rsmi_od_volt_freq_data_t(&odv) - // << "\n"; } ASSERT_TRUE(err == RSMI_STATUS_INVALID_ARGS); err = rsmi_dev_od_volt_info_get(i, &odv); @@ -147,43 +144,5 @@ void TestVoltCurvRead::Run(void) { << "\t**odv.num_regions = " << std::dec << odv.num_regions << "\n"; } - if (err == RSMI_STATUS_SUCCESS) { - 
std::cout << "\t**Frequency-voltage curve data:" << "\n"; - std::cout << amd::smi::print_rsmi_od_volt_freq_data_t(&odv); - - rsmi_freq_volt_region_t *regions; - uint32_t num_regions; - regions = new rsmi_freq_volt_region_t[odv.num_regions]; - ASSERT_TRUE(regions != nullptr); - - num_regions = odv.num_regions; - err = rsmi_dev_od_volt_curve_regions_get(i, &num_regions, regions); - IF_VERB(STANDARD) { - std::cout << "\t**rsmi_dev_od_volt_curve_regions_get(" - << "i, &num_regions, regions): " - << amd::smi::getRSMIStatusString(err, false) << "\n" - << "\t**Number of regions: " << std::dec << num_regions - << "\n"; - } - ASSERT_TRUE(err == RSMI_STATUS_SUCCESS - || err == RSMI_STATUS_NOT_SUPPORTED - || err == RSMI_STATUS_UNEXPECTED_DATA - || err == RSMI_STATUS_UNEXPECTED_SIZE); - if (err != RSMI_STATUS_SUCCESS) { - IF_VERB(STANDARD) { - std::cout << "\t**rsmi_dev_od_volt_curve_regions_get: " - "Not supported on this machine" << std::endl; - } - continue; - } - CHK_ERR_ASRT(err) - ASSERT_TRUE(num_regions == odv.num_regions); - - std::cout << "\t**Frequency-voltage curve regions:" << std::endl; - std::cout << amd::smi::print_rsmi_od_volt_freq_regions(num_regions, - regions); - - delete []regions; - } } } From c4258481412c5fe59d9cc9028f30f843416bcea5 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Wed, 1 May 2024 23:42:46 -0500 Subject: [PATCH 03/10] Bump version lib:7.1.0 tool:2.1.0+hash Signed-off-by: Maisam Arif Change-Id: I6f3d7c64aacf36c9d33d663e23559a7f50cd8db6 --- CMakeLists.txt | 2 +- docs/python_api.rst | 2 -- python_smi_tools/rocm_smi.py | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ce86a396..3e6ac696 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,7 +38,7 @@ find_program (GIT NAMES git) ## Setup the package version based on git tags. 
set(PKG_VERSION_GIT_TAG_PREFIX "rsmi_pkg_ver") -get_package_version_number("7.0.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) +get_package_version_number("7.1.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) message("Package version: ${PKG_VERSION_STR}") set(${ROCM_SMI_LIBS_TARGET}_VERSION_MAJOR "${CPACK_PACKAGE_VERSION_MAJOR}") set(${ROCM_SMI_LIBS_TARGET}_VERSION_MINOR "${CPACK_PACKAGE_VERSION_MINOR}") diff --git a/docs/python_api.rst b/docs/python_api.rst index 604803b9..36f152cb 100644 --- a/docs/python_api.rst +++ b/docs/python_api.rst @@ -196,8 +196,6 @@ Functions .. autofunction:: rocm_smi.showPower -.. autofunction:: rocm_smi.showPowerPlayTable - .. autofunction:: rocm_smi.showProduct .. autofunction:: rocm_smi.showProfile diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index 334fb9be..5a7b7aa8 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -31,7 +31,7 @@ # Patch version - Increment when adding a fix, set to 0 when minor is incremented # Hash version - Shortened commit hash. Print here and not with lib for consistency with amd-smi SMI_MAJ = 2 -SMI_MIN = 0 +SMI_MIN = 1 SMI_PAT = 0 # SMI_HASH is provided by rsmiBindings __version__ = '%s.%s.%s+%s' % (SMI_MAJ, SMI_MIN, SMI_PAT, SMI_HASH) From 3d82f1799d20a43c3a20602435c85f661f911316 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Thu, 2 May 2024 11:09:57 -0500 Subject: [PATCH 04/10] Remove thread safe only mutex warning message In multi-GPU environments, too many of these warning messages are generated, so the warning is removed.
Change-Id: I275de2397eb0e6b189e2e17e94335cb1e8f97815 --- third_party/shared_mutex/shared_mutex.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/third_party/shared_mutex/shared_mutex.cc b/third_party/shared_mutex/shared_mutex.cc index 16da69d5..7206ae93 100755 --- a/third_party/shared_mutex/shared_mutex.cc +++ b/third_party/shared_mutex/shared_mutex.cc @@ -141,7 +141,6 @@ shared_mutex_t shared_mutex_init(const char *name, mode_t mode, bool retried) { amd::smi::RocmSMI& smi = amd::smi::RocmSMI::getInstance(); if (GetEnvVarUInteger(THREAD_ONLY_ENV_VAR) == 1 || smi.is_thread_only_mutex()) { - fprintf(stderr, "rocm-smi: using thread safe only mutex\n"); return init_thread_safe_only(name); } From 0c48cd9122d2b685185a5b921c0e55a90ac24dc2 Mon Sep 17 00:00:00 2001 From: Ori Messinger Date: Thu, 2 May 2024 13:59:14 -0400 Subject: [PATCH 05/10] ROCm SMI LIB: Fix rsmiBindings.py.in Mismatch This commit aligns the rsmiBindings.py.in file's "notification_type_names" & "rsmi_evt_notification_type_t" with those found in the rsmiBindings.py file. Change-Id: I67f36606c505992fb98495651310bd70a1755033 Signed-off-by: Ori Messinger --- CHANGELOG.md | 23 +++++++++++++++++++++++ include/rocm_smi/rocm_smi.h | 1 + python_smi_tools/rsmiBindings.py | 2 +- python_smi_tools/rsmiBindings.py.in | 3 ++- 4 files changed, 27 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1ed65dd8..9ece6ec9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,29 @@ Full documentation for rocm_smi_lib is available at [https://rocm.docs.amd.com/] ***All information listed below is for reference and subject to change.*** +## rocm_smi_lib for ROCm 6.1.2 + +### Added + +- **Added Ring Hang event** +Added `RSMI_EVT_NOTIF_RING_HANG` to the possible events in the `rsmi_evt_notification_type_t` enum. 
+ +### Changed + +- N/A + +### Optimized + +- N/A + +### Fixed + +- N/A + +### Known Issues + +- N/A + ## rocm_smi_lib for ROCm 6.1.1 ### Added diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index d82a21d3..cfbc9ae6 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -311,6 +311,7 @@ typedef struct { * Event notification event types */ typedef enum { + RSMI_EVT_NOTIF_NONE = KFD_SMI_EVENT_NONE, //!< Unused RSMI_EVT_NOTIF_VMFAULT = KFD_SMI_EVENT_VMFAULT, //!< VM page fault RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_VMFAULT, RSMI_EVT_NOTIF_THERMAL_THROTTLE = KFD_SMI_EVENT_THERMAL_THROTTLE, diff --git a/python_smi_tools/rsmiBindings.py b/python_smi_tools/rsmiBindings.py index 3dfab10f..87305140 100644 --- a/python_smi_tools/rsmiBindings.py +++ b/python_smi_tools/rsmiBindings.py @@ -107,8 +107,8 @@ class rsmi_dev_perf_level_t(c_int): class rsmi_evt_notification_type_t(c_int): RSMI_EVT_NOTIF_NONE = 0 - RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_NONE RSMI_EVT_NOTIF_VMFAULT = 1 + RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_VMFAULT RSMI_EVT_NOTIF_THERMAL_THROTTLE = 2 RSMI_EVT_NOTIF_GPU_PRE_RESET = 3 RSMI_EVT_NOTIF_GPU_POST_RESET = 4 diff --git a/python_smi_tools/rsmiBindings.py.in b/python_smi_tools/rsmiBindings.py.in index b24665cf..18a85358 100644 --- a/python_smi_tools/rsmiBindings.py.in +++ b/python_smi_tools/rsmiBindings.py.in @@ -134,10 +134,11 @@ class rsmi_dev_perf_level_t(c_int): RSMI_DEV_PERF_LEVEL_UNKNOWN = 0x100 -notification_type_names = ['VM_FAULT', 'THERMAL_THROTTLE', 'GPU_RESET'] +notification_type_names = ['VM_FAULT', 'THERMAL_THROTTLE', 'GPU_PRE_RESET', 'GPU_POST_RESET', 'RING_HANG'] class rsmi_evt_notification_type_t(c_int): + RSMI_EVT_NOTIF_NONE = 0 RSMI_EVT_NOTIF_VMFAULT = 1 RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_VMFAULT RSMI_EVT_NOTIF_THERMAL_THROTTLE = 2 From 8e6d66e15b7b75e97cead905cf966f4c6fc57c54 Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Fri, 3 May 2024 02:58:31 -0500 Subject: [PATCH 06/10] fix: 
[SWDEV-458862] [rocm/rocm_smi_lib] Fixes reading pp_od_clk_voltage new variable format and size. Code changes related to the following: * get_od_clk_volt_info() * get_od_clk_volt_curve_regions() * Unit tests * CLI options restored: --showclkvolt, --showvc, --showvoltagerange, --setvc * Rework: 48ddd9ab * Bump CLI version * CHANGELOG.md Change-Id: I817ca224de923fdaa992df84592d63b4d5a12b22 Signed-off-by: Oliveira, Daniel --- CHANGELOG.md | 113 ++++++++--------- include/rocm_smi/rocm_smi.h | 4 - include/rocm_smi/rocm_smi_utils.h | 23 ++-- python_smi_tools/rocm_smi.py | 117 +++++++++++++++++- src/rocm_smi.cc | 58 ++++++++- src/rocm_smi_utils.cc | 9 ++ .../functional/perf_determinism.cc | 4 +- .../functional/volt_freq_curv_read.cc | 56 ++++++--- 8 files changed, 294 insertions(+), 90 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9ece6ec9..be5fc7fc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,7 +21,8 @@ Added `RSMI_EVT_NOTIF_RING_HANG` to the possible events in the `rsmi_evt_notific ### Fixed -- N/A +- **Fixed parsing of `pp_od_clk_voltage` within `get_od_clk_volt_info`** +The parsing of `pp_od_clk_voltage` was not dynamic enough to work with the dropping of voltage curve support on MI series cards. ### Known Issues @@ -30,15 +31,15 @@ Added `RSMI_EVT_NOTIF_RING_HANG` to the possible events in the `rsmi_evt_notific ## rocm_smi_lib for ROCm 6.1.1 ### Added -- **Unlock mutex if process is dead** +- **Unlock mutex if process is dead** Added in order to unlock mutex when process is dead. Additional debug output has been added if futher issues are detected. -- **Added Partition ID to rocm-smi CLI** +- **Added Partition ID to rocm-smi CLI** `rsmi_dev_pci_id_get()` now provides partition ID. See API for better detail. Previously these bits were reserved bits (right before domain) and partition id was within function. 
- bits [63:32] = domain - bits [31:28] = partition id - bits [27:16] = reserved - - bits [15: 0] = pci bus/device/function + - bits [15: 0] = pci bus/device/function rocm-smi now provides partition ID in `rocm-smi` and `rocm-smi --showhw`. If device supports partitioning and is in a non-SPX mode (CPX, DPX,TPX,... etc) partition ID will be non-zero. In SPX and non-supported devices will show as 0. See examples provided below. @@ -47,11 +48,11 @@ rocm-smi now provides partition ID in `rocm-smi` and `rocm-smi --showhw`. If dev ========================================= ROCm System Management Interface ========================================= =================================================== Concise Info =================================================== -Device Node IDs Temp Power Partitions SCLK MCLK Fan Perf PwrCap VRAM% GPU% - (DID, GUID) (Edge) (Avg) (Mem, Compute, ID) +Device Node IDs Temp Power Partitions SCLK MCLK Fan Perf PwrCap VRAM% GPU% + (DID, GUID) (Edge) (Avg) (Mem, Compute, ID) ==================================================================================================================== -0 1 0x73bf, 34495 43.0°C 6.0W N/A, N/A, 0 0Mhz 96Mhz 0% manual 150.0W 3% 0% -1 2 0x73a3, 22215 34.0°C 8.0W N/A, N/A, 0 0Mhz 96Mhz 20.0% manual 213.0W 0% 0% +0 1 0x73bf, 34495 43.0°C 6.0W N/A, N/A, 0 0Mhz 96Mhz 0% manual 150.0W 3% 0% +1 2 0x73a3, 22215 34.0°C 8.0W N/A, N/A, 0 0Mhz 96Mhz 20.0% manual 213.0W 0% 0% ==================================================================================================================== =============================================== End of ROCm SMI Log ================================================ ``` @@ -78,15 +79,15 @@ GPU NODE DID GUID GFX VER GFX RAS SDMA RAS UMC RAS VBIOS BUS ======================================= End of ROCm SMI Log ======================================== ``` -- **Added `NODE`, `GUID`, and `GFX Version`** +- **Added `NODE`, `GUID`, and `GFX Version`** Changes impact the following rocm-smi 
CLIs: - `rocm-smi` - `rocm-smi -i` - `rocm-smi --showhw` - `rocm-smi --showproduct` - `NODE` - is the KFD node, since these can both be CPU and GPU devices. This field is invariant between boots. - `GUID` - also known as GPU ID. GUID is the KFD GPU's ID. This field has a chance to be variant between boots. + `NODE` - is the KFD node, since these can both be CPU and GPU devices. This field is invariant between boots. + `GUID` - also known as GPU ID. GUID is the KFD GPU's ID. This field has a chance to be variant between boots. `GFX Version` - this is the device's target graphics version. See below for a few example outputs. @@ -184,12 +185,12 @@ GPU[3] : GFX Version: gfx942 ================================== End of ROCm SMI Log =================================== ``` -- **Documentation now includes C++ and Python: tutorials, API guides, and C++ reference pages** +- **Documentation now includes C++ and Python: tutorials, API guides, and C++ reference pages** See [https://rocm.docs.amd.com/](https://rocm.docs.amd.com/projects/rocm_smi_lib/en/latest/) once 6.1.1 is released. ### Changed -- **Aligned `rocm-smi` fields display "N/A" instead of "unknown"/"unsupported": `Card ID`, `DID`, `Model`, `SKU`, and `VBIOS`** +- **Aligned `rocm-smi` fields display "N/A" instead of "unknown"/"unsupported": `Card ID`, `DID`, `Model`, `SKU`, and `VBIOS`** Impacts the following commands: - `rocm-smi` - see other examples above for 6.1.1 - `rocm-smi --showhw` - see other examples above for 6.1.1 @@ -208,11 +209,11 @@ GPU[3] : VBIOS version: N/A ========================================================================================== ================================== End of ROCm SMI Log =================================== ``` -- **Removed stacked id formatting in `rocm-smi`** +- **Removed stacked id formatting in `rocm-smi`** This is to simplify identifiers helpful to users. 
More identifiers can be found on: - `rocm-smi -i` - `rocm-smi --showhw` - - `rocm-smi --showproduct` + - `rocm-smi --showproduct` See examples shown above for 6.1.1. Previous output example can be seen below. ```shell @@ -233,18 +234,18 @@ Device [Model : Revision] Temp Power Partitions SCLK MCLK - N/A ### Fixed -- **Fixed HIP and ROCm SMI mismatch on GPU bus assignments** +- **Fixed HIP and ROCm SMI mismatch on GPU bus assignments** These changes prompted us to to provide better visability for our device nodes and partition IDs (see addition provided above). See examples below for fix overview. -1. MI300a GPU device `Domain:Bus:Device.function` clashes with another AMD USB device -Cause(s): -a. ROCm SMI did not propagate domain consistently (for partitioned devices) +1. MI300a GPU device `Domain:Bus:Device.function` clashes with another AMD USB device +Cause(s): +a. ROCm SMI did not propagate domain consistently (for partitioned devices) b. AMD GPU driver previously reported partition IDs within function node - causing clash with the other AMD USB device PCIe ID displayed. -2. Domain does not propagate for devices which support partitioning (MI300x/a) -Cause(s): -a. ROCm SMI did not propagate domain consistently (for partitioned devices) -3. Displayed topology will show disordered nodes when compared to HIP -Cause(s): -a. ROCm SMI did not propogate domain consistently (for partitioned devices) +2. Domain does not propagate for devices which support partitioning (MI300x/a) +Cause(s): +a. ROCm SMI did not propagate domain consistently (for partitioned devices) +3. Displayed topology will show disordered nodes when compared to HIP +Cause(s): +a. 
ROCm SMI did not propogate domain consistently (for partitioned devices) *Device in TPX* ```shell @@ -305,9 +306,9 @@ GPU[11] : (Topology) Numa Node: 3 GPU[11] : (Topology) Numa Affinity: 3 ================================== End of ROCm SMI Log =================================== ``` -- **Fixed memory leaks** +- **Fixed memory leaks** Caused by not closing directories and creating maps nodes instead of checking using by using .at(). -- **Fixed Python rocm_smi API calls** +- **Fixed Python rocm_smi API calls** Fixed initializing calls which reuse rocmsmi.initializeRsmi() bindings. ```shell @@ -318,7 +319,7 @@ Traceback (most recent call last): ret_init = rocmsmi.rsmi_init(0) NameError: name 'rocmsmi' is not defined ``` -- **Fixed rsmi_dev_activity_metric_get gfx/memory activity does not update with GPU activity** +- **Fixed rsmi_dev_activity_metric_get gfx/memory activity does not update with GPU activity** Checks and forces rereading gpu metrics unconditionally. ### Known Issues @@ -327,10 +328,10 @@ NameError: name 'rocmsmi' is not defined ## rocm_smi_lib for ROCm 6.1.0 ### Added -- **Added support to set max/min clock level for sclk (`RSMI_CLK_TYPE_SYS`) or mclk (`RSMI_CLK_TYPE_MEM`)** +- **Added support to set max/min clock level for sclk (`RSMI_CLK_TYPE_SYS`) or mclk (`RSMI_CLK_TYPE_MEM`)** Users can now set a maximum or minimum sclk or mclk value through `rsmi_dev_clk_extremum_set()` API provided ASIC support. Alternatively, users can -use our Python CLI tool (`rocm-smi --setextremum max sclk 1500`). See example below. - +use our Python CLI tool (`rocm-smi --setextremum max sclk 1500`). See example below. 
+ ```shell $ sudo /opt/rocm/bin/rocm-smi --setextremum max sclk 2100 @@ -358,15 +359,15 @@ GPU[3] : Successfully set max sclk to 2100(MHz) ================================== End of ROCm SMI Log =================================== ``` -- **Added `rsmi_dev_target_graphics_version_get()`** +- **Added `rsmi_dev_target_graphics_version_get()`** Users can now query through ROCm SMI API (`rsmi_dev_target_graphics_version_get()`) to retreive the target graphics version for a GPU device. Currently, this output is not supplied through our rocm-smi CLI. ### Changed -- **Removed non-unified API headers: Individual GPU metric APIs are no longer supported** +- **Removed non-unified API headers: Individual GPU metric APIs are no longer supported** The individual metric APIs (`rsmi_dev_metrics_*`) were removed in order to keep updates easier for new GPU metric support. By providing a simple API (`rsmi_dev_gpu_metrics_info_get()`) with its reported device metrics, it is worth noting there is a risk for ABI break-age using `rsmi_dev_gpu_metrics_info_get()`. It is vital to understand, that ABI breaks are necessary (in some cases) in order to support newer ASICs and metrics for our customers. We will continue to support `rsmi_dev_gpu_metrics_info_get()` with these considerations and limitations in mind. -- **Depricated rsmi_dev_power_ave_get(), use newer API rsmi_dev_power_get()** +- **Depricated rsmi_dev_power_ave_get(), use newer API rsmi_dev_power_get()** As outlined in change below for 6.0.0 (***Added a generic power API: rsmi_dev_power_get***), is now depricated. Please update your ROCm SMI API calls accordingly. 
### Optimizations @@ -374,17 +375,17 @@ As outlined in change below for 6.0.0 (***Added a generic power API: rsmi_dev_po ### Fixed -- Fix `--showpids` reporting `[PID] [PROCESS NAME] 1 UNKNOWN UNKNOWN UNKNOWN` +- Fix `--showpids` reporting `[PID] [PROCESS NAME] 1 UNKNOWN UNKNOWN UNKNOWN` Output was failing because cu_occupancy debugfs method is not provided on some graphics cards by design. `get_compute_process_info_by_pid` was updated to reflect this and returns with output needed by CLI. -- Fix `rocm-smi --showpower` output was inconsistent on Navi32/31 devices +- Fix `rocm-smi --showpower` output was inconsistent on Navi32/31 devices Updated to use `rsmi_dev_power_get()` within CLI to provide a consistent device power output. This was caused due to using the now depricated `rsmi_dev_average_power_get()` API. -- Fixed `rocm-smi --setcomputepartition` and `rocm-smi --resetcomputepartition` to notate if device is EBUSY -- Fixed `rocm-smi --setmemorypartition` and `rocm-smi --resetmemorypartition` read only SYSFS to return RSMI_STATUS_NOT_SUPPORTED +- Fixed `rocm-smi --setcomputepartition` and `rocm-smi --resetcomputepartition` to notate if device is EBUSY +- Fixed `rocm-smi --setmemorypartition` and `rocm-smi --resetmemorypartition` read only SYSFS to return RSMI_STATUS_NOT_SUPPORTED The `rsmi_dev_memory_partition_set` API is updated to handle the readonly SYSFS check. Corresponding tests and CLI (`rocm-smi --setmemorypartition` and `rocm-smi --resetmemorypartition`) calls were updated accordingly. - Fix `rocm-smi --showclkvolt` and `rocm-smi --showvc` displaying 0 for overdrive and voltage curve is not supported ### Known Issues -- **HIP and ROCm SMI mismatch on GPU bus assignments** +- **HIP and ROCm SMI mismatch on GPU bus assignments** Three separate issues have been identified: 1. 
MI300a GPU device `Domain:Bus:Device.function` clashes with another AMD USB device ```shell @@ -408,7 +409,7 @@ GPU[3] : PCI Bus: 0000:01:00.3 ========================================================================================== ================================== End of ROCm SMI Log =================================== ``` -2. Domain does not propagate for devices which support partitioning (MI300x/a) +2. Domain does not propagate for devices which support partitioning (MI300x/a) For example, a device in non-SPX (single partition) - devices will overlap in function device. ```shell $ rocm-smi --showbus @@ -440,7 +441,7 @@ GPU[22] : PCI Bus: 0002:01:00.0 GPU[23] : PCI Bus: 0003:01:00.0 ================================== End of ROCm SMI Log =================================== ``` -3. Displayed topology will show disordered nodes when compared to HIP +3. Displayed topology will show disordered nodes when compared to HIP See rocm-smi output vs transferbench. ```shell rocm-smi --showtopo option is not displaying the correct information when the MI300 driver is loaded in TPX mode. @@ -514,9 +515,9 @@ GPU[11] : (Topology) Numa Affinity: 3 ### Added -- **Added rocm-smi --version** -The SMI will report two "versions", ROCM-SMI version and other is ROCM-SMI-LIB version. - - The ROCM-SMI version is the CLI/tool version number with commit ID appended after `+` sign. +- **Added rocm-smi --version** +The SMI will report two "versions", ROCM-SMI version and other is ROCM-SMI-LIB version. + - The ROCM-SMI version is the CLI/tool version number with commit ID appended after `+` sign. - The ROCM-SMI-LIB version is the library package version number. ``` $ rocm-smi --version @@ -524,11 +525,11 @@ ROCM-SMI version: 2.0.0+8e78352 ROCM-SMI-LIB version: 6.0.0 ``` -- **Added support for gfx941/gfx942 metrics** +- **Added support for gfx941/gfx942 metrics** You can now query MI300 device metrics to get real-time information. Metrics include power, temperature, energy, and performance. 
Users can query through `rsmi_dev_gpu_metrics_info_get()`. -- **Compute and memory partition support** +- **Compute and memory partition support** Users can now view, set, and reset partitions. The topology display can provide a more in-depth look at the device's current configuration. If your ASIC supports these features, the following commands can help get started: - `rocm-smi --showcomputepartition` - `rocm-smi --setcomputepartition ` @@ -539,23 +540,23 @@ Users can now view, set, and reset partitions. The topology display can provide ### Changed -- **GPU index sorting made consistent with other tools** +- **GPU index sorting made consistent with other tools** To ensure alignment with other ROCm software tools, GPU index sorting is optimized to use Bus:Device.Function (BDF) rather than the card number. -- **Increase max BDF ID length** +- **Increase max BDF ID length** To allow for larger BDF data, we have increased the maximum BDF length from 256 to 512 buffer size. -- **Documentation is transitioning to Sphinx** +- **Documentation is transitioning to Sphinx** Sphinx allows us to generate code documentation easier for our users. Helps us provide centrized HTML documentation at single website location. Here customers can see how to use our software and tools. -- **Added a generic power API: `rsmi_dev_power_get()`** -Older ASICs provided average socket power, newer ASICs (MI300) provide current socket power. The generic API provides one interface to retreive either of these power readings, allowing backwards compatability. +- **Added a generic power API: `rsmi_dev_power_get()`** +Older ASICs provided average socket power, newer ASICs (MI300) provide current socket power. The generic API provides one interface to retreive either of these power readings, allowing backwards compatability. 
-- **Added flexible temperature readings (`rocm-smi` and `rocm-smi --showtempgraph`)** +- **Added flexible temperature readings (`rocm-smi` and `rocm-smi --showtempgraph`)** Older ASICs provided edge temperature, newer ASICs (MI300) provide junction socket power (not edge). The rocm-smi CLI now provides a way to view which type of temperature is read across all sockets. -- **Added deep sleep frequency readings** -Newer ASICs (MI300) provide ability to know if a clock is in deep sleep. +- **Added deep sleep frequency readings** +Newer ASICs (MI300) provide ability to know if a clock is in deep sleep. ### Optimizations @@ -566,8 +567,8 @@ Newer ASICs (MI300) provide ability to know if a clock is in deep sleep. ### Fixed - Fix memory usage division by 0 -- Fix missing firmware blocks (rocm-smi --showfw) -- Fix rocm-smi --showevents shows wrong gpuID +- Fix missing firmware blocks (rocm-smi --showfw) +- Fix rocm-smi --showevents shows wrong gpuID ## rocm_smi_lib for ROCm 5.5.0 diff --git a/include/rocm_smi/rocm_smi.h b/include/rocm_smi/rocm_smi.h index cfbc9ae6..3d71530b 100755 --- a/include/rocm_smi/rocm_smi.h +++ b/include/rocm_smi/rocm_smi.h @@ -2870,8 +2870,6 @@ rsmi_status_t rsmi_dev_od_clk_info_set(uint32_t dv_ind, rsmi_freq_ind_t level, /** * @brief This function sets 1 of the 3 voltage curve points. * - * @deprecated This function is deprecated due to driver changes. - * * @details Given a device index @p dv_ind, a voltage point @p vpoint * and a voltage value @p voltvalue this function will set voltage curve point * @@ -2897,8 +2895,6 @@ rsmi_status_t rsmi_dev_od_volt_info_set(uint32_t dv_ind, uint32_t vpoint, * @brief This function will retrieve the current valid regions in the * frequency/voltage space. * - * @deprecated This function is deprecated due to driver changes. 
- * * @details Given a device index @p dv_ind, a pointer to an unsigned integer * @p num_regions and a buffer of ::rsmi_freq_volt_region_t structures, @p * buffer, this function will populate @p buffer with the current diff --git a/include/rocm_smi/rocm_smi_utils.h b/include/rocm_smi/rocm_smi_utils.h index 8196822c..74082c49 100755 --- a/include/rocm_smi/rocm_smi_utils.h +++ b/include/rocm_smi/rocm_smi_utils.h @@ -554,9 +554,9 @@ class TagTextContents_t ++line_counter; } - // Any remaining elements? + // Any remaining elements? If so, the data belongs to the last found section title if (line_counter > bottom_title_line) { - update_primary_tbl(bottom_title_line, (line_counter - 1)); + update_primary_tbl(bottom_title_line, line_counter); } } @@ -570,15 +570,24 @@ class TagTextContents_t // Note: Organizes table with Title as a Key, a Key/ID for values and values. // It takes into consideration the initial constraints were all good and // that the primary table has been populated. + auto sec_key = std::string(); + auto sec_data = std::string(); + auto auto_key = uint32_t(0); for (const auto& [prim_key, prim_values] : m_primary) { for (const auto& value : prim_values) { if (auto mark_pos = value.find_first_of(m_line_splitter_mark.c_str()); mark_pos != std::string::npos) { - auto sec_key = trim(value.substr(0, mark_pos + 1)); - auto sec_data = trim(value.substr((mark_pos + 1), value.size())); - if (!sec_key.empty()) { - m_structured[prim_key].insert(std::make_pair(sec_key, sec_data)); - } + sec_key = trim(value.substr(0, mark_pos + 1)); + sec_data = trim(value.substr((mark_pos + 1), value.size())); + } + // In case there is no 'key' based on the data token marker, generate one. 
+ else { + sec_key = std::to_string(auto_key) + m_line_splitter_mark; + sec_data = trim(value.substr(0, value.size())); + ++auto_key; + } + if (!sec_key.empty()) { + m_structured[prim_key].insert(std::make_pair(sec_key, sec_data)); } } } diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index 5a7b7aa8..9e35f556 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -31,7 +31,7 @@ # Patch version - Increment when adding a fix, set to 0 when minor is incremented # Hash version - Shortened commit hash. Print here and not with lib for consistency with amd-smi SMI_MAJ = 2 -SMI_MIN = 1 +SMI_MIN = 2 SMI_PAT = 0 # SMI_HASH is provided by rsmiBindings __version__ = '%s.%s.%s+%s' % (SMI_MAJ, SMI_MIN, SMI_PAT, SMI_HASH) @@ -1270,6 +1270,34 @@ def setClockExtremum(deviceList, level, clkType, clkValue, autoRespond): printLog(device, 'Setting %s %s clock is not supported for this device.' % (level, clkType), None) +def setVoltageCurve(deviceList, point, clk, volt, autoRespond): + """ Set voltage curve for a point in the PowerPlay table for a list of devices. 
+ + :param deviceList: List of DRM devices (can be a single-item list) + :param point: Point on the voltage curve to modify + :param clk: Clock speed specified for this curve point + :param volt: Voltage specified for this curve point + :param autoRespond: Response to automatically provide for all prompts + """ + global RETCODE + value = '%s %s %s' % (point, clk, volt) + try: + any(int(item) for item in value.split()) + except ValueError: + printErrLog(None, 'Unable to set Voltage curve') + printErrLog(None, 'Non-integer characters are present in %s' %value) + RETCODE = 1 + return + confirmOutOfSpecWarning(autoRespond) + for device in deviceList: + ret = rocmsmi.rsmi_dev_od_volt_info_set(device, int(point), int(clk), int(volt)) + if rsmi_ret_ok(ret, device, 'set_voltage_curve'): + printLog(device, 'Successfully set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt), None) + else: + printErrLog(device, 'Unable to set voltage point %s to %s(MHz) %s(mV)' % (point, clk, volt)) + RETCODE = 1 + + def setPowerPlayTableLevel(deviceList, clkType, point, clk, volt, autoRespond): """ Set clock frequency and voltage for a level in the PowerPlay table for a list of devices. 
@@ -2683,6 +2711,38 @@ def showPower(deviceList): printLogSpacer() +def showPowerPlayTable(deviceList): + """ Display current GPU Memory clock frequencies and voltages for a list of devices + + :param deviceList: List of DRM devices (can be a single-item list) + """ + global PRINT_JSON + if PRINT_JSON: + return + printLogSpacer(' GPU Memory clock frequencies and voltages ') + odvf = rsmi_od_volt_freq_data_t() + for device in deviceList: + ret = rocmsmi.rsmi_dev_od_volt_info_get(device, byref(odvf)) + if rsmi_ret_ok(ret, device, 'get_od_volt'): + # TODO: Make this more dynamic and less hard-coded if possible + printLog(device, 'OD_SCLK:', None) + printLog(device, '0: %sMhz' % (int(odvf.curr_sclk_range.lower_bound / 1000000)), None) + printLog(device, '1: %sMhz' % (int(odvf.curr_sclk_range.upper_bound / 1000000)), None) + printLog(device, 'OD_MCLK:', None) + printLog(device, '0: %sMhz' % (int(odvf.curr_mclk_range.lower_bound / 1000000)), None) + printLog(device, '1: %sMhz' % (int(odvf.curr_mclk_range.upper_bound / 1000000)), None) + if odvf.sclk_freq_limits.lower_bound > 0 or odvf.sclk_freq_limits.upper_bound > 0 \ + or odvf.mclk_freq_limits.lower_bound >0 or odvf.mclk_freq_limits.upper_bound > 0: + printLog(device, 'OD_RANGE:', None) + if odvf.sclk_freq_limits.lower_bound > 0 or odvf.sclk_freq_limits.upper_bound > 0: + printLog(device, 'SCLK: %sMhz %sMhz' % ( + int(odvf.sclk_freq_limits.lower_bound / 1000000), int(odvf.sclk_freq_limits.upper_bound / 1000000)), None) + if odvf.mclk_freq_limits.lower_bound >0 or odvf.mclk_freq_limits.upper_bound > 0: + printLog(device, 'MCLK: %sMhz %sMhz' % ( + int(odvf.mclk_freq_limits.lower_bound / 1000000), int(odvf.mclk_freq_limits.upper_bound / 1000000)), None) + printLogSpacer() + + def showProduct(deviceList): """ Show the requested product information for a list of devices @@ -2754,7 +2814,7 @@ def showRange(deviceList, rangeType): :param rangeType: [sclk|voltage] Type of range to return """ global RETCODE - if rangeType not 
in {'sclk', 'mclk'}: + if rangeType not in {'sclk', 'mclk', 'voltage'}: printLog(None, 'Invalid range identifier %s' % (rangeType), None) RETCODE = 1 return @@ -2769,6 +2829,21 @@ def showRange(deviceList, rangeType): if rangeType == 'mclk': printLog(device, 'Valid mclk range: %sMhz - %sMhz' % ( int(odvf.curr_mclk_range.lower_bound / 1000000), int(odvf.curr_mclk_range.upper_bound / 1000000)), None) + if rangeType == 'voltage': + if odvf.num_regions == 0: + printErrLog(device, 'Voltage curve regions unsupported.') + continue + num_regions = c_uint32(odvf.num_regions) + regions = (rsmi_freq_volt_region_t * odvf.num_regions)() + ret = rocmsmi.rsmi_dev_od_volt_curve_regions_get(device, byref(num_regions), byref(regions)) + if rsmi_ret_ok(ret, device, 'volt'): + for i in range(num_regions.value): + printLog(device, + 'Region %d: Valid voltage range: %smV - %smV' % (i, regions[i].volt_range.lower_bound, + regions[i].volt_range.upper_bound), + None) + else: + printLog(device, 'Unable to display %s range' % (rangeType), None) printLogSpacer() @@ -3086,6 +3161,25 @@ def showVoltage(deviceList): printLogSpacer() +def showVoltageCurve(deviceList): + """ Show the voltage curve points for the specified devices + + :param deviceList: List of DRM devices (can be a single-item list) + """ + printLogSpacer(' Voltage Curve Points ') + odvf = rsmi_od_volt_freq_data_t() + for device in deviceList: + ret = rocmsmi.rsmi_dev_od_volt_info_get(device, byref(odvf)) + if rsmi_ret_ok(ret, device, 'get_od_volt_info', silent=False) and odvf.num_regions > 0: + for position in range(3): + printLog(device, 'Voltage point %d: %sMhz %smV' % ( + position, int(list(odvf.curve.vc_points)[position].frequency / 1000000), + int(list(odvf.curve.vc_points)[position].voltage)), None) + else: + printErrLog(device, 'Voltage curve Points unsupported.') + printLogSpacer() + + def showXgmiErr(deviceList): """ Display the XGMI Error status @@ -3739,6 +3833,7 @@ def isConciseInfoRequested(args): 
groupDisplayTop.add_argument('--showproductname', help='Show product details', action='store_true') groupDisplayTop.add_argument('--showserial', help='Show GPU\'s Serial Number', action='store_true') groupDisplayTop.add_argument('--showuniqueid', help='Show GPU\'s Unique ID', action='store_true') + groupDisplayTop.add_argument('--showvoltagerange', help='Show voltage range', action='store_true') groupDisplayTop.add_argument('--showbus', help='Show PCI bus number', action='store_true') groupDisplayPages.add_argument('--showpagesinfo', help='Show retired, pending and unreservable pages', action='store_true') @@ -3763,6 +3858,8 @@ def isConciseInfoRequested(args): groupDisplay.add_argument('-o', '--showoverdrive', help='Show current GPU Clock OverDrive level', action='store_true') groupDisplay.add_argument('-p', '--showperflevel', help='Show current DPM Performance Level', action='store_true') + groupDisplay.add_argument('-S', '--showclkvolt', help='Show supported GPU and Memory Clocks and Voltages', + action='store_true') groupDisplay.add_argument('-s', '--showclkfrq', help='Show supported GPU and Memory Clock', action='store_true') groupDisplay.add_argument('--showmeminfo', help='Show Memory usage information for given block(s) TYPE', metavar='TYPE', type=str, nargs='+') @@ -3774,6 +3871,7 @@ def isConciseInfoRequested(args): groupDisplay.add_argument('--showrasinfo', help='Show RAS enablement information and error counts for the specified block(s) (all if no arg given)', nargs='*') + groupDisplay.add_argument('--showvc', help='Show voltage curve', action='store_true') groupDisplay.add_argument('--showxgmierr', help='Show XGMI error information since last read', action='store_true') groupDisplay.add_argument('--showtopo', help='Show hardware topology information', action='store_true') groupDisplay.add_argument('--showtopoaccess', help='Shows the link accessibility between GPUs ', action='store_true') @@ -3813,6 +3911,8 @@ def isConciseInfoRequested(args): 
groupAction.add_argument('--setmlevel', help='Change GPU Memory clock frequency (MHz) and Voltage for (mV) a specific Level', metavar=('MCLKLEVEL', 'MCLK', 'MVOLT'), nargs=3) + groupAction.add_argument('--setvc', help='Change SCLK Voltage Curve (MHz mV) for a specific point', + metavar=('POINT', 'SCLK', 'SVOLT'), nargs=3) groupAction.add_argument('--setsrange', help='Set min and max SCLK speed', metavar=('SCLKMIN', 'SCLKMAX'), nargs=2) groupAction.add_argument('--setextremum', help='Set min/max of SCLK/MCLK speed', metavar=('min|max', "sclk|mclk", 'CLK'), nargs=3) groupAction.add_argument('--setmrange', help='Set min and max MCLK speed', metavar=('MCLKMIN', 'MCLKMAX'), nargs=2) @@ -3882,7 +3982,7 @@ def isConciseInfoRequested(args): or args.resetclocks or args.setprofile or args.resetprofile or args.setoverdrive or args.setmemoverdrive \ or args.setpoweroverdrive or args.resetpoweroverdrive or args.rasenable or args.rasdisable or \ args.rasinject or args.gpureset or args.setperfdeterminism or args.setslevel or args.setmlevel or \ - args.setsrange or args.setextremum or args.setmrange or args.setclock or \ + args.setvc or args.setsrange or args.setextremum or args.setmrange or args.setclock or \ args.setcomputepartition or args.setmemorypartition or args.resetcomputepartition or args.resetmemorypartition: relaunchAsSudo() @@ -3929,6 +4029,7 @@ def isConciseInfoRequested(args): args.showproductname = True args.showserial = True args.showuniqueid = True + args.showvoltagerange = True args.showbus = True args.showpagesinfo = True args.showfan = True @@ -3946,12 +4047,14 @@ def isConciseInfoRequested(args): args.showpids = "summary" args.showpidgpus = [] args.showreplaycount = True + args.showvc = True args.showcomputepartition = True args.showmemorypartition = True if not PRINT_JSON: args.showprofile = True args.showclkfrq = True + args.showclkvolt = True # Don't do reset in combination with any other command if args.gpureset: @@ -4022,6 +4125,8 @@ def 
isConciseInfoRequested(args): showPids(args.showpids) if args.showpidgpus or str(args.showpidgpus) == '[]': showGpusByPid(args.showpidgpus) + if args.showclkvolt: + showPowerPlayTable(deviceList) if args.showvoltage: showVoltage(deviceList) if args.showbus: @@ -4065,6 +4170,10 @@ def isConciseInfoRequested(args): showRange(deviceList, 'sclk') if args.showmclkrange: showRange(deviceList, 'mclk') + if args.showvoltagerange: + showRange(deviceList, 'voltage') + if args.showvc: + showVoltageCurve(deviceList) if args.showenergycounter: showEnergy(deviceList) if args.showcomputepartition: @@ -4101,6 +4210,8 @@ def isConciseInfoRequested(args): resetPowerOverDrive(deviceList, args.autorespond) if args.setprofile: setProfile(deviceList, args.setprofile) + if args.setvc: + setVoltageCurve(deviceList, args.setvc[0], args.setvc[1], args.setvc[2], args.autorespond) if args.setextremum: setClockExtremum(deviceList, args.setextremum[0], args.setextremum[1], args.setextremum[2], args.autorespond) if args.setsrange: diff --git a/src/rocm_smi.cc b/src/rocm_smi.cc index 1b96059c..f7f9d53e 100755 --- a/src/rocm_smi.cc +++ b/src/rocm_smi.cc @@ -1286,12 +1286,16 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind, return RSMI_STATUS_NOT_YET_IMPLEMENTED; } - // + // Tags expected in this file const std::string kTAG_OD_SCLK{"OD_SCLK:"}; - const std::string kTAG_GFXCLK{"GFXCLK:"}; const std::string KTAG_OD_MCLK{"OD_MCLK:"}; + const std::string kTAG_GFXCLK{"GFXCLK:"}; const std::string KTAG_MCLK{"MCLK:"}; + const std::string KTAG_SCLK{"SCLK:"}; + const std::string KTAG_OD_RANGE{"OD_RANGE:"}; + const std::string KTAG_OD_VDDGFX_OFFSET{"OD_VDDGFX_OFFSET:"}; const std::string KTAG_FIRST_FREQ_IDX{"0:"}; + amd::smi::TextFileTagContents_t txt_power_dev_od_voltage(val_vec); txt_power_dev_od_voltage .set_title_terminator(":", amd::smi::TagSplitterPositional_t::kLAST) @@ -1337,6 +1341,20 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind, p->curr_mclk_range.lower_bound = 
freq_string_to_int(build_lower_bound(KTAG_OD_MCLK), nullptr, nullptr, 0); p->curr_mclk_range.upper_bound = freq_string_to_int(build_upper_bound(KTAG_OD_MCLK), nullptr, nullptr, 0); } + + // Validates 'OD_RANGE' is in the structure + if (txt_power_dev_od_voltage.contains_structured_key(KTAG_OD_RANGE, + KTAG_SCLK)) { + od_value_pair_str_to_range(txt_power_dev_od_voltage + .get_structured_value_by_keys(KTAG_OD_RANGE, KTAG_SCLK), + &p->sclk_freq_limits); + } + if (txt_power_dev_od_voltage.contains_structured_key(KTAG_OD_RANGE, + KTAG_MCLK)) { + od_value_pair_str_to_range(txt_power_dev_od_voltage + .get_structured_value_by_keys(KTAG_OD_RANGE, KTAG_MCLK), + &p->mclk_freq_limits); + } } // Validates 'GFXCLK' is in the structure else if (txt_power_dev_od_voltage.contains_structured_key(kTAG_GFXCLK, @@ -1354,6 +1372,8 @@ static rsmi_status_t get_od_clk_volt_info(uint32_t dv_ind, else { return RSMI_STATUS_NOT_YET_IMPLEMENTED; } + + // Note: No curve entries. p->num_regions = 0; return RSMI_STATUS_SUCCESS; @@ -1562,6 +1582,36 @@ rsmi_status_t rsmi_dev_od_volt_info_set(uint32_t dv_ind, uint32_t vpoint, } +static void get_vc_region(const std::vector& val_vec, rsmi_freq_volt_region_t& p) +{ + std::ostringstream ss; + ss << __PRETTY_FUNCTION__ << " | ======= start ======="; + LOG_TRACE(ss); + + // + amd::smi::TextFileTagContents_t txt_power_dev_od_voltage(val_vec); + txt_power_dev_od_voltage + .set_title_terminator(":", amd::smi::TagSplitterPositional_t::kLAST) + .set_key_data_splitter(":", amd::smi::TagSplitterPositional_t::kBETWEEN) + .structure_content(); + + const std::string KTAG_OD_RANGE{"OD_RANGE:"}; + const std::string KTAG_MCLK{"MCLK:"}; + const std::string KTAG_SCLK{"SCLK:"}; + if (txt_power_dev_od_voltage.contains_structured_key(KTAG_OD_RANGE, + KTAG_SCLK)) { + od_value_pair_str_to_range(txt_power_dev_od_voltage + .get_structured_value_by_keys(KTAG_OD_RANGE, KTAG_SCLK), + &p.freq_range); + } + if (txt_power_dev_od_voltage.contains_structured_key(KTAG_OD_RANGE, + 
KTAG_MCLK)) { + od_value_pair_str_to_range(txt_power_dev_od_voltage + .get_structured_value_by_keys(KTAG_OD_RANGE, KTAG_MCLK), + &p.volt_range); + } +} + /* * num_regions [inout] on calling, the number of regions requested to be read * in. At completion, the number of regions actually read in @@ -1606,7 +1656,11 @@ static rsmi_status_t get_od_clk_volt_curve_regions(uint32_t dv_ind, << " | val_vec_size = " << std::dec << val_vec_size; LOG_DEBUG(ss); + + // Note: No curve entries. *num_regions = 0; + // Get OD ranges. + get_vc_region(val_vec, *p); return RSMI_STATUS_SUCCESS; CATCH diff --git a/src/rocm_smi_utils.cc b/src/rocm_smi_utils.cc index 8f13606e..8437a27f 100755 --- a/src/rocm_smi_utils.cc +++ b/src/rocm_smi_utils.cc @@ -1076,6 +1076,7 @@ static std::string print_pnt(rsmi_od_vddc_point_t *pt) { ss << "\t\t** Voltage: " << pt->voltage << " mV\n"; return ss.str(); } + static std::string pt_vddc_curve(rsmi_od_volt_curve *c) { std::ostringstream ss; if (c == nullptr) { @@ -1098,6 +1099,14 @@ std::string print_rsmi_od_volt_freq_data_t(rsmi_od_volt_freq_data_t *odv) { ss << pt_rng_Mhz("\t**Current SCLK frequency range: ", &odv->curr_sclk_range); ss << pt_rng_Mhz("\t**Current MCLK frequency range: ", &odv->curr_mclk_range); + ss << pt_rng_Mhz("\t**Min/Max Possible SCLK frequency range: ", + &odv->sclk_freq_limits); + ss << pt_rng_Mhz("\t**Min/Max Possible MCLK frequency range: ", + &odv->mclk_freq_limits); + + ss << "\t**Current Freq/Volt. curve: " << "\n"; + ss << "\t\t N/A" << "\n"; + ss << "\t**Number of Freq./Volt. 
regions: " << odv->num_regions << "\n\n"; return ss.str(); } diff --git a/tests/rocm_smi_test/functional/perf_determinism.cc b/tests/rocm_smi_test/functional/perf_determinism.cc index 790fc13f..6f95a43f 100644 --- a/tests/rocm_smi_test/functional/perf_determinism.cc +++ b/tests/rocm_smi_test/functional/perf_determinism.cc @@ -93,9 +93,9 @@ void TestPerfDeterminism::Close() { void TestPerfDeterminism::Run(void) { rsmi_status_t err; rsmi_dev_perf_level_t pfl; - rsmi_od_volt_freq_data_t odv; + rsmi_od_volt_freq_data_t odv{}; rsmi_status_t ret; - uint64_t clkvalue; + uint64_t clkvalue(0); TestBase::Run(); if (setup_failed_) { std::cout << "** SetUp Failed for this test. Skipping.**" << std::endl; diff --git a/tests/rocm_smi_test/functional/volt_freq_curv_read.cc b/tests/rocm_smi_test/functional/volt_freq_curv_read.cc index 360ad7e8..e0e0bf2b 100755 --- a/tests/rocm_smi_test/functional/volt_freq_curv_read.cc +++ b/tests/rocm_smi_test/functional/volt_freq_curv_read.cc @@ -113,22 +113,7 @@ void TestVoltCurvRead::Run(void) { << amd::smi::getRSMIStatusString(ret, false) << "\n"; } CHK_ERR_ASRT(ret) - err = rsmi_dev_od_volt_info_get(i, &odv); - IF_VERB(STANDARD) { - std::cout << "\t**rsmi_dev_od_volt_info_get(i, &odv): " - << amd::smi::getRSMIStatusString(err, false) - << "\n" - << amd::smi::print_rsmi_od_volt_freq_data_t(&odv) - << "\n"; - } - if (err != RSMI_STATUS_SUCCESS) { - IF_VERB(STANDARD) { - std::cout << - "\t**rsmi_dev_od_volt_info_get: Not supported on this machine" - << std::endl; - } - continue; - } + // Verify api support checking functionality is working err = rsmi_dev_od_volt_info_get(i, nullptr); IF_VERB(STANDARD) { @@ -144,5 +129,44 @@ void TestVoltCurvRead::Run(void) { << "\t**odv.num_regions = " << std::dec << odv.num_regions << "\n"; } + if (err == RSMI_STATUS_SUCCESS) { + std::cout << "\t**Frequency-voltage curve data:" << "\n"; + std::cout << amd::smi::print_rsmi_od_volt_freq_data_t(&odv); + + rsmi_freq_volt_region_t *regions{}; + uint32_t 
num_regions; + regions = new rsmi_freq_volt_region_t[odv.num_regions]; + ASSERT_TRUE(regions != nullptr); + + num_regions = odv.num_regions; + err = rsmi_dev_od_volt_curve_regions_get(i, &num_regions, regions); + IF_VERB(STANDARD) { + std::cout << "\t**rsmi_dev_od_volt_curve_regions_get(" + << "i, &num_regions, regions): " + << amd::smi::getRSMIStatusString(err, false) << "\n" + << "\t**Number of regions: " << std::dec << num_regions + << "\n"; + } + ASSERT_TRUE(err == RSMI_STATUS_SUCCESS + || err == RSMI_STATUS_NOT_SUPPORTED + || err == RSMI_STATUS_UNEXPECTED_DATA + || err == RSMI_STATUS_UNEXPECTED_SIZE + || err == RSMI_STATUS_INVALID_ARGS); + if (err != RSMI_STATUS_SUCCESS) { + IF_VERB(STANDARD) { + std::cout << "\t**rsmi_dev_od_volt_curve_regions_get: " + "Not supported on this machine" << std::endl; + } + continue; + } + CHK_ERR_ASRT(err) + ASSERT_TRUE(num_regions == odv.num_regions); + + std::cout << "\t**Frequency-voltage curve regions:" << std::endl; + std::cout << amd::smi::print_rsmi_od_volt_freq_regions(num_regions, + regions); + + delete []regions; + } } } From 9c16cc8baf583e4543fcfb0ecd95904fed811ef8 Mon Sep 17 00:00:00 2001 From: Maisam Arif Date: Tue, 7 May 2024 21:00:50 -0500 Subject: [PATCH 07/10] Bump version lib:7.2.0 tool:2.2.0+hash Signed-off-by: Maisam Arif Change-Id: I07138dad67d796fb8c2dd418a384f663dd8532c0 --- CMakeLists.txt | 2 +- python_smi_tools/README.md | 5 +++-- python_smi_tools/rocm_smi.py | 5 +++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3e6ac696..ad4ea6ea 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,7 +38,7 @@ find_program (GIT NAMES git) ## Setup the package version based on git tags. 
set(PKG_VERSION_GIT_TAG_PREFIX "rsmi_pkg_ver") -get_package_version_number("7.1.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) +get_package_version_number("7.2.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT) message("Package version: ${PKG_VERSION_STR}") set(${ROCM_SMI_LIBS_TARGET}_VERSION_MAJOR "${CPACK_PACKAGE_VERSION_MAJOR}") set(${ROCM_SMI_LIBS_TARGET}_VERSION_MINOR "${CPACK_PACKAGE_VERSION_MINOR}") diff --git a/python_smi_tools/README.md b/python_smi_tools/README.md index 1fa33eba..266b3ad3 100644 --- a/python_smi_tools/README.md +++ b/python_smi_tools/README.md @@ -2,8 +2,9 @@ This tool acts as a command line interface for manipulating and monitoring the amdgpu kernel, and is intended to replace -and deprecate the existing rocm_smi.py CLI tool. -It uses Ctypes to call the rocm_smi_lib API. +and deprecate the existing rocm_smi.py CLI tool located at +https://github.com/ROCm/ROC-smi. +This tool uses Ctypes to call the rocm_smi_lib API. Recommended: At least one AMD GPU with ROCm driver installed Required: ROCm SMI library installed (librocm_smi64) diff --git a/python_smi_tools/rocm_smi.py b/python_smi_tools/rocm_smi.py index 9e35f556..b770716c 100755 --- a/python_smi_tools/rocm_smi.py +++ b/python_smi_tools/rocm_smi.py @@ -3,8 +3,9 @@ This tool acts as a command line interface for manipulating and monitoring the amdgpu kernel, and is intended to replace -and deprecate the existing rocm_smi.py CLI tool. -It uses Ctypes to call the rocm_smi_lib API. +and deprecate the existing rocm_smi.py CLI tool located at +https://github.com/ROCm/ROC-smi. +This tool uses Ctypes to call the rocm_smi_lib API. Recommended: At least one AMD GPU with ROCm driver installed Required: ROCm SMI library installed (librocm_smi64) """ From 8c444164103bec701ff24c231eddc0eb36fdbef6 Mon Sep 17 00:00:00 2001 From: "Bill(Shuzhou) Liu" Date: Wed, 8 May 2024 13:14:39 -0500 Subject: [PATCH 08/10] Discover the amdgpu when card numbers are not consecutive. 
When discovering amdgpu devices, if the assigned card numbers are not consecutive, not all GPUs can be discovered. The code is changed to discover GPUs based on the maximum card number. Change-Id: I8b6a8b49594d6a54c7feb2645bedb83dc5c1b4cc --- src/rocm_smi_main.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/rocm_smi_main.cc b/src/rocm_smi_main.cc index 03c8b613..7d6edea6 100755 --- a/src/rocm_smi_main.cc +++ b/src/rocm_smi_main.cc @@ -713,6 +713,8 @@ static bool isAMDGPU(std::string dev_path) { uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { std::string err_msg; uint32_t count = 0; + int32_t cardId = 0; + int32_t max_cardId = -1; std::ostringstream ss; // If this gets called more than once, clear previous findings. @@ -736,6 +738,9 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { if ((strcmp(dentry->d_name, ".") == 0) || (strcmp(dentry->d_name, "..") == 0)) continue; + sscanf(&dentry->d_name[strlen(kDeviceNamePrefix)], "%d", &cardId); + if (cardId > max_cardId) + max_cardId = cardId; count++; } dentry = readdir(drm_dir); @@ -818,7 +823,7 @@ uint32_t RocmSMI::DiscoverAmdgpuDevices(void) { uint32_t cardAdded = 0; // Discover all root cards & gpu partitions associated with each - for (uint32_t cardId = 0; cardId < count; cardId++) { + for (uint32_t cardId = 0; cardId <= max_cardId; cardId++) { std::string path = kPathDRMRoot; path += "/card"; path += std::to_string(cardId); From 497ef4a7ef090d70d2324c57f607d379af6df6dd Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Tue, 14 May 2024 18:18:00 -0500 Subject: [PATCH 09/10] fix: [SWDEV-461904] [rocm/rocm_smi_lib] Checks returned error by rsmi_dev_od_volt_info_get() before assert Code changes related to the following: * Unit tests Change-Id: Icc0f329e35992aae19f07243024521181467bcd3 Signed-off-by: Oliveira, Daniel --- .../functional/volt_freq_curv_read.cc | 110 ++++++++++-------- 1 file changed, 64 insertions(+), 46 deletions(-) diff --git a/tests/rocm_smi_test/functional/volt_freq_curv_read.cc
b/tests/rocm_smi_test/functional/volt_freq_curv_read.cc index e0e0bf2b..60d068da 100755 --- a/tests/rocm_smi_test/functional/volt_freq_curv_read.cc +++ b/tests/rocm_smi_test/functional/volt_freq_curv_read.cc @@ -106,13 +106,22 @@ void TestVoltCurvRead::Run(void) { << amd::smi::getRSMIStatusString(err, false) << "\n"; } - CHK_ERR_ASRT(err) - ret = rsmi_dev_perf_level_get(i, &pfl); - IF_VERB(STANDARD) { - std::cout << "\t**rsmi_dev_perf_level_get(i, &pfl): " - << amd::smi::getRSMIStatusString(ret, false) << "\n"; + + if (err != rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED) { + ASSERT_EQ(err, rsmi_status_t::RSMI_STATUS_SUCCESS); + ret = rsmi_dev_perf_level_get(i, &pfl); + IF_VERB(STANDARD) { + std::cout << "\t**rsmi_dev_perf_level_get(i, &pfl): " + << amd::smi::getRSMIStatusString(ret, false) << "\n"; + } + ASSERT_EQ(err, rsmi_status_t::RSMI_STATUS_SUCCESS); + } + else { + IF_VERB(STANDARD) { + std::cout << "\t**rsmi_dev_perf_level_get: Not supported on this " + "machine" << std::endl; + } } - CHK_ERR_ASRT(ret) // Verify api support checking functionality is working err = rsmi_dev_od_volt_info_get(i, nullptr); @@ -120,53 +129,62 @@ void TestVoltCurvRead::Run(void) { std::cout << "\t**rsmi_dev_od_volt_info_get(i, nullptr): " << amd::smi::getRSMIStatusString(err, false) << "\n"; } - ASSERT_TRUE(err == RSMI_STATUS_INVALID_ARGS); - err = rsmi_dev_od_volt_info_get(i, &odv); - IF_VERB(STANDARD) { - std::cout << "\t**rsmi_dev_od_volt_info_get(i, &odv): " - << amd::smi::getRSMIStatusString(err, false) << "\n" - << amd::smi::print_rsmi_od_volt_freq_data_t(&odv) - << "\t**odv.num_regions = " << std::dec - << odv.num_regions << "\n"; - } - if (err == RSMI_STATUS_SUCCESS) { - std::cout << "\t**Frequency-voltage curve data:" << "\n"; - std::cout << amd::smi::print_rsmi_od_volt_freq_data_t(&odv); - rsmi_freq_volt_region_t *regions{}; - uint32_t num_regions; - regions = new rsmi_freq_volt_region_t[odv.num_regions]; - ASSERT_TRUE(regions != nullptr); - - num_regions = odv.num_regions; - 
err = rsmi_dev_od_volt_curve_regions_get(i, &num_regions, regions); + if (err != rsmi_status_t::RSMI_STATUS_NOT_SUPPORTED) { + ASSERT_EQ(err, rsmi_status_t::RSMI_STATUS_INVALID_ARGS); + err = rsmi_dev_od_volt_info_get(i, &odv); IF_VERB(STANDARD) { - std::cout << "\t**rsmi_dev_od_volt_curve_regions_get(" - << "i, &num_regions, regions): " - << amd::smi::getRSMIStatusString(err, false) << "\n" - << "\t**Number of regions: " << std::dec << num_regions - << "\n"; + std::cout << "\t**rsmi_dev_od_volt_info_get(i, &odv): " + << amd::smi::getRSMIStatusString(err, false) << "\n" + << amd::smi::print_rsmi_od_volt_freq_data_t(&odv) + << "\t**odv.num_regions = " << std::dec + << odv.num_regions << "\n"; } - ASSERT_TRUE(err == RSMI_STATUS_SUCCESS - || err == RSMI_STATUS_NOT_SUPPORTED - || err == RSMI_STATUS_UNEXPECTED_DATA - || err == RSMI_STATUS_UNEXPECTED_SIZE - || err == RSMI_STATUS_INVALID_ARGS); - if (err != RSMI_STATUS_SUCCESS) { + if (err == rsmi_status_t::RSMI_STATUS_SUCCESS) { + std::cout << "\t**Frequency-voltage curve data:" << "\n"; + std::cout << amd::smi::print_rsmi_od_volt_freq_data_t(&odv); + + rsmi_freq_volt_region_t *regions{}; + uint32_t num_regions; + regions = new rsmi_freq_volt_region_t[odv.num_regions]; + ASSERT_NE(regions, nullptr); + + num_regions = odv.num_regions; + err = rsmi_dev_od_volt_curve_regions_get(i, &num_regions, regions); IF_VERB(STANDARD) { - std::cout << "\t**rsmi_dev_od_volt_curve_regions_get: " - "Not supported on this machine" << std::endl; + std::cout << "\t**rsmi_dev_od_volt_curve_regions_get(" + << "i, &num_regions, regions): " + << amd::smi::getRSMIStatusString(err, false) << "\n" + << "\t**Number of regions: " << std::dec << num_regions + << "\n"; } - continue; - } - CHK_ERR_ASRT(err) - ASSERT_TRUE(num_regions == odv.num_regions); + ASSERT_TRUE(err == RSMI_STATUS_SUCCESS + || err == RSMI_STATUS_NOT_SUPPORTED + || err == RSMI_STATUS_UNEXPECTED_DATA + || err == RSMI_STATUS_UNEXPECTED_SIZE + || err == RSMI_STATUS_INVALID_ARGS); + if 
(err != RSMI_STATUS_SUCCESS) { + IF_VERB(STANDARD) { + std::cout << "\t**rsmi_dev_od_volt_curve_regions_get: " + "Not supported on this machine" << std::endl; + } + continue; + } + ASSERT_EQ(err, rsmi_status_t::RSMI_STATUS_SUCCESS); + ASSERT_EQ(num_regions, odv.num_regions); - std::cout << "\t**Frequency-voltage curve regions:" << std::endl; - std::cout << amd::smi::print_rsmi_od_volt_freq_regions(num_regions, - regions); + std::cout << "\t**Frequency-voltage curve regions:" << std::endl; + std::cout << amd::smi::print_rsmi_od_volt_freq_regions(num_regions, + regions); - delete []regions; + delete []regions; + } + } + else { + IF_VERB(STANDARD) { + std::cout << "\t**rsmi_dev_od_volt_info_get: Not supported on this " + "machine" << std::endl; + } } } } From e7d54946fb6bd578ee94752305f71aaf84555197 Mon Sep 17 00:00:00 2001 From: "Oliveira, Daniel" Date: Tue, 14 May 2024 19:36:52 -0500 Subject: [PATCH 10/10] fix: [MIT-License] [rocm/rocm_smi_lib] Updates the license to MIT Code changes related to the following: None Change-Id: I62d0a5f02a2d5e58c1952337dff54892793c16cf Signed-off-by: Oliveira, Daniel --- License.txt | 44 ++++++++++++++------------------------------ 1 file changed, 14 insertions(+), 30 deletions(-) diff --git a/License.txt b/License.txt index 7d64f365..31f95034 100644 --- a/License.txt +++ b/License.txt @@ -1,38 +1,22 @@ -The University of Illinois/NCSA -Open Source License (NCSA) +MIT License -Copyright (c) 2014-2018, Advanced Micro Devices, Inc. All rights reserved. - -Developed by: - - AMD Research and AMD HSA Software Development - - Advanced Micro Devices, Inc. - - www.amd.com +Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. 
Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to -deal with the Software without restriction, including without limitation -the rights to use, copy, modify, merge, publish, distribute, sublicense, -and/or sell copies of the Software, and to permit persons to whom the -Software is furnished to do so, subject to the following conditions: +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: - - Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimers. - - Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimers in - the documentation and/or other materials provided with the distribution. - - Neither the names of Advanced Micro Devices, Inc, - nor the names of its contributors may be used to endorse or promote - products derived from this Software without specific prior written - permission. +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR -OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS WITH THE SOFTWARE. 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE.