diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 642bd09a6..7174044dc 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -48,6 +48,11 @@ set ( CORE_RUNTIME_TARGET "${CORE_RUNTIME_NAME}64" ) set ( CORE_RUNTIME_LIBRARY "lib${CORE_RUNTIME_TARGET}" ) project( ${CORE_RUNTIME_TARGET} ) +if ( NOT DEFINED BUILD_SHARED_LIBS ) + set ( BUILD_SHARED_LIBS "on" ) +endif() +set ( BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS} CACHE BOOL "Build shared library (.so) or not.") + # Optionally, build HSA Runtime with ccache. set(ROCM_CCACHE_BUILD OFF CACHE BOOL "Set to ON for a ccache enabled build") if (ROCM_CCACHE_BUILD) @@ -137,6 +142,7 @@ set ( SRCS "core/util/lnx/os_linux.cpp" "core/runtime/amd_loader_context.cpp" "core/runtime/hsa_ven_amd_loader.cpp" "core/runtime/amd_memory_region.cpp" + "core/runtime/amd_filter_device.cpp" "core/runtime/amd_topology.cpp" "core/runtime/default_signal.cpp" "core/runtime/host_queue.cpp" @@ -163,7 +169,7 @@ set ( SRCS "core/util/lnx/os_linux.cpp" "libamdhsacode/amd_hsa_code.cpp" ) -add_library( ${CORE_RUNTIME_TARGET} SHARED ${SRCS} ) +add_library( ${CORE_RUNTIME_TARGET} ${SRCS} ) target_link_libraries ( ${CORE_RUNTIME_TARGET} PRIVATE hsakmt @@ -181,19 +187,29 @@ set_property ( TARGET ${CORE_RUNTIME_TARGET} PROPERTY SOVERSION "${SO_MAJOR}" ) ## Create symlinks for legacy packaging and install add_custom_target ( hsa_include_link ALL WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMAND ${CMAKE_COMMAND} -E create_symlink ../hsa/include/hsa hsa_include_link ) -add_custom_target ( hsa_lib_link ALL WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMAND ${CMAKE_COMMAND} -E create_symlink ../hsa/lib/${CORE_RUNTIME_LIBRARY}.so ${CORE_RUNTIME_LIBRARY}-link.so ) -add_custom_target ( hsa_lib_link2 ALL WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMAND ${CMAKE_COMMAND} -E create_symlink ../hsa/lib/${CORE_RUNTIME_LIBRARY}.so.${SO_MAJOR} ${CORE_RUNTIME_LIBRARY}-link.so.${SO_MAJOR} ) +if ( ${BUILD_SHARED_LIBS} ) + add_custom_target ( hsa_lib_link 
ALL WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMAND ${CMAKE_COMMAND} -E create_symlink ../hsa/lib/${CORE_RUNTIME_LIBRARY}.so ${CORE_RUNTIME_LIBRARY}-link.so ) + add_custom_target ( hsa_lib_link2 ALL WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMAND ${CMAKE_COMMAND} -E create_symlink ../hsa/lib/${CORE_RUNTIME_LIBRARY}.so.${SO_MAJOR} ${CORE_RUNTIME_LIBRARY}-link.so.${SO_MAJOR} ) +endif() ## Set install information -install ( TARGETS ${CORE_RUNTIME_TARGET} LIBRARY DESTINATION hsa/lib ) -install ( DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/inc/ DESTINATION hsa/include/hsa ) -install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/hsa_include_link DESTINATION include PERMISSIONS OWNER_WRITE OWNER_READ RENAME hsa ) -install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/${CORE_RUNTIME_LIBRARY}-link.so DESTINATION lib PERMISSIONS OWNER_WRITE OWNER_READ RENAME ${CORE_RUNTIME_LIBRARY}.so ) -install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/${CORE_RUNTIME_LIBRARY}-link.so.${SO_MAJOR} DESTINATION lib PERMISSIONS OWNER_WRITE OWNER_READ RENAME ${CORE_RUNTIME_LIBRARY}.so.${SO_MAJOR} ) +install ( TARGETS ${CORE_RUNTIME_TARGET} DESTINATION hsa/lib COMPONENT binary) +install ( DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/inc/ DESTINATION hsa/include/hsa COMPONENT binary) +install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/hsa_include_link DESTINATION include PERMISSIONS OWNER_WRITE OWNER_READ RENAME hsa COMPONENT dirlink) + +if ( ${BUILD_SHARED_LIBS} ) + install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/${CORE_RUNTIME_LIBRARY}-link.so DESTINATION lib PERMISSIONS OWNER_WRITE OWNER_READ RENAME ${CORE_RUNTIME_LIBRARY}.so COMPONENT binary) + install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/${CORE_RUNTIME_LIBRARY}-link.so.${SO_MAJOR} DESTINATION lib PERMISSIONS OWNER_WRITE OWNER_READ RENAME ${CORE_RUNTIME_LIBRARY}.so.${SO_MAJOR} COMPONENT binary) +endif () ## Packaging directives set ( CPACK_GENERATOR "DEB;RPM" CACHE STRING "Package types to build") +## Only pack the "binary" components, post install script will add the directory 
link. +set (CPACK_DEB_COMPONENT_INSTALL ON) +set (CPACK_COMPONENTS_ALL_IN_ONE_PACKAGE 1) +set (CPACK_COMPONENTS_ALL binary) + set ( CPACK_PACKAGE_NAME "hsa-rocr-dev" ) set ( CPACK_PACKAGE_VENDOR "AMD" ) set ( CPACK_PACKAGE_VERSION ${PACKAGE_VERSION_STRING} ) @@ -201,6 +217,10 @@ set ( CPACK_PACKAGE_CONTACT "Advanced Micro Devices Inc." ) set ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "AMD Heterogeneous System Architecture HSA - Linux HSA Runtime for Boltzmann (ROCm) platforms" ) set ( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md" ) +## Process the install scripts to update the CPACK variables +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/post_install ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst @ONLY) +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/pre_remove ${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm @ONLY) + # Debian package specific variables set ( CPACK_DEBIAN_PACKAGE_DEPENDS "hsakmt-roct" ) set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/ROCR-Runtime" ) diff --git a/src/DEBIAN/postinst b/src/DEBIAN/post_install similarity index 52% rename from src/DEBIAN/postinst rename to src/DEBIAN/post_install index 769a72e46..295922dcf 100644 --- a/src/DEBIAN/postinst +++ b/src/DEBIAN/post_install @@ -3,12 +3,14 @@ set -e do_ldconfig() { - echo /opt/rocm/hsa/lib > /etc/ld.so.conf.d/hsa-rocr-dev.conf && ldconfig + echo @CPACK_PACKAGING_INSTALL_PREFIX@/hsa/lib > /etc/ld.so.conf.d/hsa-rocr-dev.conf && ldconfig } case "$1" in configure) do_ldconfig + # The include link is not shipped in the package, so create it here. Make sure its parent exists; -n replaces an existing link instead of creating a new one inside the directory it points to + mkdir -p @CPACK_PACKAGING_INSTALL_PREFIX@/include && ln -sfn ../hsa/include/hsa @CPACK_PACKAGING_INSTALL_PREFIX@/include/hsa ;; abort-upgrade|abort-remove|abort-deconfigure) echo "$1" diff --git a/src/DEBIAN/prerm b/src/DEBIAN/pre_remove similarity index 76% rename from src/DEBIAN/prerm rename to src/DEBIAN/pre_remove index 2b7d50a82..2dd27fdc3 100644 --- a/src/DEBIAN/prerm +++ b/src/DEBIAN/pre_remove @@ -9,6 +9,7 @@ rm_ldconfig() { case "$1" in remove) rm_ldconfig + rm -rf @CPACK_PACKAGING_INSTALL_PREFIX@/include/hsa ;; purge) ;; diff 
--git a/src/cmake_modules/utils.cmake b/src/cmake_modules/utils.cmake index 77e441131..415e2d757 100644 --- a/src/cmake_modules/utils.cmake +++ b/src/cmake_modules/utils.cmake @@ -205,3 +205,15 @@ function ( get_version DEFAULT_VERSION_STRING ) #message("${VERSION_JOB}") endfunction() + +## Collects the names of the immediate subdirectories of DIRPATH and returns them, relative to DIRPATH, as a list in the caller's variable named by SUBDIRECTORIES +function ( listsubdirs DIRPATH SUBDIRECTORIES ) + file( GLOB CONTENTS RELATIVE "${DIRPATH}" "${DIRPATH}/*" ) + set ( FOLDERS "" ) + foreach( ITEM IN LISTS CONTENTS) + if( IS_DIRECTORY "${DIRPATH}/${ITEM}" ) + list( APPEND FOLDERS "${ITEM}" ) + endif() + endforeach() + set (${SUBDIRECTORIES} ${FOLDERS} PARENT_SCOPE) +endfunction() diff --git a/src/core/inc/amd_filter_device.h b/src/core/inc/amd_filter_device.h new file mode 100644 index 000000000..f7205a0fc --- /dev/null +++ b/src/core/inc/amd_filter_device.h @@ -0,0 +1,213 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. +// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. 
+// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. +// +//////////////////////////////////////////////////////////////////////////////// + +#ifndef HSA_RUNTIME_CORE_INC_AMD_FILTER_DEVICE_H_ +#define HSA_RUNTIME_CORE_INC_AMD_FILTER_DEVICE_H_ + +#include +#include +#include +#include +#include +#include + +#include "hsakmt.h" + +namespace amd { + +// ROCr allows users to filter and reorder various Gpu devices that are +// present on ROCm system. This ability is made available via environment +// variable ROCR_VISIBLE_DEVICES (RVD). Users are allowed to specify a list +// of Gpu Identifiers separated by comma delimiter as the value of this env +// variable. +// +// On a ROCm platform instance, a Gpu device could be identified by its: +// +// Index - Position at which ROCr reports it upon device enumeration +// UUID - A string that is unique and is immutable i.e. tags Gpu +// instance across systems and power cycles. UUID values +// are defined to begin with "GPU-" prefix +// +// @note: Not all Gpu devices will report valid UUID's. 
For example, +// Only devices from Gfx9 and later will encode valid UUID's. To account +// for this and other reasons, the UUID string "GPU-XX" is defined as +// indicating those devices. Users can still select those Gpu devices +// by using their enumeration index +// +// Users are allowed to select a device by specifying its UUID string in +// full or part. A UUID string that does not uniquely match an agent's +// valid UUID prefix is interpreted as terminating. The UUID string +// "GPU-XX" will not match and therefore will terminate +// +// RVD interpreter treats an empty token list as filtering all devices. +// Users can use this mode to report ZERO Gpu devices +// +// RVD interpreter treats a token as Illegal if it can't be evaluated into an +// instance of Device UUID or Enumeration Index +// +// RVD interpreter treats a Legal instance of Enumeration Index as Terminating +// if any ONE of the following conditions apply: +// Value of index lies outside the interval [0 - (numGpuDevices - 1)] +// Value of index maps to a device that has been previously selected +// +// RVD interpreter treats a Legal instance of Device UUID as Terminating +// if any ONE of the following conditions apply: +// Value of UUID is the literal "GPU-XX" +// Value of UUID matches ZERO devices on system +// Value of UUID matches TWO or more devices on system +// Value of UUID maps to a device that has been previously selected +// +// RVD interpreter builds the list of Gpu devices to surface using tokens +// that are Legal and NOT Terminating +// +// Following are some examples of RVD value strings and their interpretation +// on a ROCm system with four Gpu devices. 
Assume for now the UUID's of the +// four Gpu devices are: +// Gpu-0: "GPU-BABABABABABABABA" +// Gpu-1: "GPU-ABBAABBAABBAABBA" +// Gpu-2: "GPU-BABAABBAABBABABA" +// Gpu-3: "GPU-ABBABABABABAABBA" +// +// Surface ZERO devices +// A1) ROCR_VISIBLE_DEVICES="" +// A2) ROCR_VISIBLE_DEVICES="-1" +// A3) ROCR_VISIBLE_DEVICES="GPU-XX" +// +// Surface Gpu-3 and Gpu-0 devices in that order +// B) ROCR_VISIBLE_DEVICES="3,GPU-BABABABABABABABA,4" +// +// Surface Gpu-1 and Gpu-2 devices in that order +// C) ROCR_VISIBLE_DEVICES="1,GPU-BABAABBAABBABABA,GPU-XX" +// +// Surface Gpu-3 and Gpu-2 devices in that order +// D) ROCR_VISIBLE_DEVICES="3,GPU-BABAABBA,GPU-XX" +// +class RvdFilter { + public: + /// @brief Constructor + RvdFilter() {} + + /// @brief Destructor + ~RvdFilter() {} + + /// @brief Determine if user has specified environment variable + /// ROCR_VISIBLE_DEVICES (RVD) to filter and reorder Gpu devices + /// + /// @return TRUE if user has defined the env RVD + static bool FilterDevices(); + + /// @brief Determine if user has specified environment variable + /// ROCR_VISIBLE_DEVICES (RVD) to filter out all Gpu devices i.e. + /// surface ZERO devices + /// + /// @return TRUE if user has specified ZERO to be surfaced + bool SelectZeroDevices(); + + /// @brief Builds the list of tokens specified by user to filter + /// and reorder Gpu devices. A token represents either a Gpu's + /// enumeration index or its UUID value. It is possible for the + /// list to have no tokens i.e. 
user has selected zero devices + void BuildRvdTokenList(); + + /// @brief Build the list of Gpu device UUIDs as enumerated by ROCt + /// + /// @param numNodes Number of ROCm devices present on system, includes + /// both Cpu and Gpu devices + void BuildDeviceUuidList(uint32_t numNodes); + + /// @brief Build the list of Gpu devices that will be enumerated to user + /// + /// @return Number of Gpu devices to surface upon device enumeration + uint32_t BuildUsrDeviceList(); + + /// @brief Processes UUID token and returns its enumeration index + /// + /// @param token RVD token encoding a device's UUID value + /// @return Index of the one device the token matches, -1 otherwise + int32_t ProcessUuidToken(const std::string& token); + + /// @brief Get the number of Gpu devices that will be surfaced + /// upon device enumeration + /// + /// @return Number of devices to enumerate including possibly + /// ZERO devices + uint32_t GetUsrDeviceListSize(); + + /// @brief Return the rank of queried Gpu device i.e. the position + /// at which it is surfaced upon device enumeration. The device is + /// queried by its index in the list of ROCt enumerated Gpu devices + /// + /// @return -1 if queried device is not surfaced, else a value in + /// the range [0 - (numGpus - 1)] + int32_t GetUsrDeviceRank(uint32_t roctIdx); + +#ifndef NDEBUG + /// @brief Set debug UUID values to Gpu devices. 
This is intended to + /// help debug and test RVD module functionality + void SetDeviceUuidList(); + + /// @brief Print the list of Uuids of Gpu devices present on system + void PrintDeviceUuidList(); + + /// @brief Print the list of Gpu devices per their enumeration order + void PrintUsrDeviceList(); + + /// @brief Print the list of tokens specified by user to filter + /// and reorder Gpu devices + void PrintRvdTokenList(); +#endif + + private: + /// @brief List of tokens specified by user to select and reorder Gpu devices + std::vector rvdTokenList_; + + /// @brief Ordered list of ROCt enumerated Gpu device's UUID values + std::vector devUuidList_; + + /// @brief Maps index of a surfaced Gpu device in devUuidList_ to its rank + std::map usrDeviceList_; + +}; // End of class RvdFilter + +} // namespace amd + +#endif // header guard - HSA_RUNTIME_CORE_INC_AMD_FILTER_DEVICE_H_ diff --git a/src/core/inc/amd_gpu_shaders.h b/src/core/inc/amd_gpu_shaders.h index c8c4243ba..bd0826fb6 100644 --- a/src/core/inc/amd_gpu_shaders.h +++ b/src/core/inc/amd_gpu_shaders.h @@ -125,9 +125,12 @@ static const unsigned int kCodeTrapHandler9[] = { .set MAX_NUM_DOORBELLS_MASK , ((1 << 10) - 1) .set SENDMSG_M0_DOORBELL_ID_BITS , 12 .set SENDMSG_M0_DOORBELL_ID_MASK , ((1 << SENDMSG_M0_DOORBELL_ID_BITS) - 1) - .set TTMP11_DEBUG_TRAP_BIT , 7 + .set TTMP11_TRAP_RAISED_BIT , 7 + .set TTMP11_EXCP_RAISED_BIT , 8 + .set TTMP11_EVENTS_MASK , (1 << TTMP11_TRAP_RAISED_BIT) | (1 << TTMP11_EXCP_RAISED_BIT) .set DEBUG_INTERRUPT_CONTEXT_ID_BIT , 23 .set INSN_S_ENDPGM_OPCODE , 0xBF810000 + .set INSN_S_ENDPGM_MASK , 0xFFFF0000 .if .amdgcn.gfx_generation_number == 9 .set TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT , 26 @@ -151,9 +154,9 @@ static const unsigned int kCodeTrapHandler9[] = { // ttmp14 = TMA[31:0] // ttmp15 = TMA[63:32] // gfx9: - // ttmp11 = SQ_WAVE_IB_STS[20:15], 0[17:0], DebugTrap[0], NoScratch[0], WaveIdInWG[5:0] + // ttmp11 = SQ_WAVE_IB_STS[20:15], 0[16:0], TrapRaised[0], ExcpRaised[0], NoScratch[0], 
WaveIdInWG[5:0] // gfx10: - 
// ttmp11 = SQ_WAVE_IB_STS[25], SQ_WAVE_IB_STS[21:15], 0[15:0], DebugTrap[0], NoScratch[0], WaveIdInWG[5:0] + // ttmp11 = SQ_WAVE_IB_STS[25], SQ_WAVE_IB_STS[21:15], 0[14:0], TrapRaised[0], ExcpRaised[0], NoScratch[0], WaveIdInWG[5:0] .macro mGetDoorbellId s_mov_b32 exec_lo, 0x80000000 @@ -200,9 +203,12 @@ static const unsigned int kCodeTrapHandler9[] = { s_cbranch_scc1 .excp_raised // Otherwise trap entered due to single step exception. - .signal_debugger: - s_bitset1_b32 ttmp11, TTMP11_DEBUG_TRAP_BIT + s_branch .signal_debugger + + .signal_trap_debugger: + s_bitset1_b32 ttmp11, TTMP11_TRAP_RAISED_BIT + .signal_debugger: // Fetch doorbell index for our queue. s_mov_b32 ttmp2, exec_lo s_mov_b32 ttmp3, exec_hi @@ -215,7 +221,7 @@ static const unsigned int kCodeTrapHandler9[] = { s_mov_b32 exec_lo, ttmp2 // Set the debug interrupt context id. - // FIXME: Make conditional on TTMP11_DEBUG_TRAP_BIT when exceptions are handled. + // FIXME: Make conditional when exceptions are handled. s_bitset1_b32 ttmp3, DEBUG_INTERRUPT_CONTEXT_ID_BIT // Send an interrupt to trigger event notification. @@ -231,6 +237,7 @@ static const unsigned int kCodeTrapHandler9[] = { s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK s_load_dword ttmp2, [ttmp0, ttmp1] s_waitcnt lgkmcnt(0) + s_and_b32 ttmp2, ttmp2, INSN_S_ENDPGM_MASK s_cmp_eq_u32 ttmp2, INSN_S_ENDPGM_OPCODE s_cbranch_scc1 .skip_halt s_or_b32 ttmp12, ttmp12, SQ_WAVE_STATUS_HALT_MASK @@ -239,6 +246,8 @@ static const unsigned int kCodeTrapHandler9[] = { mExitTrap .excp_raised: + s_bitset1_b32 ttmp11, TTMP11_EXCP_RAISED_BIT + // If memory violation without XNACK error then signal queue error. // XNACK error will be handled by VM interrupt, since it has more information. 
s_and_b32 ttmp3, ttmp2, (SQ_WAVE_TRAPSTS_MEM_VIOL_MASK | SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK) @@ -251,24 +260,31 @@ static const unsigned int kCodeTrapHandler9[] = { s_mov_b32 ttmp3, SIGNAL_CODE_ILLEGAL_INST s_cbranch_scc1 .signal_error - // Otherwise (memory violation with XNACK error) return to shader. - s_branch .exit_trap + // Otherwise (memory violation with XNACK error) return to shader. Do not + // send a signal as that will cause an interrupt storm. Instead let the + // interrupt generated by the TLB miss cause the kernel to notify ROCr and + // put the queue into an error state. This also ensures the TLB interrupt + // is received which provides information about the page causing the fault. + s_branch .halt_wave .trap_raised: // If debugger trap (s_trap >= 3) then signal debugger. s_cmp_ge_u32 ttmp2, 0x3; - s_cbranch_scc1 .signal_debugger + s_cbranch_scc1 .signal_trap_debugger // If llvm.trap (s_trap 2) then signal queue error. s_cmp_eq_u32 ttmp2, 0x2 s_mov_b32 ttmp3, SIGNAL_CODE_LLVM_TRAP - s_cbranch_scc1 .signal_error + s_cbranch_scc1 .signal_trap_error // For other traps advance PC and return to shader. s_add_u32 ttmp0, ttmp0, 0x4 s_addc_u32 ttmp1, ttmp1, 0x0 s_branch .exit_trap + .signal_trap_error: + s_bitset1_b32 ttmp11, TTMP11_TRAP_RAISED_BIT + .signal_error: // FIXME: don't trash ttmp4/ttmp5 when exception handling is unified. 
s_mov_b32 ttmp4, ttmp3 @@ -321,26 +337,26 @@ static const unsigned int kCodeTrapHandler9[] = { .exit_trap: mExitTrap */ - - 0x92eeff6d, 0x00080010, 0xbf850036, 0xb8eef803, 0x866fff6e, 0x00000900, - 0xbf850026, 0xbef71a87, 0xbeee007e, 0xbeef007f, 0xbefe00ff, 0x80000000, - 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, 0x867eff7e, - 0x00000fff, 0xbeef007e, 0xbefe006e, 0xbeef1a97, 0xbeee007c, 0xbefc006f, - 0xbf800000, 0xbf900001, 0xbefc006e, 0x866dff6d, 0x0000ffff, 0xc0021bb6, - 0x00000000, 0xbf8cc07f, 0xbf06ff6e, 0xbf810000, 0xbf850002, 0x8778ff78, + 0x92eeff6d, 0x00080010, 0xbf85003a, 0xb8eef803, 0x866fff6e, 0x00000900, + 0xbf850029, 0xbf820001, 0xbef71a87, 0xbeee007e, 0xbeef007f, 0xbefe00ff, + 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, + 0x867eff7e, 0x00000fff, 0xbeef007e, 0xbefe006e, 0xbeef1a97, 0xbeee007c, + 0xbefc006f, 0xbf800000, 0xbf900001, 0xbefc006e, 0x866dff6d, 0x0000ffff, + 0xc0021bb6, 0x00000000, 0xbf8cc07f, 0x866eff6e, 0xffff0000, 0xbf06ff6e, + 0xbf810000, 0xbf850002, 0x8778ff78, 0x00002000, 0x8f6e8b77, 0x866eff6e, + 0x001f8000, 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c, + 0xbef71a88, 0x866fff6e, 0x10000100, 0xbf06ff6f, 0x00000100, 0xbeef00ff, + 0x20000000, 0xbf85000f, 0x866fff6e, 0x00000800, 0xbeef00f4, 0xbf85000b, + 0xbf82002e, 0xbf09836e, 0xbf85ffc9, 0xbf06826e, 0xbeef00ff, 0x80000000, + 0xbf850003, 0x806c846c, 0x826d806d, 0xbf820027, 0xbef71a87, 0xbef0006f, + 0xbefe00ff, 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, + 0x866eff7e, 0x000003ff, 0x8e6e836e, 0xc0051bbd, 0x0000006e, 0xbf8cc07f, + 0xc0071bb7, 0x000000c0, 0xbf8cc07f, 0xbef10080, 0xc2831c37, 0x00000008, + 0xbf8cc07f, 0x87707170, 0xbf85000e, 0xc0071c37, 0x00000010, 0xbf8cc07f, + 0x86f07070, 0xbf840009, 0xc0031bb7, 0x00000018, 0xbf8cc07f, 0xc0431bb8, + 0x00000000, 0xbf8cc07f, 0xbefc0080, 0xbf800000, 0xbf900001, 0x8778ff78, 0x00002000, 0x8f6e8b77, 0x866eff6e, 0x001f8000, 0xb96ef807, 0x86fe7e7e, - 0x86ea6a6a, 0xb978f802, 
0xbe801f6c, 0x866fff6e, 0x10000100, 0xbf06ff6f, - 0x00000100, 0xbeef00ff, 0x20000000, 0xbf85000e, 0x866fff6e, 0x00000800, - 0xbeef00f4, 0xbf85000a, 0xbf82002f, 0xbf09836e, 0xbf85ffcc, 0xbf06826e, - 0xbeef00ff, 0x80000000, 0xbf850003, 0x806c846c, 0x826d806d, 0xbf820026, - 0xbef0006f, 0xbefe00ff, 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, - 0xbf84fffd, 0x866eff7e, 0x000003ff, 0x8e6e836e, 0xc0051bbd, 0x0000006e, - 0xbf8cc07f, 0xc0071bb7, 0x000000c0, 0xbf8cc07f, 0xbef10080, 0xc2831c37, - 0x00000008, 0xbf8cc07f, 0x87707170, 0xbf85000e, 0xc0071c37, 0x00000010, - 0xbf8cc07f, 0x86f07070, 0xbf840009, 0xc0031bb7, 0x00000018, 0xbf8cc07f, - 0xc0431bb8, 0x00000000, 0xbf8cc07f, 0xbefc0080, 0xbf800000, 0xbf900001, - 0x8778ff78, 0x00002000, 0x8f6e8b77, 0x866eff6e, 0x001f8000, 0xb96ef807, - 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c, + 0x86ea6a6a, 0xb978f802, 0xbe801f6c, }; static const unsigned int kCodeCopyAligned8[] = { @@ -456,26 +472,27 @@ static const unsigned int kCodeFill10[] = { }; static const unsigned int kCodeTrapHandler10[] = { - 0x93eeff6d, 0x00080010, 0xbf85003a, 0xb96ef803, 0x876fff6e, 0x00000900, - 0xbf85002a, 0xbef71d87, 0xbeee037e, 0xbeef037f, 0xbefe03ff, 0x80000000, - 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff036f, 0x877eff7e, - 0x00000fff, 0xbeef037e, 0xbefe036e, 0xbeef1d97, 0xbeee037c, 0xbefc036f, - 0xbf800000, 0xbf900001, 0xbefc036e, 0x876dff6d, 0x0000ffff, 0xf4001bb6, - 0xfa000000, 0xbf8cc07f, 0xbf06ff6e, 0xbf810000, 0xbf850002, 0x8878ff78, - 0x00002000, 0x906e8977, 0x876fff6e, 0x003f8000, 0x906e8677, 0x876eff6e, - 0x02000000, 0x886e6f6e, 0xb9eef807, 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, - 0xbe80226c, 0x876fff6e, 0x10000100, 0xbf06ff6f, 0x00000100, 0xbeef03ff, - 0x20000000, 0xbf85000e, 0x876fff6e, 0x00000800, 0xbeef03f4, 0xbf85000a, - 0xbf82002f, 0xbf09836e, 0xbf85ffc8, 0xbf06826e, 0xbeef03ff, 0x80000000, - 0xbf850003, 0x806c846c, 0x826d806d, 0xbf820026, 0xbef0036f, 0xbefe03ff, - 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 
0x876eff7e, - 0x000003ff, 0x8f6e836e, 0xf4051bbd, 0xdc000000, 0xbf8cc07f, 0xf4051bb7, - 0xfa0000c0, 0xbf8cc07f, 0xbef10380, 0xf6811c37, 0xfa000008, 0xbf8cc07f, - 0x88707170, 0xbf85000e, 0xf4051c37, 0xfa000010, 0xbf8cc07f, 0x87f07070, - 0xbf840009, 0xf4011bb7, 0xfa000018, 0xbf8cc07f, 0xf4411bb8, 0xfa000000, - 0xbf8cc07f, 0xbefc0380, 0xbf800000, 0xbf900001, 0x8878ff78, 0x00002000, - 0x906e8977, 0x876fff6e, 0x003f8000, 0x906e8677, 0x876eff6e, 0x02000000, - 0x886e6f6e, 0xb9eef807, 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, 0xbe80226c, + 0x93eeff6d, 0x00080010, 0xbf85003e, 0xb96ef803, 0x876fff6e, 0x00000900, + 0xbf85002d, 0xbf820001, 0xbef71d87, 0xbeee037e, 0xbeef037f, 0xbefe03ff, + 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff036f, + 0x877eff7e, 0x00000fff, 0xbeef037e, 0xbefe036e, 0xbeef1d97, 0xbeee037c, + 0xbefc036f, 0xbf800000, 0xbf900001, 0xbefc036e, 0x876dff6d, 0x0000ffff, + 0xf4001bb6, 0xfa000000, 0xbf8cc07f, 0x876eff6e, 0xffff0000, 0xbf06ff6e, + 0xbf810000, 0xbf850002, 0x8878ff78, 0x00002000, 0x906e8977, 0x876fff6e, + 0x003f8000, 0x906e8677, 0x876eff6e, 0x02000000, 0x886e6f6e, 0xb9eef807, + 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, 0xbe80226c, 0xbef71d88, 0x876fff6e, + 0x10000100, 0xbf06ff6f, 0x00000100, 0xbeef03ff, 0x20000000, 0xbf85000f, + 0x876fff6e, 0x00000800, 0xbeef03f4, 0xbf85000b, 0xbf82002e, 0xbf09836e, + 0xbf85ffc5, 0xbf06826e, 0xbeef03ff, 0x80000000, 0xbf850003, 0x806c846c, + 0x826d806d, 0xbf820027, 0xbef71d87, 0xbef0036f, 0xbefe03ff, 0x80000000, + 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0x876eff7e, 0x000003ff, + 0x8f6e836e, 0xf4051bbd, 0xdc000000, 0xbf8cc07f, 0xf4051bb7, 0xfa0000c0, + 0xbf8cc07f, 0xbef10380, 0xf6811c37, 0xfa000008, 0xbf8cc07f, 0x88707170, + 0xbf85000e, 0xf4051c37, 0xfa000010, 0xbf8cc07f, 0x87f07070, 0xbf840009, + 0xf4011bb7, 0xfa000018, 0xbf8cc07f, 0xf4411bb8, 0xfa000000, 0xbf8cc07f, + 0xbefc0380, 0xbf800000, 0xbf900001, 0x8878ff78, 0x00002000, 0x906e8977, + 0x876fff6e, 0x003f8000, 0x906e8677, 0x876eff6e, 0x02000000, 
0x886e6f6e, + 0xb9eef807, 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, 0xbe80226c, }; } // namespace amd diff --git a/src/core/runtime/amd_aql_queue.cpp b/src/core/runtime/amd_aql_queue.cpp index 4ce4f9093..6ee90cbb2 100644 --- a/src/core/runtime/amd_aql_queue.cpp +++ b/src/core/runtime/amd_aql_queue.cpp @@ -309,7 +309,7 @@ AqlQueue::~AqlQueue() { } void AqlQueue::Destroy() { - if (amd_queue_.hsa_queue.type & HSA_QUEUE_TYPE_COOPERATIVE) { + if (amd_queue_.hsa_queue.type == HSA_QUEUE_TYPE_COOPERATIVE) { agent_->GWSRelease(); return; } diff --git a/src/core/runtime/amd_cpu_agent.cpp b/src/core/runtime/amd_cpu_agent.cpp index c5b33e4f2..b355ff01e 100644 --- a/src/core/runtime/amd_cpu_agent.cpp +++ b/src/core/runtime/amd_cpu_agent.cpp @@ -357,6 +357,12 @@ hsa_status_t CpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { case HSA_AMD_AGENT_INFO_DOMAIN: *((uint32_t*)value) = static_cast(properties_.Domain); break; + case HSA_AMD_AGENT_INFO_UUID: { + // At this point CPU devices do not support UUID's. + char uuid_tmp[] = "CPU-XX"; + snprintf((char*)value, sizeof(uuid_tmp), "%s", uuid_tmp); + break; + } default: return HSA_STATUS_ERROR_INVALID_ARGUMENT; break; diff --git a/src/core/runtime/amd_filter_device.cpp b/src/core/runtime/amd_filter_device.cpp new file mode 100644 index 000000000..04087fb11 --- /dev/null +++ b/src/core/runtime/amd_filter_device.cpp @@ -0,0 +1,272 @@ +//////////////////////////////////////////////////////////////////////////////// +// +// The University of Illinois/NCSA +// Open Source License (NCSA) +// +// Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. +// +// Developed by: +// +// AMD Research and AMD HSA Software Development +// +// Advanced Micro Devices, Inc. 
+// +// www.amd.com +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal with the Software without restriction, including without limitation +// the rights to use, copy, modify, merge, publish, distribute, sublicense, +// and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// - Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimers. +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimers in +// the documentation and/or other materials provided with the distribution. +// - Neither the names of Advanced Micro Devices, Inc, +// nor the names of its contributors may be used to endorse or promote +// products derived from this Software without specific prior written +// permission. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +// THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS WITH THE SOFTWARE. 
+// +//////////////////////////////////////////////////////////////////////////////// + +#include "core/inc/amd_filter_device.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hsakmt.h" + +#include "core/util/utils.h" +#include "core/inc/runtime.h" +#include "core/inc/amd_cpu_agent.h" +#include "core/inc/amd_gpu_agent.h" +#include "core/inc/amd_memory_region.h" + +namespace amd { + +bool RvdFilter::FilterDevices() { + return core::Runtime::runtime_singleton_->flag().filter_visible_gpus(); +} + +bool RvdFilter::SelectZeroDevices() { + const std::string& envVal = core::Runtime::runtime_singleton_->flag().visible_gpus(); + return envVal.empty(); +} + +void RvdFilter::BuildRvdTokenList() { + // Determine if user has chosen ZERO devices to be surfaced + const std::string& envVal = core::Runtime::runtime_singleton_->flag().visible_gpus(); + if (envVal.empty()) { + return; + } + + // Parse env value into tokens separated by comma (',') delimiter + std::string token; + char separator = ','; + std::stringstream stream(envVal); + while (getline(stream, token, separator)) { + std::transform(token.begin(), token.end(), token.begin(), ::toupper); + token = trim(token); + rvdTokenList_.push_back(token); + } +} + +void RvdFilter::BuildDeviceUuidList(uint32_t numNodes) { + HSAKMT_STATUS status; + HsaNodeProperties props = {0}; + for (HSAuint32 idx = 0; idx < numNodes; idx++) { + // Query for node properties and ignore Cpu devices + status = hsaKmtGetNodeProperties(idx, &props); + if (status != HSAKMT_STATUS_SUCCESS) { + continue; + } + if (props.NumFComputeCores == 0) { + continue; + } + + // For devices whose UUID is zero build a string that + // will not match user provided value + if (props.UniqueID == 0) { + devUuidList_.push_back("Invalid-UUID"); + continue; + } + + // For devices that support valid UUID values capture UUID + // value into a upper case hex string of length 16 including + // leading zeros if necessary + 
+ std::stringstream stream; + stream << "GPU-" << std::setfill('0') << std::setw(sizeof(uint64_t) * 2) << std::hex + << props.UniqueID; + std::string uuidVal(stream.str()); + std::transform(uuidVal.begin(), uuidVal.end(), uuidVal.begin(), ::toupper); + devUuidList_.push_back(uuidVal); + } +} + +int32_t RvdFilter::ProcessUuidToken(const std::string& token) { + // Determine if token exceeds max length of a UUID string + uint32_t tokenLen = token.length(); + if ((tokenLen < 5) || (tokenLen > 20)) { + return -1; + } + + // Track the number of devices user token matches + int32_t devIdx = -1; + int32_t compareVal = -1; + uint32_t numGpus = devUuidList_.size(); + for (uint32_t idx = 0; idx < numGpus; idx++) { + uint32_t uuidLen = devUuidList_[idx].length(); + + // Token could match UUID of another device + if (tokenLen > uuidLen) { + compareVal = -1; + continue; + } + + // Token could match as substring of device UUID + compareVal = token.compare(0, tokenLen, devUuidList_[idx], 0, tokenLen); + + // Check if user Uuid matches with ROCt Uuid + if (compareVal == 0) { + if (devIdx != -1) { + return -1; + } + devIdx = idx; + } + } + + // Return value includes possibility of both + // finding or not finding a device + return devIdx; +} + +uint32_t RvdFilter::BuildUsrDeviceList() { + // Evaluate tokens in order, the first Illegal or Terminating token ends evaluation + uint32_t numGpus = devUuidList_.size(); + uint32_t loopCnt = std::min(numGpus, uint32_t(rvdTokenList_.size())); + + int32_t usrIdx = 0; + int32_t devIdx = -1; + for (uint32_t idx = 0; idx < loopCnt; idx++) { + // An empty token e.g. from "0,,1" is Illegal, check before reading token[0] + const std::string& token = rvdTokenList_[idx]; + if (token.empty()) { + return usrDeviceList_.size(); + } + + // Token encodes a UUID value + if (token[0] == 'G') { + devIdx = ProcessUuidToken(token); + if (devIdx == -1) { + return usrDeviceList_.size(); + } + + // Token encodes device index. Range check the parsed value as a long, + // before narrowing, so an oversized index can't wrap to a valid one + } else { + char* end = nullptr; 
+ long val = std::strtol(token.c_str(), &end, 0); + if ((*end != '\0') || (val < 0) || (val >= long(numGpus))) { + return usrDeviceList_.size(); + } + devIdx = int32_t(val); + } + + // Rvd Token evaluates to wrong device index + if ((devIdx < 0) || (uint32_t(devIdx) >= numGpus)) { + return usrDeviceList_.size(); + } + + // A device index that was previously selected is a terminator + if (usrDeviceList_.find(devIdx) != usrDeviceList_.end()) { + return usrDeviceList_.size(); + } + + // Record the rank at which this device is surfaced upon enumeration + usrDeviceList_[devIdx] = usrIdx++; + } + + return usrDeviceList_.size(); +} + +uint32_t RvdFilter::GetUsrDeviceListSize() { return usrDeviceList_.size(); } + +int32_t RvdFilter::GetUsrDeviceRank(uint32_t roctIdx) { + const auto& it = usrDeviceList_.find(roctIdx); + if (it != usrDeviceList_.end()) { + return it->second; + } + return -1; +} + +#ifndef NDEBUG +void RvdFilter::SetDeviceUuidList() { + uint64_t dbgUuid[] = {0xBABABABABABABABA, 0xBABABABABABAABBA, 0xBABABABAABBAABBA, + 0xBABAABBAABBAABBA, 0xABBAABBAABBAABBA, 0xABBAABBAABBABABA, + 0xABBAABBABABABABA, 0xABBABABABABABABA}; + + // Override or Set Uuid values for the first four devices + uint32_t numGpus = devUuidList_.size(); + uint32_t numUuids = (sizeof(dbgUuid) / sizeof(uint64_t)); + for (uint32_t idx = 0; (idx < numGpus && (idx < numUuids)); idx++) { + std::stringstream stream; + + // For devices whose UUID is zero + if (dbgUuid[idx] == 0) { + stream << "GPU-XX"; + continue; + } + + // For devices that support valid UUID values + stream << "GPU-" << std::setfill('0') << std::setw(sizeof(uint64_t) * 2) << std::hex + << dbgUuid[idx]; + std::string uuidVal(stream.str()); + std::transform(uuidVal.begin(), uuidVal.end(), uuidVal.begin(), ::toupper); + devUuidList_[idx] = uuidVal; + } +} + +void RvdFilter::PrintDeviceUuidList() { + uint32_t numGpus = devUuidList_.size(); + for (uint32_t idx = 0; idx < numGpus; idx++) { + std::cout << "Dev[" << idx << "]: " << devUuidList_[idx]; + std::cout << std::endl 
<< std::flush; + } +} + +void RvdFilter::PrintUsrDeviceList() { + // Flip the map values as value indicates surface rank + for (auto const& elem : usrDeviceList_) { + std::cout << "UsrDev[" << elem.second << "]: " << elem.first; + std::cout << std::endl << std::flush; + } +} + +void RvdFilter::PrintRvdTokenList() { + uint32_t numTokens = rvdTokenList_.size(); + for (uint32_t idx = 0; idx < numTokens; idx++) { + std::cout << "Token[" << idx << "]: " << rvdTokenList_[idx]; + std::cout << std::endl << std::flush; + } +} +#endif + +} // namespace amd diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp index 1dca41375..33fbf6737 100644 --- a/src/core/runtime/amd_gpu_agent.cpp +++ b/src/core/runtime/amd_gpu_agent.cpp @@ -51,6 +51,7 @@ #include #include #include +#include #include "core/inc/amd_aql_queue.h" #include "core/inc/amd_blit_kernel.h" @@ -136,7 +137,7 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props) GpuAgent::~GpuAgent() { for (auto& blit : blits_) { - if (blit.created()) { + if (!blit.empty()) { hsa_status_t status = blit->Destroy(*this); assert(status == HSA_STATUS_SUCCESS); } @@ -597,7 +598,7 @@ void GpuAgent::InitGWS() { if (status != HSAKMT_STATUS_SUCCESS) throw AMD::hsa_exception(HSA_STATUS_ERROR_OUT_OF_RESOURCES, "GWS allocation failed."); - queue->amd_queue_.hsa_queue.type = HSA_QUEUE_TYPE_COOPERATIVE | HSA_QUEUE_TYPE_MULTI; + queue->amd_queue_.hsa_queue.type = HSA_QUEUE_TYPE_COOPERATIVE; gws_queue_.ref_ct_ = 0; return queue.release(); }); @@ -679,7 +680,7 @@ hsa_status_t GpuAgent::DmaFill(void* ptr, uint32_t value, size_t count) { hsa_status_t GpuAgent::EnableDmaProfiling(bool enable) { for (auto& blit : blits_) { - if (blit.created()) { + if (!blit.empty()) { const hsa_status_t stat = blit->EnableProfiling(enable); if (stat != HSA_STATUS_SUCCESS) { return stat; @@ -900,6 +901,25 @@ hsa_status_t GpuAgent::GetInfo(hsa_agent_info_t attribute, void* value) const { case 
HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES: *((bool*)value) = properties_.NumGws != 0; break; + case HSA_AMD_AGENT_INFO_UUID: { + uint64_t uuid_value = static_cast(properties_.UniqueID); + + // Either device does not support UUID e.g. a Gfx8 device, + // or runtime is using an older thunk library that does not + // support UUID's + if (uuid_value == 0) { + char uuid_tmp[] = "GPU-XX"; + snprintf((char*)value, sizeof(uuid_tmp), "%s", uuid_tmp); + break; + } + + // Device supports UUID, build UUID string to return + std::stringstream ss; + ss << "GPU-" << std::setfill('0') << std::setw(sizeof(uint64_t) * 2) << std::hex + << uuid_value; + snprintf((char*)value, (ss.str().length() + 1), "%s", (char*)ss.str().c_str()); + break; + } default: return HSA_STATUS_ERROR_INVALID_ARGUMENT; break; @@ -913,7 +933,7 @@ hsa_status_t GpuAgent::QueueCreate(size_t size, hsa_queue_type32_t queue_type, uint32_t group_segment_size, core::Queue** queue) { // Handle GWS queues. - if (queue_type & HSA_QUEUE_TYPE_COOPERATIVE) { + if (queue_type == HSA_QUEUE_TYPE_COOPERATIVE) { ScopedAcquire lock(&gws_queue_.lock_); auto ret = (*gws_queue_.queue_).get(); if (ret != nullptr) { diff --git a/src/core/runtime/amd_topology.cpp b/src/core/runtime/amd_topology.cpp index 1b1438fc1..4fe2fac4f 100644 --- a/src/core/runtime/amd_topology.cpp +++ b/src/core/runtime/amd_topology.cpp @@ -41,6 +41,7 @@ //////////////////////////////////////////////////////////////////////////////// #include "core/inc/amd_topology.h" +#include "core/inc/amd_filter_device.h" #include #include @@ -66,76 +67,6 @@ namespace amd { static const uint kKfdVersionMajor = 0; static const uint kKfdVersionMinor = 99; -#ifndef NDEBUG -static bool PrintUsrGpuMap(std::map& gpu_usr_map) { - (void)PrintUsrGpuMap; // Suppress unused symbol warning. 
- std::map::iterator it; - for (it = gpu_usr_map.begin(); it != gpu_usr_map.end(); it++) { - int32_t usrIdx = it->second; - uint32_t kfdIdx = it->first; - std::cout << "KfdIdx: " << kfdIdx << " @ UsrIdx: " << usrIdx << std::endl; - } - return true; -} -#endif - -/** - * Determines if user has defined the env that indicates which - * subset of Gpu's are desired to be surfaced. If defined the - * set of Gpu's are captured into a map of Gpu index and - * - * @return true if env is not blank, false otherwise. It is - * possible to have zero devices surfaced even when env is - * not blank. - */ -static bool MapUsrGpuList(int32_t numNodes, std::map& gpu_usr_map) { - const std::string& env_value = core::Runtime::runtime_singleton_->flag().visible_gpus(); - if (env_value.empty()) { - return false; - } - - // Capture the env value string as a parsable stream - std::stringstream stream(env_value); - - // Read stream until there are no more tokens - int32_t usrIdx = 0; - int32_t token = 0x11231926; - while (!stream.eof()) { - // Read the option value - stream >> token; - if (stream.fail()) { - return true; - } - - // Stop processing input tokens if invalid index is seen - // A value that is less than zero or greater than the - // number of Numa nodes is considered invalid - if ((token < 0) || (token >= numNodes)) { - return true; - } - - // Determine if current value has been seen before - // @note: Currently we are interpreting a repeat as - // an invalid index i.e. 
is equal to -1 - bool exists = gpu_usr_map.find(token) != gpu_usr_map.end(); - if (exists) { - return true; - } - - // Update Gpu User map table - gpu_usr_map[token] = usrIdx++; - - // Ignore the delimiter - if (stream.peek() == ',') { - stream.ignore(); - } else { - return true; - } - } - - return true; -} - CpuAgent* DiscoverCpu(HSAuint32 node_id, HsaNodeProperties& node_prop) { if (node_prop.NumCPUCores == 0) { return nullptr; @@ -281,15 +212,18 @@ void BuildTopology() { core::Runtime::runtime_singleton_->SetLinkCount(props.NumNodes); - // Determine and process user's request to surface - // a subset of Gpu devices + // Query if env ROCR_VISIBLE_DEVICES is defined. If defined + // determine number and order of GPU devices to be surfaced + RvdFilter rvdFilter; int32_t invalidIdx = -1; + uint32_t visibleCnt = 0; std::vector gpu_usr_list; - std::map gpu_usr_map; - bool filter = MapUsrGpuList(props.NumNodes, gpu_usr_map); - int32_t list_sz = gpu_usr_map.size(); + bool filter = RvdFilter::FilterDevices(); if (filter) { - for (int32_t idx = 0; idx < list_sz; idx++) { + rvdFilter.BuildRvdTokenList(); + rvdFilter.BuildDeviceUuidList(props.NumNodes); + visibleCnt = rvdFilter.BuildUsrDeviceList(); + for (int32_t idx = 0; idx < visibleCnt; idx++) { gpu_usr_list.push_back(invalidIdx); } } @@ -302,7 +236,7 @@ void BuildTopology() { continue; } - // Instantiate a Cpu/Apu device + // Instantiate a Cpu device const CpuAgent* cpu = DiscoverCpu(node_id, node_prop); assert(((node_prop.NumCPUCores == 0) || (cpu != nullptr)) && "CPU device failed discovery."); @@ -311,9 +245,9 @@ void BuildTopology() { // visible list, continue if not found if (node_prop.NumFComputeCores != 0) { if (filter) { - const auto& it = gpu_usr_map.find(kfdIdx); - if (it != gpu_usr_map.end()) { - gpu_usr_list[it->second] = node_id; + int32_t devRank = rvdFilter.GetUsrDeviceRank(kfdIdx); + if (devRank != (-1)) { + gpu_usr_list[devRank] = node_id; } } else { gpu_usr_list.push_back(node_id); @@ -328,6 +262,7 @@ 
void BuildTopology() { RegisterLinkInfo(node_id, node_prop.NumIOLinks); } + // Instantiate ROCr objects to encapsulate Gpu devices SurfaceGpuList(gpu_usr_list); } diff --git a/src/core/runtime/hsa.cpp b/src/core/runtime/hsa.cpp index 7de29f965..1d08c2564 100644 --- a/src/core/runtime/hsa.cpp +++ b/src/core/runtime/hsa.cpp @@ -700,13 +700,8 @@ hsa_status_t hsa_queue_create( agent->GetInfo(HSA_AGENT_INFO_QUEUE_TYPE, &agent_queue_type); assert(HSA_STATUS_SUCCESS == status); - if (agent_queue_type == HSA_QUEUE_TYPE_SINGLE && - ((type & HSA_QUEUE_TYPE_SINGLE) != HSA_QUEUE_TYPE_SINGLE)) { - return HSA_STATUS_ERROR_INVALID_QUEUE_CREATION; - } - - if ((type & HSA_QUEUE_TYPE_COOPERATIVE) && - ((type & HSA_QUEUE_TYPE_SINGLE) != HSA_QUEUE_TYPE_MULTI)) { + if ((agent_queue_type == HSA_QUEUE_TYPE_SINGLE) && + (type != HSA_QUEUE_TYPE_SINGLE)) { return HSA_STATUS_ERROR_INVALID_QUEUE_CREATION; } diff --git a/src/core/runtime/isa.cpp b/src/core/runtime/isa.cpp index 2fa7d2872..5b56c84a4 100755 --- a/src/core/runtime/isa.cpp +++ b/src/core/runtime/isa.cpp @@ -207,20 +207,34 @@ const IsaRegistry::IsaMap IsaRegistry::GetSupportedIsas() { ISAREG_ENTRY_GEN(7, 0, 0, false, false) ISAREG_ENTRY_GEN(7, 0, 1, false, false) ISAREG_ENTRY_GEN(7, 0, 2, false, false) + ISAREG_ENTRY_GEN(8, 0, 1, false, false) ISAREG_ENTRY_GEN(8, 0, 1, true, false) ISAREG_ENTRY_GEN(8, 0, 2, false, false) + ISAREG_ENTRY_GEN(8, 0, 2, true, false) ISAREG_ENTRY_GEN(8, 0, 3, false, false) + ISAREG_ENTRY_GEN(8, 0, 3, true, false) + ISAREG_ENTRY_GEN(8, 1, 0, false, false) ISAREG_ENTRY_GEN(8, 1, 0, true, false) ISAREG_ENTRY_GEN(9, 0, 0, false, false) + ISAREG_ENTRY_GEN(9, 0, 0, true, false) + ISAREG_ENTRY_GEN(9, 0, 2, false, false) ISAREG_ENTRY_GEN(9, 0, 2, true, false) ISAREG_ENTRY_GEN(9, 0, 4, false, false) + ISAREG_ENTRY_GEN(9, 0, 4, true, false) ISAREG_ENTRY_GEN(9, 0, 6, false, false) + ISAREG_ENTRY_GEN(9, 0, 6, true, false) ISAREG_ENTRY_GEN(9, 0, 6, false, true ) - ISAREG_ENTRY_GEN(9, 0, 8, false, true ) + 
ISAREG_ENTRY_GEN(9, 0, 6, true, true ) ISAREG_ENTRY_GEN(9, 0, 8, false, false) + ISAREG_ENTRY_GEN(9, 0, 8, true, false) + ISAREG_ENTRY_GEN(9, 0, 8, false, true ) + ISAREG_ENTRY_GEN(9, 0, 8, true, true ) ISAREG_ENTRY_GEN(10, 1, 0, false, false) + ISAREG_ENTRY_GEN(10, 1, 0, true, false) ISAREG_ENTRY_GEN(10, 1, 1, false, false) + ISAREG_ENTRY_GEN(10, 1, 1, true, false) ISAREG_ENTRY_GEN(10, 1, 2, false, false) + ISAREG_ENTRY_GEN(10, 1, 2, true, false) return supported_isas; } diff --git a/src/core/runtime/runtime.cpp b/src/core/runtime/runtime.cpp index 763efcdad..dd6a15ca2 100644 --- a/src/core/runtime/runtime.cpp +++ b/src/core/runtime/runtime.cpp @@ -381,6 +381,8 @@ hsa_status_t Runtime::DeregisterReleaseNotifier(void* ptr, } hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) { + void* source = const_cast(src); + // Choose agents from pointer info bool is_src_system = false; bool is_dst_system = false; @@ -388,54 +390,65 @@ hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) { core::Agent* dst_agent; // Fetch ownership - const auto& is_system_mem = [&](void* ptr, core::Agent*& agent) { + const auto& is_system_mem = [&](void* ptr, core::Agent*& agent, bool& need_lock) { hsa_amd_pointer_info_t info; + uint32_t count; + hsa_agent_t* accessible = nullptr; + MAKE_SCOPE_GUARD([&]() { free(accessible); }); info.size = sizeof(info); - hsa_status_t err = PtrInfo(ptr, &info, nullptr, nullptr, nullptr); + hsa_status_t err = PtrInfo(ptr, &info, malloc, &count, &accessible); if (err != HSA_STATUS_SUCCESS) throw AMD::hsa_exception(err, "PtrInfo failed in hsa_memory_copy."); ptrdiff_t endPtr = (ptrdiff_t)ptr + size; if (info.agentBaseAddress <= ptr && endPtr <= (ptrdiff_t)info.agentBaseAddress + info.sizeInBytes) { + if (info.agentOwner.handle == 0) info.agentOwner = accessible[0]; agent = core::Agent::Convert(info.agentOwner); + need_lock = false; return agent->device_type() != core::Agent::DeviceType::kAmdGpuDevice; } else { + 
need_lock = true; agent = cpu_agents_[0]; return true; } }; - is_src_system = is_system_mem(const_cast(src), src_agent); - is_dst_system = is_system_mem(dst, dst_agent); + bool src_lock, dst_lock; + is_src_system = is_system_mem(source, src_agent, src_lock); + is_dst_system = is_system_mem(dst, dst_agent, dst_lock); // CPU-CPU if (is_src_system && is_dst_system) { - memcpy(dst, src, size); + memcpy(dst, source, size); return HSA_STATUS_SUCCESS; } // Same GPU - if (src_agent->node_id() == dst_agent->node_id()) return dst_agent->DmaCopy(dst, src, size); + if (src_agent->node_id() == dst_agent->node_id()) return dst_agent->DmaCopy(dst, source, size); // GPU-CPU // Must ensure that system memory is visible to the GPU during the copy. const amd::MemoryRegion* system_region = static_cast(system_regions_fine_[0]); - const auto& locked_copy = [&](void* ptr, core::Agent* locking_agent, bool locking_src) { - void* gpuPtr; + void* gpuPtr = nullptr; + const auto& locked_copy = [&](void*& ptr, core::Agent* locking_agent) { + void* tmp; hsa_agent_t agent = locking_agent->public_handle(); - hsa_status_t err = system_region->Lock(1, &agent, ptr, size, &gpuPtr); - if (err != HSA_STATUS_SUCCESS) return err; - MAKE_SCOPE_GUARD([&]() { system_region->Unlock(ptr); }); - if (locking_src) - return locking_agent->DmaCopy(dst, gpuPtr, size); - else - return locking_agent->DmaCopy(gpuPtr, src, size); + hsa_status_t err = system_region->Lock(1, &agent, ptr, size, &tmp); + if (err != HSA_STATUS_SUCCESS) throw AMD::hsa_exception(err, "Lock failed in hsa_memory_copy."); + gpuPtr = ptr; + ptr = tmp; }; - if (is_src_system) return locked_copy(const_cast(src), dst_agent, true); - if (is_dst_system) return locked_copy(dst, src_agent, false); + MAKE_SCOPE_GUARD([&]() { + if (gpuPtr != nullptr) system_region->Unlock(gpuPtr); + }); + + if (src_lock) locked_copy(source, dst_agent); + if (dst_lock) locked_copy(dst, src_agent); + if (is_src_system) return dst_agent->DmaCopy(dst, source, size); + if 
(is_dst_system) return src_agent->DmaCopy(dst, source, size); /* GPU-GPU - functional support, not a performance path. @@ -448,7 +461,7 @@ hsa_status_t Runtime::CopyMemory(void* dst, const void* src, size_t size) { void* temp = nullptr; system_region->Allocate(size, core::MemoryRegion::AllocateNoFlags, &temp); MAKE_SCOPE_GUARD([&]() { system_region->Free(temp, size); }); - hsa_status_t err = src_agent->DmaCopy(temp, src, size); + hsa_status_t err = src_agent->DmaCopy(temp, source, size); if (err == HSA_STATUS_SUCCESS) err = dst_agent->DmaCopy(dst, temp, size); return err; } @@ -876,6 +889,8 @@ hsa_status_t Runtime::IPCCreate(void* ptr, size_t len, hsa_amd_ipc_memory_t* han info.size = sizeof(info); if (PtrInfo(ptr, &info, nullptr, nullptr, nullptr, &block) != HSA_STATUS_SUCCESS) return HSA_STATUS_ERROR_INVALID_ARGUMENT; + if ((info.agentBaseAddress != ptr) || (info.sizeInBytes != len)) + return HSA_STATUS_ERROR_INVALID_ARGUMENT; if ((block.base != ptr) || (block.length != len)) { if (!IsMultipleOf(block.base, 2 * 1024 * 1024)) { assert(false && "Fragment's block not aligned to 2MB!"); @@ -1335,12 +1350,14 @@ void Runtime::LoadExtensions() { "libhsa-ext-image.so.1"}; #endif - // Update Hsa Api Table with handle of Image extension Apis - extensions_.LoadFinalizer(kFinalizerLib[os_index(os::current_os)]); + // Update Hsa Api Table with handle of Finalizer extension Apis + // Skipping finalizer loading since finalizer is no longer distributed. + // LinkExts will expose the finalizer-not-present implementation. 
+ // extensions_.LoadFinalizer(kFinalizerLib[os_index(os::current_os)]); hsa_api_table_.LinkExts(&extensions_.finalizer_api, core::HsaApiTable::HSA_EXT_FINALIZER_API_TABLE_ID); - // Update Hsa Api Table with handle of Finalizer extension Apis + // Update Hsa Api Table with handle of Image extension Apis extensions_.LoadImage(kImageLib[os_index(os::current_os)]); hsa_api_table_.LinkExts(&extensions_.image_api, core::HsaApiTable::HSA_EXT_IMAGE_API_TABLE_ID); diff --git a/src/core/util/flag.h b/src/core/util/flag.h index 272e5f1a4..8f4a47432 100644 --- a/src/core/util/flag.h +++ b/src/core/util/flag.h @@ -72,6 +72,7 @@ class Flag { enable_sdma_ = os::GetEnvVar("HSA_ENABLE_SDMA"); visible_gpus_ = os::GetEnvVar("ROCR_VISIBLE_DEVICES"); + filter_visible_gpus_ = os::IsEnvVarSet("ROCR_VISIBLE_DEVICES"); var = os::GetEnvVar("HSA_RUNNING_UNDER_VALGRIND"); running_valgrind_ = (var == "1") ? true : false; @@ -144,6 +145,8 @@ class Flag { std::string visible_gpus() const { return visible_gpus_; } + bool filter_visible_gpus() const { return filter_visible_gpus_; } + uint32_t max_queues() const { return max_queues_; } size_t scratch_mem_size() const { return scratch_mem_size_; } @@ -167,6 +170,7 @@ class Flag { std::string enable_sdma_; + bool filter_visible_gpus_; std::string visible_gpus_; uint32_t max_queues_; diff --git a/src/core/util/lazy_ptr.h b/src/core/util/lazy_ptr.h index dbeb1f764..bb2583c71 100644 --- a/src/core/util/lazy_ptr.h +++ b/src/core/util/lazy_ptr.h @@ -87,6 +87,7 @@ template class lazy_ptr { const std::unique_ptr& operator->() const { make(true); + assert(obj != nullptr && "Null dereference through lazy_ptr."); return obj; } @@ -107,7 +108,17 @@ template class lazy_ptr { void touch() const { make(false); } // Tells if the lazy object has been constructed or not. - bool created() const { return obj != nullptr; } + // Construction may fail silently (return nullptr). 
+ bool created() const { + std::atomic_thread_fence(std::memory_order_acquire); + return func == nullptr; + } + + // Tells if the lazy object exists or not. + bool empty() const { + std::atomic_thread_fence(std::memory_order_acquire); + return obj == nullptr; + } private: mutable std::unique_ptr obj; @@ -122,16 +133,15 @@ template class lazy_ptr { return; } MAKE_SCOPE_GUARD([&]() { lock.Release(); }); - if (obj != nullptr) return; + if (func == nullptr) return; T* ptr = func(); - std::atomic_thread_fence(std::memory_order_release); obj.reset(ptr); + std::atomic_thread_fence(std::memory_order_release); func = nullptr; } __forceinline void make(bool block) const { - std::atomic_thread_fence(std::memory_order_acquire); - if (obj == nullptr) { + if (!created()) { make_body(block); } } diff --git a/src/core/util/lnx/os_linux.cpp b/src/core/util/lnx/os_linux.cpp index 24974185a..fdbe19a29 100644 --- a/src/core/util/lnx/os_linux.cpp +++ b/src/core/util/lnx/os_linux.cpp @@ -237,6 +237,12 @@ bool WaitForAllThreads(Thread* threads, uint threadCount) { return true; } +bool IsEnvVarSet(std::string env_var_name) { + char* buff = NULL; + buff = getenv(env_var_name.c_str()); + return (buff != NULL); +} + void SetEnvVar(std::string env_var_name, std::string env_var_value) { setenv(env_var_name.c_str(), env_var_value.c_str(), 1); } diff --git a/src/core/util/os.h b/src/core/util/os.h index 510317861..00210cc12 100644 --- a/src/core/util/os.h +++ b/src/core/util/os.h @@ -152,6 +152,12 @@ bool WaitForThread(Thread thread); /// @return: bool. bool WaitForAllThreads(Thread* threads, uint thread_count); +/// @brief: Determines if environment key is set. +/// @param: env_var_name(Input), name of the environment value. +/// @return: bool, true for binding any value to environment key, +/// including an empty string. False otherwise +bool IsEnvVarSet(std::string env_var_name); + /// @brief: Sets the environment value. /// @param: env_var_name(Input), name of the environment value. 
/// @param: env_var_value(Input), value of the environment value.s diff --git a/src/core/util/utils.h b/src/core/util/utils.h old mode 100755 new mode 100644 index f7f09e9d7..fbe7e4760 --- a/src/core/util/utils.h +++ b/src/core/util/utils.h @@ -49,6 +49,9 @@ #include "stddef.h" #include "stdlib.h" #include +#include +#include +#include typedef unsigned int uint; typedef uint64_t uint64; @@ -307,6 +310,22 @@ static __forceinline uint64_t NextPow2(uint64_t value) { static __forceinline bool strIsEmpty(const char* str) noexcept { return str[0] == '\0'; } +static __forceinline std::string& ltrim(std::string& s) { + auto it = std::find_if(s.begin(), s.end(), + [](char c) { return !std::isspace(c, std::locale::classic()); }); + s.erase(s.begin(), it); + return s; +} + +static __forceinline std::string& rtrim(std::string& s) { + auto it = std::find_if(s.rbegin(), s.rend(), + [](char c) { return !std::isspace(c, std::locale::classic()); }); + s.erase(it.base(), s.end()); + return s; +} + +static __forceinline std::string& trim(std::string& s) { return ltrim(rtrim(s)); } + #include "atomic_helpers.h" #endif // HSA_RUNTIME_CORE_UTIL_UTIIS_H_ diff --git a/src/inc/hsa.h b/src/inc/hsa.h index 54dc78460..0e5936852 100644 --- a/src/inc/hsa.h +++ b/src/inc/hsa.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL @@ -632,7 +632,7 @@ hsa_status_t HSA_API hsa_system_major_extension_supported( uint16_t version_major, uint16_t *version_minor, bool* result); - + /** * @deprecated @@ -711,7 +711,7 @@ hsa_status_t HSA_API hsa_system_get_major_extension_table( uint16_t extension, uint16_t version_major, size_t table_length, - void *table); + void *table); /** * @brief Struct containing an opaque handle to an agent, a device that participates in @@ -1283,7 +1283,7 @@ hsa_status_t HSA_API hsa_agent_major_extension_supported( uint16_t version_major, uint16_t *version_minor, bool* result); - + /** @} */ @@ -2184,24 +2184,25 @@ typedef struct hsa_region_s { */ typedef enum { /** - * Queue supports multiple producers. + * Queue supports multiple producers. Use of multiproducer queue mechanics is + * required. */ HSA_QUEUE_TYPE_MULTI = 0, /** * Queue only supports a single producer. 
In some scenarios, the application * may want to limit the submission of AQL packets to a single agent. Queues * that support a single producer may be more efficient than queues supporting - * multiple producers. + * multiple producers. Use of multiproducer queue mechanics is not supported. */ HSA_QUEUE_TYPE_SINGLE = 1, /** - * Queue supports cooperative dispatches able to use GWS synchronization. - * Queues of this type must also be of type HSA_QUEUE_TYPE_MULTI and - * may be limited in number. The runtime may return the same queue to serve - * multiple hsa_queue_create calls when this type is given. Callers must - * inspect the returned queue to discover queue size. Queues of this type - * are reference counted and require a matching number of hsa_queue_destroy - * calls to release. Use of multiproducer queue mechanics is required. See + * Queue supports multiple producers and cooperative dispatches. Cooperative + * dispatches are able to use GWS synchronization. Queues of this type may be + * limited in number. The runtime may return the same queue to serve multiple + * ::hsa_queue_create calls when this type is given. Callers must inspect the + * returned queue to discover queue size. Queues of this type are reference + * counted and require a matching number of ::hsa_queue_destroy calls to + * release. Use of multiproducer queue mechanics is required. See * ::HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES to query agent support for this * type. */ diff --git a/src/inc/hsa_ext_amd.h b/src/inc/hsa_ext_amd.h index 294e62f2a..ff00177a9 100644 --- a/src/inc/hsa_ext_amd.h +++ b/src/inc/hsa_ext_amd.h @@ -163,7 +163,17 @@ typedef enum hsa_amd_agent_info_s { * Queries for support of cooperative queues. See ::HSA_QUEUE_TYPE_COOPERATIVE. * The type of this attribute is bool. */ - HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES = 0xA010 + HSA_AMD_AGENT_INFO_COOPERATIVE_QUEUES = 0xA010, + /** + * Queries UUID of an agent. 
The value is an Ascii string with a maximum + * of 21 chars including NUL. The string value consists of two parts: header + * and body. The header identifies device type (GPU, CPU, DSP) while body + * encodes UUID as a 16 digit hex string + * + * Agents that do not support UUID will return the string "GPU-XX" or + * "CPU-XX" or "DSP-XX" depending upon their device type ::hsa_device_type_t + */ + HSA_AMD_AGENT_INFO_UUID = 0xA011 } hsa_amd_agent_info_t; typedef struct hsa_amd_hdp_flush_s { diff --git a/src/loader/executable.cpp b/src/loader/executable.cpp index 6299983af..25c30736a 100644 --- a/src/loader/executable.cpp +++ b/src/loader/executable.cpp @@ -42,14 +42,18 @@ #include "executable.hpp" +#include +#include +#include +#include + #include #include #include +#include #include #include #include -#include -#include #include "inc/amd_hsa_elf.h" #include "inc/amd_hsa_kernel_code.h" #include "core/inc/amd_hsa_code.hpp" @@ -68,7 +72,10 @@ __attribute__((noinline)) static void _loader_debug_state() { static volatile int function_needs_a_side_effect = 0; function_needs_a_side_effect ^= 1; } -HSA_API r_debug _amdgpu_r_debug = {1, +// r_version history: +// 1: Initial debug protocol +// 2: New trap handler ABI. The reason for halting a wave is recorded in ttmp11[8:7]. +HSA_API r_debug _amdgpu_r_debug = {2, nullptr, reinterpret_cast(&_loader_debug_state), r_debug::RT_CONSISTENT, @@ -1859,9 +1866,65 @@ hsa_status_t ExecutableImpl::Freeze(const char *options) { std::stringstream ss; uint64_t elf_begin = lco->getElfData(); uint64_t elf_size = lco->getElfSize(); - ss << "file:///proc/" << getpid() << "/mem#" - << "offset=" << std::hex << std::showbase << elf_begin << "&" - << "size=" << elf_size; + + struct args { + ElfW(Addr) mem_addr; + size_t callback_num; + const char *file_name; + size_t file_offset; + } data{ elf_begin, 0 }; + + // Iterate the loaded shared objects program headers to see if the elf binary + // is allocated in a mapped file. 
+ if (dl_iterate_phdr([](struct dl_phdr_info *info, size_t size, void *ptr) -> int { + struct args *data = (struct args *) ptr; + const ElfW(Addr) reladdr = data->mem_addr - info->dlpi_addr; + + int n = info->dlpi_phnum; + while (--n >= 0) { + if (info->dlpi_phdr[n].p_type == PT_LOAD + && reladdr - info->dlpi_phdr[n].p_vaddr >= 0 + && reladdr - info->dlpi_phdr[n].p_vaddr < info->dlpi_phdr[n].p_memsz) { + // The first callback is always the program executable. + if (!info->dlpi_name[0] && data->callback_num == 0) { + static char argv0[PATH_MAX] = {0}; + if (!argv0[0] && readlink("/proc/self/exe", argv0, sizeof(argv0)) == -1) + return 0; + data->file_name = argv0; + } else { + data->file_name = info->dlpi_name; + } + + data->file_offset = reladdr - info->dlpi_phdr[n].p_vaddr + info->dlpi_phdr[n].p_offset; + return 1; + } + } + + ++data->callback_num; + return 0; + }, &data)) { + unsigned char c; + + ss.fill('0'); + ss << "file://"; + + while ((c = *data.file_name++) != '\0') { + // %-encode the file name + if (isalnum(c) || c == '/' || c == '-' || c == '_' || c == '.' || c == '~') { + ss << c; + } else { + ss << std::uppercase; + ss << '%' << std::hex << std::setw(2) << static_cast(c); + ss << std::nouppercase; + } + } + ss << "#offset=" << std::dec << data.file_offset + << "&size=" << std::dec << elf_size; + } else { + ss << "file:///proc/" << getpid() << "/mem#" + << "offset=" << std::hex << std::showbase << elf_begin << "&" + << "size=" << std::dec << elf_size; + } lco->r_debug_info.l_addr = lco->getDelta(); lco->r_debug_info.l_name = strdup(ss.str().c_str()); lco->r_debug_info.l_prev = nullptr;