From ae653deee7c15ca1ef81d1f18204d533d65c8ccb Mon Sep 17 00:00:00 2001 From: Sean Keely Date: Tue, 17 Dec 2019 19:21:45 -0600 Subject: [PATCH] ROCm 3.0.0 updates --- src/CMakeLists.txt | 77 ++++------ src/cmake_modules/utils.cmake | 153 +++++++++++++++---- src/core/inc/amd_gpu_shaders.h | 228 ++++++++++++++++++++--------- src/core/inc/amd_hsa_loader.hpp | 20 ++- src/core/runtime/amd_gpu_agent.cpp | 1 + src/core/runtime/hsa.cpp | 2 +- src/core/util/flag.h | 6 + src/loader/executable.cpp | 89 ++++++++++- src/loader/executable.hpp | 6 + 9 files changed, 427 insertions(+), 155 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2c8eea700..f29218229 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -42,9 +42,6 @@ cmake_minimum_required ( VERSION 3.5.0 ) -## Verbose output. -set ( CMAKE_VERBOSE_MAKEFILE on ) - ## Set core runtime module name and project name. set ( CORE_RUNTIME_NAME "hsa-runtime" ) set ( CORE_RUNTIME_TARGET "${CORE_RUNTIME_NAME}64" ) @@ -85,24 +82,19 @@ endif() ## Get the package version. 
get_version ( "1.1.9" ) +set (SO_MAJOR 1) +set (SO_MINOR 1) +set (SO_PATCH 9) -set ( BUILD_VERSION_MAJOR ${VERSION_MAJOR} ) -set ( BUILD_VERSION_MINOR ${VERSION_MINOR} ) -set ( BUILD_VERSION_PATCH ${VERSION_PATCH} ) -set ( LIB_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) - -if ( VERSION_BUILD ) - set ( BUILD_VERSION_PATCH "${BUILD_VERSION_PATCH}-${VERSION_BUILD}" ) -endif () -set ( BUILD_VERSION_STRING "${BUILD_VERSION_MAJOR}.${BUILD_VERSION_MINOR}.${BUILD_VERSION_PATCH}" ) +set ( SO_VERSION_STRING "${SO_MAJOR}.${SO_MINOR}.${SO_PATCH}" ) +set ( PACKAGE_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH}.${VERSION_COMMIT_COUNT}-${VERSION_JOB}-${VERSION_HASH}" ) ## Find the hsakmt library and include files, use directory hint from cache -find_file ( HSAKMT_INC NAMES "hsakmt.h" "libhsakmt/hsakmt.h" PATHS ${HSAKMT_INC_PATH} ) -find_library ( HSAKMT_LIB "libhsakmt.so" ${HSAKMT_LIB_PATH} ) -get_filename_component ( HSAKMT_INC_PATH ${HSAKMT_INC} DIRECTORY CACHE ) -get_filename_component ( HSAKMT_LIB_PATH ${HSAKMT_LIB} DIRECTORY CACHE ) -unset( HSAKMT_INC CACHE ) -unset( HSAKMT_LIB CACHE ) +## Search relative to build directory, relative to source directory, and finally the rocm install default (/opt/rocm) +get_include_path( HSAKMT_INC_PATH "libhsakmt include path" NAMES "hsakmt.h" "libhsakmt/hsakmt.h" HINTS "${CMAKE_BINARY_DIR}/../../include" "${CMAKE_CURRENT_SOURCE_DIR}/../../../../libhsakmt/include" PATHS "/opt/rocm/include") +get_library_path( HSAKMT_LIB_PATH "libhsakmt library path" NAMES "libhsakmt.so" HINTS "${CMAKE_BINARY_DIR}/../../lib" "${CMAKE_BINARY_DIR}/../roct" PATHS "/opt/rocm/lib") +include_directories ( ${HSAKMT_INC_PATH} ) +link_directories ( ${HSAKMT_LIB_PATH} ) ## Set include directories for ROCr runtime include_directories ( ${CMAKE_CURRENT_SOURCE_DIR} ) @@ -110,16 +102,11 @@ include_directories ( ${CMAKE_CURRENT_SOURCE_DIR}/inc ) include_directories ( ${CMAKE_CURRENT_SOURCE_DIR}/core/inc ) 
include_directories ( ${CMAKE_CURRENT_SOURCE_DIR}/libamdhsacode ) -## Set include and link directories for libhsakmt -include_directories ( ${HSAKMT_INC_PATH} ) -link_directories ( ${HSAKMT_LIB_PATH} ) - ## ROCr build internal versioning -if ( VERSION_BUILD ) - add_definitions ( -DROCR_BUILD_ID=${BUILD_VERSION_STRING} ) -else () - add_definitions ( -DROCR_BUILD_ID="${BUILD_VERSION_STRING}-unknown" ) -endif () +add_definitions ( -DROCR_BUILD_ID=${PACKAGE_VERSION_STRING} ) + +## Set RUNPATH - ../../lib covers use of the legacy symlink in /hsa/lib/ +set(CMAKE_INSTALL_RPATH "$ORIGIN;$ORIGIN/../../lib") ## ------------------------- Linux Compiler and Linker options ------------------------- set ( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror -fexceptions -fno-rtti -fvisibility=hidden -Wno-error=sign-compare -Wno-sign-compare -Wno-write-strings -Wno-conversion-null -fno-math-errno -fno-threadsafe-statics -fmerge-all-constants -fms-extensions -Wno-error=comment -Wno-comment -Wno-error=pointer-arith -Wno-pointer-arith -Wno-error=unused-variable -Wno-error=unused-function" ) @@ -131,9 +118,7 @@ set ( DRVDEF "${CMAKE_CURRENT_SOURCE_DIR}/hsacore.so.def" ) set ( LNKSCR "${CMAKE_CURRENT_SOURCE_DIR}/hsacore.so.link" ) -set ( CMAKE_SHARED_LINKER_FLAGS "-Wl,-Bdynamic -Wl,-z,noexecstack -Wl,${LNKSCR} -Wl,--version-script=${DRVDEF}" ) - -set ( CMAKE_SKIP_BUILD_RPATH TRUE ) +set ( CMAKE_SHARED_LINKER_FLAGS "-Wl,-Bdynamic -Wl,-z,noexecstack -Wl,${LNKSCR} -Wl,--version-script=${DRVDEF} -Wl,--enable-new-dtags" ) ## ------------------------- End Compiler and Linker options ---------------------------- @@ -177,10 +162,6 @@ set ( SRCS "core/util/lnx/os_linux.cpp" add_library( ${CORE_RUNTIME_TARGET} SHARED ${SRCS} ) -## Set the VERSION and SOVERSION values -set_property ( TARGET ${CORE_RUNTIME_TARGET} PROPERTY VERSION "${LIB_VERSION_STRING}" ) -set_property ( TARGET ${CORE_RUNTIME_TARGET} PROPERTY SOVERSION "${BUILD_VERSION_MAJOR}" ) - target_link_libraries ( ${CORE_RUNTIME_TARGET} PRIVATE 
hsakmt elf dl pthread rt @@ -188,36 +169,42 @@ target_link_libraries ( ${CORE_RUNTIME_TARGET} ## Strip should be optional or not at all if ( "${CMAKE_BUILD_TYPE}" STREQUAL Release ) -# add_custom_command ( TARGET ${CORE_RUNTIME_TARGET} POST_BUILD COMMAND ${CMAKE_STRIP} *.so ) +# add_custom_command ( TARGET ${CORE_RUNTIME_TARGET} POST_BUILD COMMAND ${CMAKE_STRIP} $ ) endif () -## Create symlinks for packaging and install -add_custom_target ( hsa-link ALL WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMAND ${CMAKE_COMMAND} -E create_symlink ../hsa/include/hsa hsa-link ) -add_custom_target ( ${CORE_RUNTIME_TARGET}.so-link ALL WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMAND ${CMAKE_COMMAND} -E create_symlink ../hsa/lib/${CORE_RUNTIME_LIBRARY}.so ${CORE_RUNTIME_LIBRARY}.so-link ) +## Set the VERSION and SOVERSION values +set_property ( TARGET ${CORE_RUNTIME_TARGET} PROPERTY VERSION "${SO_VERSION_STRING}" ) +set_property ( TARGET ${CORE_RUNTIME_TARGET} PROPERTY SOVERSION "${SO_MAJOR}" ) + +## Create symlinks for legacy packaging and install +add_custom_target ( hsa_include_link ALL WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMAND ${CMAKE_COMMAND} -E create_symlink ../hsa/include/hsa hsa_include_link ) +add_custom_target ( hsa_lib_link ALL WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMAND ${CMAKE_COMMAND} -E create_symlink ../hsa/lib/${CORE_RUNTIME_LIBRARY}.so ${CORE_RUNTIME_LIBRARY}-link.so ) +add_custom_target ( hsa_lib_link2 ALL WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} COMMAND ${CMAKE_COMMAND} -E create_symlink ../hsa/lib/${CORE_RUNTIME_LIBRARY}.so.${SO_MAJOR} ${CORE_RUNTIME_LIBRARY}-link.so.${SO_MAJOR} ) ## Set install information install ( TARGETS ${CORE_RUNTIME_TARGET} LIBRARY DESTINATION hsa/lib ) install ( DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/inc/ DESTINATION hsa/include/hsa ) -install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/hsa-link DESTINATION include PERMISSIONS OWNER_WRITE OWNER_READ RENAME hsa ) -install ( FILES 
${CMAKE_CURRENT_BINARY_DIR}/${CORE_RUNTIME_LIBRARY}.so-link DESTINATION lib PERMISSIONS OWNER_WRITE OWNER_READ RENAME ${CORE_RUNTIME_LIBRARY}.so ) +install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/hsa_include_link DESTINATION include PERMISSIONS OWNER_WRITE OWNER_READ RENAME hsa ) +install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/${CORE_RUNTIME_LIBRARY}-link.so DESTINATION lib PERMISSIONS OWNER_WRITE OWNER_READ RENAME ${CORE_RUNTIME_LIBRARY}.so ) +install ( FILES ${CMAKE_CURRENT_BINARY_DIR}/${CORE_RUNTIME_LIBRARY}-link.so.${SO_MAJOR} DESTINATION lib PERMISSIONS OWNER_WRITE OWNER_READ RENAME ${CORE_RUNTIME_LIBRARY}.so.${SO_MAJOR} ) ## Packaging directives +set ( CPACK_GENERATOR "DEB;RPM" CACHE STRING "Package types to build") + set ( CPACK_PACKAGE_NAME "hsa-rocr-dev" ) set ( CPACK_PACKAGE_VENDOR "AMD" ) -set ( CPACK_PACKAGE_VERSION_MAJOR ${BUILD_VERSION_MAJOR} ) -set ( CPACK_PACKAGE_VERSION_MINOR ${BUILD_VERSION_MINOR} ) -set ( CPACK_PACKAGE_VERSION_PATCH ${BUILD_VERSION_PATCH} ) +set ( CPACK_PACKAGE_VERSION ${PACKAGE_VERSION_STRING} ) set ( CPACK_PACKAGE_CONTACT "Advanced Micro Devices Inc." 
) set ( CPACK_PACKAGE_DESCRIPTION_SUMMARY "AMD Heterogeneous System Architecture HSA - Linux HSA Runtime for Boltzmann (ROCm) platforms" ) set ( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md" ) # Debian package specific variables -set ( CPACK_DEBIAN_PACKAGE_DEPENDS "hsakmt-roct-dev" ) +set ( CPACK_DEBIAN_PACKAGE_DEPENDS "hsakmt-roct" ) set ( CPACK_DEBIAN_PACKAGE_HOMEPAGE "https://github.com/RadeonOpenCompute/ROCR-Runtime" ) set ( CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/postinst;${CMAKE_CURRENT_SOURCE_DIR}/DEBIAN/prerm" ) ## RPM package specific variables -set ( CPACK_RPM_PACKAGE_DEPENDS "hsakmt-roct-dev" ) +set ( CPACK_RPM_PACKAGE_DEPENDS "hsakmt-roct" ) set ( CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_post" ) set ( CPACK_RPM_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RPM/rpm_postun" ) diff --git a/src/cmake_modules/utils.cmake b/src/cmake_modules/utils.cmake index 0530c87f3..77e441131 100644 --- a/src/cmake_modules/utils.cmake +++ b/src/cmake_modules/utils.cmake @@ -40,6 +40,59 @@ ## ################################################################################ +function( get_path LIB CACHED_PATH HELP ) + + set( options "") + set( oneValueArgs RESULT ) + set( multiValueArgs HINTS NAMES ) + cmake_parse_arguments(ARGS "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN} ) + + # Search for canary file. + if( ${LIB} ) + find_library( FULLPATH NAMES ${ARGS_NAMES} HINTS ${${CACHED_PATH}} ${ARGS_HINTS} ) + else() + find_file( FULLPATH NAMES ${ARGS_NAMES} HINTS ${${CACHED_PATH}} ${ARGS_HINTS} ) + endif() + set( RESULT (NOT ${FULLPATH} MATCHES NOTFOUND) ) + + # Extract path + get_filename_component ( DIRPATH ${FULLPATH} DIRECTORY ) + + # Check path against cache + if( NOT "${${CACHED_PATH}}" STREQUAL "" ) + if ( NOT "${${CACHED_PATH}}" STREQUAL "${DIRPATH}" ) + message(WARNING "${CACHED_PATH} may be incorrect." 
) + set( DIRPATH ${${CACHED_PATH}} ) + endif() + elseif(NOT ${RESULT}) + message(WARNING "${CACHED_PATH} not located during path search.") + endif() + + # Set cache variable and help text + set( ${CACHED_PATH} ${DIRPATH} CACHE PATH ${HELP} FORCE ) + unset( FULLPATH CACHE ) + + # Return success flag + if( NOT ${ARGS_RESULT} STREQUAL "" ) + set( ${ARGS_RESULT} ${RESULT} PARENT_SCOPE) + endif() + +endfunction() + +## Searches for a file using include paths and stores the path to that file in the cache +## using the cached value if set. Search paths are optional. Returns success in RESULT. +## get_include_path( NAMES name1 [name2...] [HINTS path1 [path2 ... ENV var]] [RESULT ] +macro( get_include_path CACHED_PATH HELP ) + get_path( 0 ${ARGV} ) +endmacro() + +## Searches for a file using library paths and stores the path to that file in the cache +## using the cached value if set. Search paths are optional. Returns success in RESULT. +## get_library_path( NAMES name1 [name2...] [HINTS path1 [path2 ... ENV var]] [RESULT ] +macro( get_library_path CACHED_PATH HELP ) + get_path( 1 ${ARGV} ) +endmacro() + ## Parses the VERSION_STRING variable and places ## the first, second and third number values in ## the major, minor and patch variables. 
@@ -58,27 +111,18 @@ function( parse_version VERSION_STRING ) if ( ${VERSION_COUNT} GREATER 0) list ( GET VERSIONS 0 MAJOR ) set ( VERSION_MAJOR ${MAJOR} PARENT_SCOPE ) - set ( TEMP_VERSION_STRING "${MAJOR}" ) endif () if ( ${VERSION_COUNT} GREATER 1 ) list ( GET VERSIONS 1 MINOR ) set ( VERSION_MINOR ${MINOR} PARENT_SCOPE ) - set ( TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${MINOR}" ) endif () if ( ${VERSION_COUNT} GREATER 2 ) list ( GET VERSIONS 2 PATCH ) set ( VERSION_PATCH ${PATCH} PARENT_SCOPE ) - set ( TEMP_VERSION_STRING "${TEMP_VERSION_STRING}.${PATCH}" ) - endif () - - if ( VERSION_BUILD ) - set ( VERSION_BUILD "${VERSION_BUILD}" PARENT_SCOPE ) endif () - set ( VERSION_STRING "${TEMP_VERSION_STRING}" PARENT_SCOPE ) - endfunction () ## Gets the current version of the repository @@ -87,30 +131,77 @@ endfunction () ## and a library version string. function ( get_version DEFAULT_VERSION_STRING ) - parse_version ( ${DEFAULT_VERSION_STRING} ) - -## find_program ( GIT NAMES git ) -## -## if ( GIT ) -## -## execute_process ( COMMAND git describe --tags --dirty --long -## WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} -## OUTPUT_VARIABLE GIT_TAG_STRING -## OUTPUT_STRIP_TRAILING_WHITESPACE -## RESULT_VARIABLE RESULT ) -## -## if ( ${RESULT} EQUAL 0 ) -## -## parse_version ( ${GIT_TAG_STRING} ) -## -## endif () -## -## endif () - - set( VERSION_STRING "${VERSION_STRING}" PARENT_SCOPE ) + set( VERSION_JOB "local-build" ) + set( VERSION_COMMIT_COUNT 0 ) + set( VERSION_HASH "unknown" ) + + find_program( GIT NAMES git ) + + if( GIT ) + + #execute_process ( COMMAND git describe --tags --dirty --long + # WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + # OUTPUT_VARIABLE GIT_TAG_STRING + # OUTPUT_STRIP_TRAILING_WHITESPACE + # RESULT_VARIABLE RESULT ) + + # Get branch commit (common ancestor) of current branch and master branch. 
+ execute_process(COMMAND git merge-base HEAD origin/HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE GIT_MERGE_BASE + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE RESULT ) + + if( ${RESULT} EQUAL 0 ) + # Count commits from branch point. + execute_process(COMMAND git rev-list --count ${GIT_MERGE_BASE}..HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE VERSION_COMMIT_COUNT + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE RESULT ) + if(NOT ${RESULT} EQUAL 0 ) + set( VERSION_COMMIT_COUNT 0 ) + endif() + endif() + + # Get current short hash. + execute_process(COMMAND git rev-parse --short HEAD + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + OUTPUT_VARIABLE VERSION_HASH + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE RESULT ) + if( ${RESULT} EQUAL 0 ) + # Check for dirty workspace. + execute_process(COMMAND git diff --quiet + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + RESULT_VARIABLE RESULT ) + if(${RESULT} EQUAL 1) + set(VERSION_HASH "${VERSION_HASH}-dirty") + endif() + else() + set( VERSION_HASH "unknown" ) + endif() + endif() + + # Build automation IDs + if(DEFINED ENV{ROCM_BUILD_ID}) + set( VERSION_JOB $ENV{ROCM_BUILD_ID} ) + endif() + + parse_version(${DEFAULT_VERSION_STRING}) + set( VERSION_MAJOR "${VERSION_MAJOR}" PARENT_SCOPE ) set( VERSION_MINOR "${VERSION_MINOR}" PARENT_SCOPE ) set( VERSION_PATCH "${VERSION_PATCH}" PARENT_SCOPE ) - set( VERSION_BUILD "${VERSION_BUILD}" PARENT_SCOPE ) + set( VERSION_COMMIT_COUNT "${VERSION_COMMIT_COUNT}" PARENT_SCOPE ) + set( VERSION_HASH "${VERSION_HASH}" PARENT_SCOPE ) + set( VERSION_JOB "${VERSION_JOB}" PARENT_SCOPE ) + + #message("${VERSION_MAJOR}" ) + #message("${VERSION_MINOR}" ) + #message("${VERSION_PATCH}" ) + #message("${VERSION_COMMIT_COUNT}") + #message("${VERSION_HASH}") + #message("${VERSION_JOB}") endfunction() diff --git a/src/core/inc/amd_gpu_shaders.h b/src/core/inc/amd_gpu_shaders.h index 089390ed6..c8c4243ba 100644 --- 
a/src/core/inc/amd_gpu_shaders.h +++ b/src/core/inc/amd_gpu_shaders.h @@ -110,6 +110,7 @@ static const unsigned int kCodeTrapHandler8[] = { static const unsigned int kCodeTrapHandler9[] = { /* + .set SQ_WAVE_PC_HI_ADDRESS_MASK , 0xFFFF .set SQ_WAVE_PC_HI_TRAP_ID_SHIFT , 16 .set SQ_WAVE_PC_HI_TRAP_ID_SIZE , 8 .set SQ_WAVE_PC_HI_TRAP_ID_BFE , (SQ_WAVE_PC_HI_TRAP_ID_SHIFT | (SQ_WAVE_PC_HI_TRAP_ID_SIZE << 16)) @@ -122,6 +123,11 @@ static const unsigned int kCodeTrapHandler9[] = { .set SIGNAL_CODE_ILLEGAL_INST , (1 << 30) .set SIGNAL_CODE_LLVM_TRAP , (1 << 31) .set MAX_NUM_DOORBELLS_MASK , ((1 << 10) - 1) + .set SENDMSG_M0_DOORBELL_ID_BITS , 12 + .set SENDMSG_M0_DOORBELL_ID_MASK , ((1 << SENDMSG_M0_DOORBELL_ID_BITS) - 1) + .set TTMP11_DEBUG_TRAP_BIT , 7 + .set DEBUG_INTERRUPT_CONTEXT_ID_BIT , 23 + .set INSN_S_ENDPGM_OPCODE , 0xBF810000 .if .amdgcn.gfx_generation_number == 9 .set TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT , 26 @@ -138,27 +144,124 @@ static const unsigned int kCodeTrapHandler9[] = { .error "unsupported target" .endif + // ABI between first and second level trap handler: + // ttmp0 = PC[31:0] + // ttmp1 = 0[2:0], PCRewind[3:0], HostTrap[0], TrapId[7:0], PC[47:32] + // ttmp12 = SQ_WAVE_STATUS + // ttmp14 = TMA[31:0] + // ttmp15 = TMA[63:32] + // gfx9: + // ttmp11 = SQ_WAVE_IB_STS[20:15], 0[17:0], DebugTrap[0], NoScratch[0], WaveIdInWG[5:0] + // gfx10: + // ttmp11 = SQ_WAVE_IB_STS[25], SQ_WAVE_IB_STS[21:15], 0[15:0], DebugTrap[0], NoScratch[0], WaveIdInWG[5:0] + + .macro mGetDoorbellId + s_mov_b32 exec_lo, 0x80000000 + s_sendmsg sendmsg(MSG_GET_DOORBELL) + .wait_sendmsg_\@: + s_nop 7 + s_bitcmp0_b32 exec_lo, 0x1F + s_cbranch_scc0 .wait_sendmsg_\@ + .endm + + .macro mExitTrap + // Restore SQ_WAVE_IB_STS. 
+ .if .amdgcn.gfx_generation_number == 9 + s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) + s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK + s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 + .endif + .if .amdgcn.gfx_generation_number == 10 + s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) + s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK + s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT) + s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK + s_or_b32 ttmp2, ttmp2, ttmp3 + s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 + .endif + + // Restore SQ_WAVE_STATUS. + s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 + s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + s_setreg_b32 hwreg(HW_REG_STATUS), ttmp12 + + // Return to shader at unmodified PC. + s_rfe_b64 [ttmp0, ttmp1] + .endm + trap_entry: + // If trap raised (non-zero trap id) then branch. + s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE + s_cbranch_scc1 .trap_raised + + // If non-masked exception raised then branch. + s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS) + s_and_b32 ttmp3, ttmp2, (SQ_WAVE_TRAPSTS_MEM_VIOL_MASK | SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK) + s_cbranch_scc1 .excp_raised + + // Otherwise trap entered due to single step exception. + .signal_debugger: + s_bitset1_b32 ttmp11, TTMP11_DEBUG_TRAP_BIT + + // Fetch doorbell index for our queue. + s_mov_b32 ttmp2, exec_lo + s_mov_b32 ttmp3, exec_hi + mGetDoorbellId + s_mov_b32 exec_hi, ttmp3 + + // Restore exec_lo, move the doorbell_id into ttmp3 + s_and_b32 exec_lo, exec_lo, SENDMSG_M0_DOORBELL_ID_MASK + s_mov_b32 ttmp3, exec_lo + s_mov_b32 exec_lo, ttmp2 + + // Set the debug interrupt context id. + // FIXME: Make conditional on TTMP11_DEBUG_TRAP_BIT when exceptions are handled. 
+ s_bitset1_b32 ttmp3, DEBUG_INTERRUPT_CONTEXT_ID_BIT + + // Send an interrupt to trigger event notification. + s_mov_b32 ttmp2, m0 + s_mov_b32 m0, ttmp3 + s_nop 0x0 // Manually inserted wait states + s_sendmsg sendmsg(MSG_INTERRUPT) + + // Restore m0 + s_mov_b32 m0, ttmp2 + + // If PC is at an s_endpgm instruction then don't halt the wavefront. + s_and_b32 ttmp1, ttmp1, SQ_WAVE_PC_HI_ADDRESS_MASK + s_load_dword ttmp2, [ttmp0, ttmp1] + s_waitcnt lgkmcnt(0) + s_cmp_eq_u32 ttmp2, INSN_S_ENDPGM_OPCODE + s_cbranch_scc1 .skip_halt + s_or_b32 ttmp12, ttmp12, SQ_WAVE_STATUS_HALT_MASK + + .skip_halt: + mExitTrap + + .excp_raised: // If memory violation without XNACK error then signal queue error. // XNACK error will be handled by VM interrupt, since it has more information. - s_getreg_b32 ttmp2, hwreg(HW_REG_TRAPSTS) - s_and_b32 ttmp4, ttmp2, (SQ_WAVE_TRAPSTS_MEM_VIOL_MASK | SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK) - s_cmp_eq_u32 ttmp4, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK - s_mov_b32 ttmp4, SIGNAL_CODE_MEM_VIOL + s_and_b32 ttmp3, ttmp2, (SQ_WAVE_TRAPSTS_MEM_VIOL_MASK | SQ_WAVE_TRAPSTS_XNACK_ERROR_MASK) + s_cmp_eq_u32 ttmp3, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK + s_mov_b32 ttmp3, SIGNAL_CODE_MEM_VIOL s_cbranch_scc1 .signal_error // If illegal instruction then signal queue error. - s_and_b32 ttmp4, ttmp2, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK - s_mov_b32 ttmp4, SIGNAL_CODE_ILLEGAL_INST + s_and_b32 ttmp3, ttmp2, SQ_WAVE_TRAPSTS_ILLEGAL_INST_MASK + s_mov_b32 ttmp3, SIGNAL_CODE_ILLEGAL_INST s_cbranch_scc1 .signal_error - // If any other exception then return to shader. - s_bfe_u32 ttmp2, ttmp1, SQ_WAVE_PC_HI_TRAP_ID_BFE - s_cbranch_scc0 .exit_trap + // Otherwise (memory violation with XNACK error) return to shader. + s_branch .exit_trap + + .trap_raised: + // If debugger trap (s_trap >= 3) then signal debugger. + s_cmp_ge_u32 ttmp2, 0x3; + s_cbranch_scc1 .signal_debugger - // If llvm.trap then signal queue error. + // If llvm.trap (s_trap 2) then signal queue error. 
s_cmp_eq_u32 ttmp2, 0x2 - s_mov_b32 ttmp4, SIGNAL_CODE_LLVM_TRAP + s_mov_b32 ttmp3, SIGNAL_CODE_LLVM_TRAP s_cbranch_scc1 .signal_error // For other traps advance PC and return to shader. @@ -167,13 +270,11 @@ static const unsigned int kCodeTrapHandler9[] = { s_branch .exit_trap .signal_error: + // FIXME: don't trash ttmp4/ttmp5 when exception handling is unified. + s_mov_b32 ttmp4, ttmp3 + // Fetch doorbell index for our queue. - s_mov_b32 exec_lo, 0x80000000 - s_sendmsg sendmsg(MSG_GET_DOORBELL) - .wait_sendmsg: - s_nop 7 - s_bitcmp0_b32 exec_lo, 0x1F - s_cbranch_scc0 .wait_sendmsg + mGetDoorbellId // Map doorbell index to amd_queue_t* through TMA (doorbell_queue_map). s_and_b32 ttmp2, exec_lo, MAX_NUM_DOORBELLS_MASK @@ -192,13 +293,13 @@ static const unsigned int kCodeTrapHandler9[] = { // Skip event trigger if the signal value was already non-zero. s_or_b32 ttmp4, ttmp4, ttmp5 - s_cbranch_scc1 .signal_done + s_cbranch_scc1 .halt_wave // Check for a non-NULL signal event mailbox. s_load_dwordx2 [ttmp4, ttmp5], [ttmp2, ttmp3], 0x10 glc s_waitcnt lgkmcnt(0) s_and_b64 [ttmp4, ttmp5], [ttmp4, ttmp5], [ttmp4, ttmp5] - s_cbranch_scc0 .signal_done + s_cbranch_scc0 .halt_wave // Load the signal event value. s_load_dword ttmp2, [ttmp2, ttmp3], 0x18 glc @@ -213,46 +314,33 @@ static const unsigned int kCodeTrapHandler9[] = { s_nop 0 s_sendmsg sendmsg(MSG_INTERRUPT) - .signal_done: + .halt_wave: // Halt the wavefront. s_or_b32 ttmp12, ttmp12, SQ_WAVE_STATUS_HALT_MASK .exit_trap: - // Restore SQ_WAVE_IB_STS. 
- .if .amdgcn.gfx_generation_number == 9 - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) - s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK - s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 - .endif - .if .amdgcn.gfx_generation_number == 10 - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) - s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK - s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_REPLAY_W64H_SHIFT - SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT) - s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK - s_or_b32 ttmp2, ttmp2, ttmp3 - s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 - .endif - - // Restore SQ_WAVE_STATUS. - s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 - s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 - s_setreg_b32 hwreg(HW_REG_STATUS), ttmp12 - - // Return to shader at unmodified PC. - s_rfe_b64 [ttmp0, ttmp1] + mExitTrap */ - 0xb8eef803, 0x8670ff6e, 0x10000100, 0xbf06ff70, 0x00000100, 0xbef000ff, - 0x20000000, 0xbf85000e, 0x8670ff6e, 0x00000800, 0xbef000f4, 0xbf85000a, - 0x92eeff6d, 0x00080010, 0xbf84002c, 0xbf06826e, 0xbef000ff, 0x80000000, - 0xbf850003, 0x806c846c, 0x826d806d, 0xbf820025, 0xbefe00ff, 0x80000000, - 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0x866eff7e, 0x000003ff, - 0x8e6e836e, 0xc0051bbd, 0x0000006e, 0xbf8cc07f, 0xc0071bb7, 0x000000c0, - 0xbf8cc07f, 0xbef10080, 0xc2831c37, 0x00000008, 0xbf8cc07f, 0x87707170, - 0xbf85000e, 0xc0071c37, 0x00000010, 0xbf8cc07f, 0x86f07070, 0xbf840009, - 0xc0031bb7, 0x00000018, 0xbf8cc07f, 0xc0431bb8, 0x00000000, 0xbf8cc07f, - 0xbefc0080, 0xbf800000, 0xbf900001, 0x8778ff78, 0x00002000, 0x8f6e8b77, - 0x866eff6e, 0x001f8000, 0xb96ef807, 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, - 0xbe801f6c, + + 0x92eeff6d, 0x00080010, 0xbf850036, 0xb8eef803, 0x866fff6e, 0x00000900, + 0xbf850026, 0xbef71a87, 0xbeee007e, 0xbeef007f, 0xbefe00ff, 
0x80000000, + 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff006f, 0x867eff7e, + 0x00000fff, 0xbeef007e, 0xbefe006e, 0xbeef1a97, 0xbeee007c, 0xbefc006f, + 0xbf800000, 0xbf900001, 0xbefc006e, 0x866dff6d, 0x0000ffff, 0xc0021bb6, + 0x00000000, 0xbf8cc07f, 0xbf06ff6e, 0xbf810000, 0xbf850002, 0x8778ff78, + 0x00002000, 0x8f6e8b77, 0x866eff6e, 0x001f8000, 0xb96ef807, 0x86fe7e7e, + 0x86ea6a6a, 0xb978f802, 0xbe801f6c, 0x866fff6e, 0x10000100, 0xbf06ff6f, + 0x00000100, 0xbeef00ff, 0x20000000, 0xbf85000e, 0x866fff6e, 0x00000800, + 0xbeef00f4, 0xbf85000a, 0xbf82002f, 0xbf09836e, 0xbf85ffcc, 0xbf06826e, + 0xbeef00ff, 0x80000000, 0xbf850003, 0x806c846c, 0x826d806d, 0xbf820026, + 0xbef0006f, 0xbefe00ff, 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, + 0xbf84fffd, 0x866eff7e, 0x000003ff, 0x8e6e836e, 0xc0051bbd, 0x0000006e, + 0xbf8cc07f, 0xc0071bb7, 0x000000c0, 0xbf8cc07f, 0xbef10080, 0xc2831c37, + 0x00000008, 0xbf8cc07f, 0x87707170, 0xbf85000e, 0xc0071c37, 0x00000010, + 0xbf8cc07f, 0x86f07070, 0xbf840009, 0xc0031bb7, 0x00000018, 0xbf8cc07f, + 0xc0431bb8, 0x00000000, 0xbf8cc07f, 0xbefc0080, 0xbf800000, 0xbf900001, + 0x8778ff78, 0x00002000, 0x8f6e8b77, 0x866eff6e, 0x001f8000, 0xb96ef807, + 0x86fe7e7e, 0x86ea6a6a, 0xb978f802, 0xbe801f6c, }; static const unsigned int kCodeCopyAligned8[] = { @@ -368,18 +456,26 @@ static const unsigned int kCodeFill10[] = { }; static const unsigned int kCodeTrapHandler10[] = { - 0xB96EF803, 0x8770FF6E, 0x10000100, 0xBF06FF70, 0x00000100, 0xBEF003FF, - 0x20000000, 0xBF85000E, 0x8770FF6E, 0x00000800, 0xBEF003F4, 0xBF85000A, - 0x93EEFF6D, 0x00080010, 0xBF84002C, 0xBF06826E, 0xBEF003FF, 0x80000000, - 0xBF850003, 0x806C846C, 0x826D806D, 0xBF820025, 0xBEFE03FF, 0x80000000, - 0xBF90000A, 0xBF800007, 0xBF0C9F7E, 0xBF84FFFD, 0x876EFF7E, 0x000003FF, - 0x8F6E836E, 0xF4051BBD, 0xDC000000, 0xBF8CC07F, 0xF4051BB7, 0xFA0000C0, - 0xBF8CC07F, 0xBEF10380, 0xF6811C37, 0xFA000008, 0xBF8CC07F, 0x88707170, - 0xBF85000E, 0xF4051C37, 0xFA000010, 0xBF8CC07F, 0x87F07070, 
0xBF840009, - 0xF4011BB7, 0xFA000018, 0xBF8CC07F, 0xF4411BB8, 0xFA000000, 0xBF8CC07F, - 0xBEFC0380, 0xBF800000, 0xBF900001, 0x8878FF78, 0x00002000, 0x906E8977, - 0x876FFF6E, 0x003F8000, 0x906E8677, 0x876EFF6E, 0x02000000, 0x886E6F6E, - 0xB9EEF807, 0x87FE7E7E, 0x87EA6A6A, 0xB9F8F802, 0xBE80226C, + 0x93eeff6d, 0x00080010, 0xbf85003a, 0xb96ef803, 0x876fff6e, 0x00000900, + 0xbf85002a, 0xbef71d87, 0xbeee037e, 0xbeef037f, 0xbefe03ff, 0x80000000, + 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0xbeff036f, 0x877eff7e, + 0x00000fff, 0xbeef037e, 0xbefe036e, 0xbeef1d97, 0xbeee037c, 0xbefc036f, + 0xbf800000, 0xbf900001, 0xbefc036e, 0x876dff6d, 0x0000ffff, 0xf4001bb6, + 0xfa000000, 0xbf8cc07f, 0xbf06ff6e, 0xbf810000, 0xbf850002, 0x8878ff78, + 0x00002000, 0x906e8977, 0x876fff6e, 0x003f8000, 0x906e8677, 0x876eff6e, + 0x02000000, 0x886e6f6e, 0xb9eef807, 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, + 0xbe80226c, 0x876fff6e, 0x10000100, 0xbf06ff6f, 0x00000100, 0xbeef03ff, + 0x20000000, 0xbf85000e, 0x876fff6e, 0x00000800, 0xbeef03f4, 0xbf85000a, + 0xbf82002f, 0xbf09836e, 0xbf85ffc8, 0xbf06826e, 0xbeef03ff, 0x80000000, + 0xbf850003, 0x806c846c, 0x826d806d, 0xbf820026, 0xbef0036f, 0xbefe03ff, + 0x80000000, 0xbf90000a, 0xbf800007, 0xbf0c9f7e, 0xbf84fffd, 0x876eff7e, + 0x000003ff, 0x8f6e836e, 0xf4051bbd, 0xdc000000, 0xbf8cc07f, 0xf4051bb7, + 0xfa0000c0, 0xbf8cc07f, 0xbef10380, 0xf6811c37, 0xfa000008, 0xbf8cc07f, + 0x88707170, 0xbf85000e, 0xf4051c37, 0xfa000010, 0xbf8cc07f, 0x87f07070, + 0xbf840009, 0xf4011bb7, 0xfa000018, 0xbf8cc07f, 0xf4411bb8, 0xfa000000, + 0xbf8cc07f, 0xbefc0380, 0xbf800000, 0xbf900001, 0x8878ff78, 0x00002000, + 0x906e8977, 0x876fff6e, 0x003f8000, 0x906e8677, 0x876eff6e, 0x02000000, + 0x886e6f6e, 0xb9eef807, 0x87fe7e7e, 0x87ea6a6a, 0xb9f8f802, 0xbe80226c, }; } // namespace amd diff --git a/src/core/inc/amd_hsa_loader.hpp b/src/core/inc/amd_hsa_loader.hpp index 4b90f0e2c..b592f3a41 100644 --- a/src/core/inc/amd_hsa_loader.hpp +++ b/src/core/inc/amd_hsa_loader.hpp @@ -2,24 
+2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2015, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL @@ -394,6 +394,10 @@ class Loader { const char *options, hsa_default_float_rounding_mode_t default_float_rounding_mode = HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT) = 0; + + /// @brief Freezes @p executable + virtual hsa_status_t FreezeExecutable(Executable *executable, const char *options) = 0; + /// @brief Destroys @p executable virtual void DestroyExecutable(Executable *executable) = 0; diff --git a/src/core/runtime/amd_gpu_agent.cpp b/src/core/runtime/amd_gpu_agent.cpp index 08950dda9..de865f977 100644 --- a/src/core/runtime/amd_gpu_agent.cpp +++ b/src/core/runtime/amd_gpu_agent.cpp @@ -963,6 +963,7 @@ void GpuAgent::AcquireQueueScratch(ScratchInfo& scratch) { bool large = (scratch.size > single_limit) || (scratch_pool_.size() - scratch_pool_.remaining() + scratch.size > small_limit); large = (isa_->GetMajorVersion() < 8) ? false : large; + large = core::Runtime::runtime_singleton_->flag().no_scratch_reclaim() ? false : large; if (large) scratch.queue_base = scratch_pool_.alloc_high(scratch.size); else diff --git a/src/core/runtime/hsa.cpp b/src/core/runtime/hsa.cpp index bb8e75881..6bc2919d6 100644 --- a/src/core/runtime/hsa.cpp +++ b/src/core/runtime/hsa.cpp @@ -2329,7 +2329,7 @@ hsa_status_t hsa_executable_freeze( return HSA_STATUS_ERROR_INVALID_EXECUTABLE; } - return exec->Freeze(options); + return GetLoader()->FreezeExecutable(exec, options); CATCH; } diff --git a/src/core/util/flag.h b/src/core/util/flag.h index 7f6dc9fc5..68ec90a70 100644 --- a/src/core/util/flag.h +++ b/src/core/util/flag.h @@ -106,6 +106,9 @@ class Flag { var = os::GetEnvVar("HSA_FORCE_FINE_GRAIN_PCIE"); fine_grain_pcie_ = (var == "1") ? true : false; + + var = os::GetEnvVar("HSA_NO_SCRATCH_RECLAIM"); + no_scratch_reclaim_ = (var == "1") ? 
true : false; } bool check_flat_scratch() const { return check_flat_scratch_; } @@ -130,6 +133,8 @@ class Flag { bool fine_grain_pcie() const { return fine_grain_pcie_; } + bool no_scratch_reclaim() const { return no_scratch_reclaim_; } + std::string enable_sdma() const { return enable_sdma_; } std::string visible_gpus() const { return visible_gpus_; } @@ -152,6 +157,7 @@ class Flag { bool disable_fragment_alloc_; bool rev_copy_dir_; bool fine_grain_pcie_; + bool no_scratch_reclaim_; std::string enable_sdma_; diff --git a/src/loader/executable.cpp b/src/loader/executable.cpp index 7fb9a3f41..dc5352096 100644 --- a/src/loader/executable.cpp +++ b/src/loader/executable.cpp @@ -49,17 +49,27 @@ #include #include #include +#include #include "amd_hsa_elf.h" #include "amd_hsa_kernel_code.h" #include "amd_hsa_code.hpp" #include "amd_hsa_code_util.hpp" #include "amd_options.hpp" +#include "core/util/utils.h" #include "AMDHSAKernelDescriptor.h" using namespace amd::hsa; using namespace amd::hsa::common; +static void __attribute__((noinline, optimize(0))) _loader_debug_state() {}; +r_debug _amdgpu_r_debug __attribute__((visibility("default"))) = {1, + nullptr, + reinterpret_cast(&_loader_debug_state), + r_debug::RT_CONSISTENT, + 0}; +static link_map* r_debug_tail = nullptr; + namespace amd { namespace hsa { namespace loader { @@ -140,6 +150,10 @@ Loader* Loader::Create(Context* context) void Loader::Destroy(Loader *loader) { + // Loader resets the link_map, but the executables and loaded code objects are not deleted. 
+ _amdgpu_r_debug.r_map = nullptr; + _amdgpu_r_debug.r_state = r_debug::RT_CONSISTENT; + r_debug_tail = nullptr; delete loader; } @@ -152,9 +166,66 @@ Executable* AmdHsaCodeLoader::CreateExecutable( return executables.back(); } -void AmdHsaCodeLoader::DestroyExecutable(Executable *executable) -{ +static void AddCodeObjectInfoIntoDebugMap(link_map* map) { + if (r_debug_tail) { + r_debug_tail->l_next = map; + map->l_prev = r_debug_tail; + map->l_next = nullptr; + } else { + _amdgpu_r_debug.r_map = map; + map->l_prev = nullptr; + map->l_next = nullptr; + } + r_debug_tail = map; +} + +static void RemoveCodeObjectInfoFromDebugMap(link_map* map) { + if (r_debug_tail == map) { + r_debug_tail = map->l_prev; + } + if (map->l_prev) { + map->l_prev->l_next = map->l_next; + } + if (map->l_next) { + map->l_next->l_prev = map->l_prev; + } + + delete map->l_name; +} + +hsa_status_t AmdHsaCodeLoader::FreezeExecutable(Executable *executable, const char *options) { + hsa_status_t status = executable->Freeze(options); + if (status != HSA_STATUS_SUCCESS) { + return status; + } + + // Assuming runtime atomic implements C++ std::memory_order WriterLockGuard writer_lock(rw_lock_); + atomic::Store(&_amdgpu_r_debug.r_state, r_debug::RT_ADD, std::memory_order_relaxed); + atomic::Fence(std::memory_order_acq_rel); + _loader_debug_state(); + atomic::Fence(std::memory_order_acq_rel); + for (auto &lco : reinterpret_cast(executable)->loaded_code_objects) { + AddCodeObjectInfoIntoDebugMap(&(lco->r_debug_info)); + } + atomic::Store(&_amdgpu_r_debug.r_state, r_debug::RT_CONSISTENT, std::memory_order_release); + _loader_debug_state(); + + return HSA_STATUS_SUCCESS; +} + +void AmdHsaCodeLoader::DestroyExecutable(Executable *executable) { + // Assuming runtime atomic implements C++ std::memory_order + WriterLockGuard writer_lock(rw_lock_); + atomic::Store(&_amdgpu_r_debug.r_state, r_debug::RT_DELETE, std::memory_order_relaxed); + atomic::Fence(std::memory_order_acq_rel); + _loader_debug_state(); +
atomic::Fence(std::memory_order_acq_rel); + for (auto &lco : reinterpret_cast(executable)->loaded_code_objects) { + RemoveCodeObjectInfoFromDebugMap(&(lco->r_debug_info)); + } + atomic::Store(&_amdgpu_r_debug.r_state, r_debug::RT_CONSISTENT, std::memory_order_release); + _loader_debug_state(); executables[((ExecutableImpl*)executable)->id()] = nullptr; delete executable; @@ -1066,8 +1137,7 @@ hsa_status_t ExecutableImpl::LoadCodeObject( s2 = range.substr(mi + 1); std::istringstream is1(s1); is1 >> n1; std::istringstream is2(s2); is2 >> n2; - } - else { + } else { std::istringstream is(range); is >> n1; n2 = n1; } @@ -1775,6 +1845,17 @@ hsa_status_t ExecutableImpl::Freeze(const char *options) { for (auto &ls : lco->LoadedSegments()) { ls->Freeze(); } + // Update code object debug info after it is frozen. + std::stringstream ss; + uint64_t elf_begin = lco->getElfData(); + uint64_t elf_size = lco->getElfSize(); + ss << "file:///proc/" << getpid() << "/mem#" + << "offset=" << std::hex << std::showbase << elf_begin << "&" + << "size=" << elf_size; + lco->r_debug_info.l_addr = lco->getDelta(); + lco->r_debug_info.l_name = strdup(ss.str().c_str()); + lco->r_debug_info.l_prev = nullptr; + lco->r_debug_info.l_next = nullptr; } state_ = HSA_EXECUTABLE_STATE_FROZEN; diff --git a/src/loader/executable.hpp b/src/loader/executable.hpp index d8b9c3726..368b9aefc 100644 --- a/src/loader/executable.hpp +++ b/src/loader/executable.hpp @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -283,6 +284,7 @@ class ExecutableObject { }; class LoadedCodeObjectImpl : public LoadedCodeObject, public ExecutableObject { +friend class AmdHsaCodeLoader; private: LoadedCodeObjectImpl(const LoadedCodeObjectImpl&); LoadedCodeObjectImpl& operator=(const LoadedCodeObjectImpl&); @@ -319,6 +321,8 @@ class LoadedCodeObjectImpl : public LoadedCodeObject, public ExecutableObject { uint64_t getLoadBase() const override; uint64_t getLoadSize() const override; int64_t 
getDelta() const override; + + link_map r_debug_info; }; class Segment : public LoadedSegment, public ExecutableObject { @@ -396,6 +400,7 @@ struct ASH { typedef std::unordered_map AgentSymbolMap; class ExecutableImpl final: public Executable { +friend class AmdHsaCodeLoader; public: const hsa_profile_t& profile() const { return profile_; @@ -563,6 +568,7 @@ class AmdHsaCodeLoader : public Loader { const char *options, hsa_default_float_rounding_mode_t default_float_rounding_mode = HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT) override; + hsa_status_t FreezeExecutable(Executable *executable, const char *options) override; void DestroyExecutable(Executable *executable) override; hsa_status_t IterateExecutables(