From 575e1f502496fc0fdae1340c2809496b17765a97 Mon Sep 17 00:00:00 2001 From: pensun Date: Fri, 16 Sep 2016 11:13:43 -0500 Subject: [PATCH 01/66] resolve merge conflict Change-Id: I041730d4208ae1865820f5df73a67ba60fd4bc3c --- include/hcc_detail/hip_runtime_api.h | 86 +++++++++------------------- 1 file changed, 26 insertions(+), 60 deletions(-) diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h index 3de715cbbc..ace9e121db 100644 --- a/include/hcc_detail/hip_runtime_api.h +++ b/include/hcc_detail/hip_runtime_api.h @@ -43,22 +43,10 @@ THE SOFTWARE. extern "C" { #endif -//--- -//API-visible structures -typedef struct ihipCtx_t *hipCtx_t; - -// Note many APIs also use integer deviceIds as an alternative to the device pointer: -typedef struct ihipDevice_t *hipDevice_t; - typedef struct ihipStream_t *hipStream_t; - -typedef struct ihipModule_t *hipModule_t; - -typedef struct ihipFunction_t *hipFunction_t; - -typedef void* hipDeviceptr_t; - -typedef struct ihipEvent_t *hipEvent_t; +typedef struct hipEvent_t { + struct ihipEvent_t *_handle; +} hipEvent_t; /** @@ -218,7 +206,7 @@ hipError_t hipDeviceReset(void) ; /** * @brief Set default device to be used for subsequent hip API calls from this thread. * - * @param[in] deviceId Valid device in range 0...hipGetDeviceCount(). + * @param[in] device Valid device in range 0...hipGetDeviceCount(). * * Sets @p device as the default device for the calling host thread. Valid device id's are 0... (hipGetDeviceCount()-1). 
* @@ -241,7 +229,7 @@ hipError_t hipDeviceReset(void) ; * * @see hipGetDevice, hipGetDeviceCount */ -hipError_t hipSetDevice(int deviceId); +hipError_t hipSetDevice(int device); /** @@ -257,7 +245,7 @@ hipError_t hipSetDevice(int deviceId); * * @see hipSetDevice, hipGetDevicesizeBytes */ -hipError_t hipGetDevice(int *deviceId); +hipError_t hipGetDevice(int *device); /** @@ -267,7 +255,7 @@ hipError_t hipGetDevice(int *deviceId); * * @returns #hipSuccess, #hipErrorNoDevice * - * + * * Returns in @p *count the number of devices that have ability to run compute commands. If there are no such devices, then @ref hipGetDeviceCount will return #hipErrorNoDevice. * If 1 or more devices can be found, then hipGetDeviceCount returns #hipSuccess. */ @@ -279,16 +267,16 @@ hipError_t hipGetDeviceCount(int *count); * @param [out] pi pointer to value to return * @param [in] attr attribute to query * @param [in] deviceId which device to query for information - * + * * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue */ -hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int deviceId); +hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device); /** * @brief Returns device properties. * * @param [out] prop written with device properties - * @param [in] deviceId which device to query for information + * @param [in] device which device to query for information * * @return #hipSuccess, #hipErrorInvalidDevice * @bug HCC always returns 0 for maxThreadsPerMultiProcessor @@ -297,7 +285,7 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int deviceI * * Populates hipGetDeviceProperties with information for the specified device. 
*/ -hipError_t hipGetDeviceProperties(hipDeviceProp_t* prop, int deviceId); +hipError_t hipGetDeviceProperties(hipDeviceProp_t* prop, int device); /** @@ -328,7 +316,7 @@ hipError_t hipDeviceGetCacheConfig ( hipFuncCache *cacheConfig ); * @brief Set Cache configuration for a specific function * * @param [in] config; - * + * * @returns #hipSuccess, #hipErrorInitializationError * Note: AMD devices and recent Nvidia GPUS do not support reconfigurable cache. This hint is ignored on those architectures. * @@ -435,14 +423,14 @@ const char *hipGetErrorName(hipError_t hip_error); /** * @brief Return handy text string message to explain the error which occurred * - * @param hipError Error code to convert to string. + * @param hip_error Error code to convert to string. * @return const char pointer to the NULL-terminated error string * * @warning : on HCC, this function returns the name of the error (same as hipGetErrorName) * * @see hipGetErrorName, hipGetLastError, hipPeakAtLastError, hipError_t */ -const char *hipGetErrorString(hipError_t hipError); +const char *hipGetErrorString(hipError_t hip_error); // end doxygen Error /** @@ -458,8 +446,11 @@ const char *hipGetErrorString(hipError_t hipError); * @{ * * The following Stream APIs are not (yet) supported in HIP: + * - cudaStreamAddCallback + * - cudaStreamAttachMemAsync * - cudaStreamCreateWithPriority * - cudaStreamGetPriority + * - cudaStreamWaitEvent */ /** @@ -471,9 +462,12 @@ const char *hipGetErrorString(hipError_t hipError); * * Create a new asynchronous stream. @p stream returns an opaque handle that can be used to reference the newly * created stream in subsequent hipStream* commands. The stream is allocated on the heap and will remain allocated - * * even if the handle goes out-of-scope. To release the memory used by the stream, applicaiton must call hipStreamDestroy. * Flags controls behavior of the stream. See #hipStreamDefault, #hipStreamNonBlocking. 
+ * @error hipStream_t are under development - with current HIP use the NULL stream. + * + * + * @see hipStreamCreate, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy */ hipError_t hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags); @@ -491,9 +485,7 @@ hipError_t hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags); * even if the handle goes out-of-scope. To release the memory used by the stream, applicaiton must call hipStreamDestroy. * * - * @see hipStreamDestroy - * - * @return + * @see hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy * */ hipError_t hipStreamCreate(hipStream_t *stream); @@ -803,7 +795,7 @@ hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags) __attribute * @param[out] dstPtr Device Pointer mapped to passed host pointer * @param[in] hstPtr Host Pointer allocated through hipHostAlloc * @param[in] flags Flags to be passed for extension - * + * * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryAllocation * * @see hipSetDeviceFlags, hipHostAlloc @@ -850,7 +842,7 @@ hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr) ; * from the other registered memory region. 
* * @return #hipSuccess, #hipErrorMemoryAllocation - * + * * @see hipHostUnregister, hipHostGetFlags, hipHostGetDevicePointer */ hipError_t hipHostRegister(void* hostPtr, size_t sizeBytes, unsigned int flags) ; @@ -876,7 +868,7 @@ hipError_t hipHostUnregister(void* hostPtr) ; * @param[in] width Requested pitched allocation width (in bytes) * @param[in] height Requested pitched allocation height * @return Error code - * + * * @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipMallocHost, hipFreeHost, hipMalloc3D, hipMalloc3DArray, hipHostAlloc */ @@ -1224,39 +1216,14 @@ hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int src * @} */ -/** - *------------------------------------------------------------------------------------------------- - *------------------------------------------------------------------------------------------------- - * @defgroup Driver Initialization and Version - * @{ - * - */ - -/** - * @brief Explicitly initializes the HIP runtime. - * - * Most HIP APIs implicitly initialize the HIP runtime. - * This API provides control over the timing of the initialization. - */ -// TODO-ctx - more description on error codes. 
-hipError_t hipInit(unsigned int flags) ; /** *------------------------------------------------------------------------------------------------- *------------------------------------------------------------------------------------------------- - * @defgroup Context Management + * @defgroup Version Management * @{ - */ - -/** - * @brief Create a context and set it as current/ default context - * - * @param [out] ctx - * @param [in] flags - * @param [in] associated device handle * - * @returns #hipSuccess, #hipErrorInvalidContext */ hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device); @@ -1454,7 +1421,6 @@ hipError_t hipDeviceGetPCIBusId (int *pciBusId,int len,hipDevice_t device); * @returns #hipSuccess, #hipErrorInavlidDevice */ hipError_t hipDeviceTotalMem (size_t *bytes,hipDevice_t device); - /** * @brief Returns the approximate HIP driver version. * From d69e56fdc992af863583f80b826a50b76eab0e37 Mon Sep 17 00:00:00 2001 From: pensun Date: Fri, 16 Sep 2016 16:04:48 -0500 Subject: [PATCH 02/66] Revert "resolve merge conflict" This reverts commit 575e1f502496fc0fdae1340c2809496b17765a97. --- include/hcc_detail/hip_runtime_api.h | 86 +++++++++++++++++++--------- 1 file changed, 60 insertions(+), 26 deletions(-) diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h index ace9e121db..3de715cbbc 100644 --- a/include/hcc_detail/hip_runtime_api.h +++ b/include/hcc_detail/hip_runtime_api.h @@ -43,10 +43,22 @@ THE SOFTWARE. 
extern "C" { #endif +//--- +//API-visible structures +typedef struct ihipCtx_t *hipCtx_t; + +// Note many APIs also use integer deviceIds as an alternative to the device pointer: +typedef struct ihipDevice_t *hipDevice_t; + typedef struct ihipStream_t *hipStream_t; -typedef struct hipEvent_t { - struct ihipEvent_t *_handle; -} hipEvent_t; + +typedef struct ihipModule_t *hipModule_t; + +typedef struct ihipFunction_t *hipFunction_t; + +typedef void* hipDeviceptr_t; + +typedef struct ihipEvent_t *hipEvent_t; /** @@ -206,7 +218,7 @@ hipError_t hipDeviceReset(void) ; /** * @brief Set default device to be used for subsequent hip API calls from this thread. * - * @param[in] device Valid device in range 0...hipGetDeviceCount(). + * @param[in] deviceId Valid device in range 0...hipGetDeviceCount(). * * Sets @p device as the default device for the calling host thread. Valid device id's are 0... (hipGetDeviceCount()-1). * @@ -229,7 +241,7 @@ hipError_t hipDeviceReset(void) ; * * @see hipGetDevice, hipGetDeviceCount */ -hipError_t hipSetDevice(int device); +hipError_t hipSetDevice(int deviceId); /** @@ -245,7 +257,7 @@ hipError_t hipSetDevice(int device); * * @see hipSetDevice, hipGetDevicesizeBytes */ -hipError_t hipGetDevice(int *device); +hipError_t hipGetDevice(int *deviceId); /** @@ -255,7 +267,7 @@ hipError_t hipGetDevice(int *device); * * @returns #hipSuccess, #hipErrorNoDevice * - * + * * Returns in @p *count the number of devices that have ability to run compute commands. If there are no such devices, then @ref hipGetDeviceCount will return #hipErrorNoDevice. * If 1 or more devices can be found, then hipGetDeviceCount returns #hipSuccess. 
*/ @@ -267,16 +279,16 @@ hipError_t hipGetDeviceCount(int *count); * @param [out] pi pointer to value to return * @param [in] attr attribute to query * @param [in] deviceId which device to query for information - * + * * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue */ -hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device); +hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int deviceId); /** * @brief Returns device properties. * * @param [out] prop written with device properties - * @param [in] device which device to query for information + * @param [in] deviceId which device to query for information * * @return #hipSuccess, #hipErrorInvalidDevice * @bug HCC always returns 0 for maxThreadsPerMultiProcessor @@ -285,7 +297,7 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) * * Populates hipGetDeviceProperties with information for the specified device. */ -hipError_t hipGetDeviceProperties(hipDeviceProp_t* prop, int device); +hipError_t hipGetDeviceProperties(hipDeviceProp_t* prop, int deviceId); /** @@ -316,7 +328,7 @@ hipError_t hipDeviceGetCacheConfig ( hipFuncCache *cacheConfig ); * @brief Set Cache configuration for a specific function * * @param [in] config; - * + * * @returns #hipSuccess, #hipErrorInitializationError * Note: AMD devices and recent Nvidia GPUS do not support reconfigurable cache. This hint is ignored on those architectures. * @@ -423,14 +435,14 @@ const char *hipGetErrorName(hipError_t hip_error); /** * @brief Return handy text string message to explain the error which occurred * - * @param hip_error Error code to convert to string. + * @param hipError Error code to convert to string. 
* @return const char pointer to the NULL-terminated error string * * @warning : on HCC, this function returns the name of the error (same as hipGetErrorName) * * @see hipGetErrorName, hipGetLastError, hipPeakAtLastError, hipError_t */ -const char *hipGetErrorString(hipError_t hip_error); +const char *hipGetErrorString(hipError_t hipError); // end doxygen Error /** @@ -446,11 +458,8 @@ const char *hipGetErrorString(hipError_t hip_error); * @{ * * The following Stream APIs are not (yet) supported in HIP: - * - cudaStreamAddCallback - * - cudaStreamAttachMemAsync * - cudaStreamCreateWithPriority * - cudaStreamGetPriority - * - cudaStreamWaitEvent */ /** @@ -462,12 +471,9 @@ const char *hipGetErrorString(hipError_t hip_error); * * Create a new asynchronous stream. @p stream returns an opaque handle that can be used to reference the newly * created stream in subsequent hipStream* commands. The stream is allocated on the heap and will remain allocated + * * even if the handle goes out-of-scope. To release the memory used by the stream, applicaiton must call hipStreamDestroy. * Flags controls behavior of the stream. See #hipStreamDefault, #hipStreamNonBlocking. - * @error hipStream_t are under development - with current HIP use the NULL stream. - * - * - * @see hipStreamCreate, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy */ hipError_t hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags); @@ -485,7 +491,9 @@ hipError_t hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags); * even if the handle goes out-of-scope. To release the memory used by the stream, applicaiton must call hipStreamDestroy. 
* * - * @see hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy + * @see hipStreamDestroy + * + * @return * */ hipError_t hipStreamCreate(hipStream_t *stream); @@ -795,7 +803,7 @@ hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags) __attribute * @param[out] dstPtr Device Pointer mapped to passed host pointer * @param[in] hstPtr Host Pointer allocated through hipHostAlloc * @param[in] flags Flags to be passed for extension - * + * * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryAllocation * * @see hipSetDeviceFlags, hipHostAlloc @@ -842,7 +850,7 @@ hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr) ; * from the other registered memory region. * * @return #hipSuccess, #hipErrorMemoryAllocation - * + * * @see hipHostUnregister, hipHostGetFlags, hipHostGetDevicePointer */ hipError_t hipHostRegister(void* hostPtr, size_t sizeBytes, unsigned int flags) ; @@ -868,7 +876,7 @@ hipError_t hipHostUnregister(void* hostPtr) ; * @param[in] width Requested pitched allocation width (in bytes) * @param[in] height Requested pitched allocation height * @return Error code - * + * * @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipMallocHost, hipFreeHost, hipMalloc3D, hipMalloc3DArray, hipHostAlloc */ @@ -1216,14 +1224,39 @@ hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int src * @} */ +/** + *------------------------------------------------------------------------------------------------- + *------------------------------------------------------------------------------------------------- + * @defgroup Driver Initialization and Version + * @{ + * + */ + +/** + * @brief Explicitly initializes the HIP runtime. + * + * Most HIP APIs implicitly initialize the HIP runtime. + * This API provides control over the timing of the initialization. + */ +// TODO-ctx - more description on error codes. 
+hipError_t hipInit(unsigned int flags) ; /** *------------------------------------------------------------------------------------------------- *------------------------------------------------------------------------------------------------- - * @defgroup Version Management + * @defgroup Context Management * @{ + */ + +/** + * @brief Create a context and set it as current/ default context + * + * @param [out] ctx + * @param [in] flags + * @param [in] associated device handle * + * @returns #hipSuccess, #hipErrorInvalidContext */ hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device); @@ -1421,6 +1454,7 @@ hipError_t hipDeviceGetPCIBusId (int *pciBusId,int len,hipDevice_t device); * @returns #hipSuccess, #hipErrorInavlidDevice */ hipError_t hipDeviceTotalMem (size_t *bytes,hipDevice_t device); + /** * @brief Returns the approximate HIP driver version. * From ffd49cfa37ba62d7aa6f3f8fee9a39424fc3acb9 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Sat, 17 Sep 2016 23:54:20 +0530 Subject: [PATCH 03/66] Added return error code hipErrorInvalidValue in hipMemGetInfo Change-Id: If01b012136b655ff8eb4878eb703dfe3e6a36530 --- include/hcc_detail/hip_runtime_api.h | 2 +- src/hip_memory.cpp | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h index 3de715cbbc..71732eb8e3 100644 --- a/include/hcc_detail/hip_runtime_api.h +++ b/include/hcc_detail/hip_runtime_api.h @@ -1112,7 +1112,7 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t st * Return snapshot of free memory, and total allocatable memory on the device. * * Returns in *free a snapshot of the current free memory. 
- * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue (if free != NULL due to bugs) + * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue * @warning On HCC, the free memory only accounts for memory allocated by this process and may be optimistic. **/ hipError_t hipMemGetInfo (size_t * free, size_t * total) ; diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 0478ff26d7..0cc53f7754 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -864,6 +864,9 @@ hipError_t hipMemGetInfo (size_t *free, size_t *total) if (total) { *total = device->_props.totalGlobalMem; } + else { + e = hipErrorInvalidValue; + } if (free) { // TODO - replace with kernel-level for reporting free memory: @@ -872,6 +875,9 @@ hipError_t hipMemGetInfo (size_t *free, size_t *total) *free = device->_props.totalGlobalMem - deviceMemSize; } + else { + e = hipErrorInvalidValue; + } } else { e = hipErrorInvalidDevice; From 1793a7160c67db4f7f5eeb6a0b6198b560d447e1 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 20 Sep 2016 12:53:55 +0530 Subject: [PATCH 04/66] FindHIP: Fix formatting issues Change-Id: I169266fe34fec1d0619b299733e9997eddc16d90 --- cmake/FindHIP.cmake | 128 ++++++++++++++++++++++---------------------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/cmake/FindHIP.cmake b/cmake/FindHIP.cmake index 9626e7629f..52e43f8813 100644 --- a/cmake/FindHIP.cmake +++ b/cmake/FindHIP.cmake @@ -235,9 +235,9 @@ endmacro() function(HIP_COMPUTE_BUILD_PATH path build_path) # Convert to cmake style paths file(TO_CMAKE_PATH "${path}" bpath) - if (IS_ABSOLUTE "${bpath}") + if(IS_ABSOLUTE "${bpath}") string(FIND "${bpath}" "${CMAKE_CURRENT_BINARY_DIR}" _binary_dir_pos) - if (_binary_dir_pos EQUAL 0) + if(_binary_dir_pos EQUAL 0) file(RELATIVE_PATH bpath "${CMAKE_CURRENT_BINARY_DIR}" "${bpath}") else() file(RELATIVE_PATH bpath "${CMAKE_CURRENT_SOURCE_DIR}" "${bpath}") @@ -391,72 +391,72 @@ macro(HIP_PREPARE_TARGET_COMMANDS _target _format 
_generated_files) set(host_flag TRUE) endif() - if (NOT host_flag) - # Determine output directory - HIP_COMPUTE_BUILD_PATH("${file}" hip_build_path) - set(hip_compile_output_dir "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${_target}.dir/${hip_build_path}") - - get_filename_component(basename ${file} NAME) - set(generated_file_path "${hip_compile_output_dir}/${CMAKE_CFG_INTDIR}") - set(generated_file_basename "${_target}_generated_${basename}${generated_extension}") - - # Set file names - set(generated_file "${generated_file_path}/${generated_file_basename}") - set(cmake_dependency_file "${hip_compile_output_dir}/${generated_file_basename}.depend") - set(custom_target_script_pregen "${hip_compile_output_dir}/${generated_file_basename}.cmake.pre-gen") - set(custom_target_script "${hip_compile_output_dir}/${generated_file_basename}.cmake") - - # Set properties for object files - set_source_files_properties("${generated_file}" - PROPERTIES - EXTERNAL_OBJECT true # This is an object file not to be compiled, but only be linked - ) + if(NOT host_flag) + # Determine output directory + HIP_COMPUTE_BUILD_PATH("${file}" hip_build_path) + set(hip_compile_output_dir "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${_target}.dir/${hip_build_path}") + + get_filename_component(basename ${file} NAME) + set(generated_file_path "${hip_compile_output_dir}/${CMAKE_CFG_INTDIR}") + set(generated_file_basename "${_target}_generated_${basename}${generated_extension}") + + # Set file names + set(generated_file "${generated_file_path}/${generated_file_basename}") + set(cmake_dependency_file "${hip_compile_output_dir}/${generated_file_basename}.depend") + set(custom_target_script_pregen "${hip_compile_output_dir}/${generated_file_basename}.cmake.pre-gen") + set(custom_target_script "${hip_compile_output_dir}/${generated_file_basename}.cmake") + + # Set properties for object files + set_source_files_properties("${generated_file}" + PROPERTIES + EXTERNAL_OBJECT true # This is an object file not to be 
compiled, but only be linked + ) - # Don't add CMAKE_CURRENT_SOURCE_DIR if the path is already an absolute path - get_filename_component(file_path "${file}" PATH) - if(IS_ABSOLUTE "${file_path}") - set(source_file "${file}") - else() - set(source_file "${CMAKE_CURRENT_SOURCE_DIR}/${file}") - endif() + # Don't add CMAKE_CURRENT_SOURCE_DIR if the path is already an absolute path + get_filename_component(file_path "${file}" PATH) + if(IS_ABSOLUTE "${file_path}") + set(source_file "${file}") + else() + set(source_file "${CMAKE_CURRENT_SOURCE_DIR}/${file}") + endif() - # Bring in the dependencies - HIP_INCLUDE_HIPCC_DEPENDENCIES(${cmake_dependency_file}) + # Bring in the dependencies + HIP_INCLUDE_HIPCC_DEPENDENCIES(${cmake_dependency_file}) - # Configure the build script - configure_file("${HIP_run_hipcc}" "${custom_target_script_pregen}" @ONLY) - file(GENERATE - OUTPUT "${custom_target_script}" - INPUT "${custom_target_script_pregen}" - ) - set(main_dep DEPENDS ${source_file}) - set(verbose_output "$(VERBOSE)") - - # Create up the comment string - file(RELATIVE_PATH generated_file_relative_path "${CMAKE_BINARY_DIR}" "${generated_file}") - set(hip_build_comment_string "Building HIPCC object ${generated_file_relative_path}") - - # Build the generated file and dependency file - add_custom_command( - OUTPUT ${generated_file} - # These output files depend on the source_file and the contents of cmake_dependency_file - ${main_dep} - DEPENDS ${HIP_HIPCC_DEPEND} - DEPENDS ${custom_target_script} - # Make sure the output directory exists before trying to write to it. 
- COMMAND ${CMAKE_COMMAND} -E make_directory "${generated_file_path}" - COMMAND ${CMAKE_COMMAND} ARGS - -D verbose:BOOL=${verbose_output} - -D build_configuration:STRING=${_hip_build_configuration} - -D "generated_file:STRING=${generated_file}" - -P "${custom_target_script}" - WORKING_DIRECTORY "${hip_compile_output_dir}" - COMMENT "${hip_build_comment_string}" - ) + # Configure the build script + configure_file("${HIP_run_hipcc}" "${custom_target_script_pregen}" @ONLY) + file(GENERATE + OUTPUT "${custom_target_script}" + INPUT "${custom_target_script_pregen}" + ) + set(main_dep DEPENDS ${source_file}) + set(verbose_output "$(VERBOSE)") + + # Create up the comment string + file(RELATIVE_PATH generated_file_relative_path "${CMAKE_BINARY_DIR}" "${generated_file}") + set(hip_build_comment_string "Building HIPCC object ${generated_file_relative_path}") + + # Build the generated file and dependency file + add_custom_command( + OUTPUT ${generated_file} + # These output files depend on the source_file and the contents of cmake_dependency_file + ${main_dep} + DEPENDS ${HIP_HIPCC_DEPEND} + DEPENDS ${custom_target_script} + # Make sure the output directory exists before trying to write to it. 
+ COMMAND ${CMAKE_COMMAND} -E make_directory "${generated_file_path}" + COMMAND ${CMAKE_COMMAND} ARGS + -D verbose:BOOL=${verbose_output} + -D build_configuration:STRING=${_hip_build_configuration} + -D "generated_file:STRING=${generated_file}" + -P "${custom_target_script}" + WORKING_DIRECTORY "${hip_compile_output_dir}" + COMMENT "${hip_build_comment_string}" + ) - # Make sure the build system knows the file is generated - set_source_files_properties(${generated_file} PROPERTIES GENERATED TRUE) - list(APPEND _hip_generated_files ${generated_file}) + # Make sure the build system knows the file is generated + set_source_files_properties(${generated_file} PROPERTIES GENERATED TRUE) + list(APPEND _hip_generated_files ${generated_file}) endif() endforeach() From 80f75f4677ef5d54af0eb3f23db8ee127711e648 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 20 Sep 2016 14:03:51 +0530 Subject: [PATCH 05/66] FindHIP: Fix bug in handling HIP_SOURCE_PROPERTY_FORMAT Change-Id: I6f19c487b222039e2653a8de1930c02da64318ed --- cmake/FindHIP.cmake | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cmake/FindHIP.cmake b/cmake/FindHIP.cmake index 52e43f8813..64838492e5 100644 --- a/cmake/FindHIP.cmake +++ b/cmake/FindHIP.cmake @@ -316,7 +316,7 @@ endmacro() ############################################################################### # MACRO: Prepare cmake commands for the target ############################################################################### -macro(HIP_PREPARE_TARGET_COMMANDS _target _format _generated_files) +macro(HIP_PREPARE_TARGET_COMMANDS _target _format _generated_files _source_files) set(_hip_flags "") set(_hip_build_configuration "${CMAKE_BUILD_TYPE}") if(HIP_HOST_COMPILATION_CPP) @@ -378,6 +378,7 @@ macro(HIP_PREPARE_TARGET_COMMANDS _target _format _generated_files) # Reset the output variable set(_hip_generated_files "") + set(_hip_source_files "") # Iterate over all arguments and create custom commands for all source files 
foreach(file ${ARGN}) @@ -457,11 +458,13 @@ macro(HIP_PREPARE_TARGET_COMMANDS _target _format _generated_files) # Make sure the build system knows the file is generated set_source_files_properties(${generated_file} PROPERTIES GENERATED TRUE) list(APPEND _hip_generated_files ${generated_file}) + list(APPEND _hip_source_files ${file}) endif() endforeach() # Set the return parameter set(${_generated_files} ${_hip_generated_files}) + set(${_source_files} ${_hip_source_files}) endmacro() ############################################################################### @@ -470,7 +473,8 @@ endmacro() macro(HIP_ADD_EXECUTABLE hip_target) # Separate the sources from the options HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN}) - HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) + HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) + list(REMOVE_ITEM _sources ${_source_files}) set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_EXECUTABLE} -o ") add_executable(${hip_target} ${_cmake_options} ${_generated_files} ${_sources}) set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE HIP) @@ -482,7 +486,8 @@ endmacro() macro(HIP_ADD_LIBRARY hip_target) # Separate the sources from the options HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN}) - HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files ${_sources} ${_cmake_options} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) + HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} ${_cmake_options} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) + list(REMOVE_ITEM 
_sources ${_source_files}) add_library(${hip_target} ${_cmake_options} ${_generated_files} ${_sources}) set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE ${HIP_C_OR_CXX}) endmacro() From 067ea6cc4cf4f899a307a80b78c1b71f4ea4e9cf Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 20 Sep 2016 20:46:49 +0530 Subject: [PATCH 06/66] FindHIP: Fix bug in parsing source files and options Change-Id: Ib1c793dd19d61b387da3f5894c4e37c66f02b753 --- cmake/FindHIP.cmake | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cmake/FindHIP.cmake b/cmake/FindHIP.cmake index 64838492e5..5e17af72a5 100644 --- a/cmake/FindHIP.cmake +++ b/cmake/FindHIP.cmake @@ -193,9 +193,15 @@ macro(HIP_GET_SOURCES_AND_OPTIONS _sources _cmake_options _hipcc_options _hcc_op foreach(arg ${ARGN}) if("x${arg}" STREQUAL "xHIPCC_OPTIONS") set(_hipcc_found_options TRUE) + set(_hcc_found_options FALSE) + set(_nvcc_found_options FALSE) elseif("x${arg}" STREQUAL "xHCC_OPTIONS") + set(_hipcc_found_options FALSE) set(_hcc_found_options TRUE) + set(_nvcc_found_options FALSE) elseif("x${arg}" STREQUAL "xNVCC_OPTIONS") + set(_hipcc_found_options FALSE) + set(_hcc_found_options FALSE) set(_nvcc_found_options TRUE) elseif( "x${arg}" STREQUAL "xEXCLUDE_FROM_ALL" OR From 751a2cfce514b8625e6da484de8a88d996b16d4e Mon Sep 17 00:00:00 2001 From: pensun Date: Wed, 21 Sep 2016 12:03:01 -0500 Subject: [PATCH 07/66] doc fix for hipStream Change-Id: I618464c8b9ffa0a566434e72d2bc87a152884ca1 --- include/hcc_detail/hip_runtime_api.h | 103 +++++++++++++++------------ 1 file changed, 57 insertions(+), 46 deletions(-) diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h index 71732eb8e3..8a2802e600 100644 --- a/include/hcc_detail/hip_runtime_api.h +++ b/include/hcc_detail/hip_runtime_api.h @@ -267,7 +267,7 @@ hipError_t hipGetDevice(int *deviceId); * * @returns #hipSuccess, #hipErrorNoDevice * - * + * * Returns in @p *count the number of devices that have ability to run compute 
commands. If there are no such devices, then @ref hipGetDeviceCount will return #hipErrorNoDevice. * If 1 or more devices can be found, then hipGetDeviceCount returns #hipSuccess. */ @@ -279,7 +279,7 @@ hipError_t hipGetDeviceCount(int *count); * @param [out] pi pointer to value to return * @param [in] attr attribute to query * @param [in] deviceId which device to query for information - * + * * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue */ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int deviceId); @@ -328,7 +328,7 @@ hipError_t hipDeviceGetCacheConfig ( hipFuncCache *cacheConfig ); * @brief Set Cache configuration for a specific function * * @param [in] config; - * + * * @returns #hipSuccess, #hipErrorInitializationError * Note: AMD devices and recent Nvidia GPUS do not support reconfigurable cache. This hint is ignored on those architectures. * @@ -458,63 +458,66 @@ const char *hipGetErrorString(hipError_t hipError); * @{ * * The following Stream APIs are not (yet) supported in HIP: + * - cudaStreamAddCallback + * - cudaStreamAttachMemAsync * - cudaStreamCreateWithPriority * - cudaStreamGetPriority + * - cudaStreamWaitEvent */ + /** * @brief Create an asynchronous stream. * - * @param[in, out] stream Pointer to new stream - * @param[in ] flags to control stream creation. + * @param[in, out] stream Valid pointer to hipStream_t. This function writes the memory with the newly created stream. * @return #hipSuccess, #hipErrorInvalidValue * * Create a new asynchronous stream. @p stream returns an opaque handle that can be used to reference the newly * created stream in subsequent hipStream* commands. The stream is allocated on the heap and will remain allocated - * * even if the handle goes out-of-scope. To release the memory used by the stream, applicaiton must call hipStreamDestroy. - * Flags controls behavior of the stream. See #hipStreamDefault, #hipStreamNonBlocking. 
+ * + * @return #hipSuccess, #hipErrorInvalidValue + * + * @see hipStreamCreateWithFlags, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy */ - -hipError_t hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags); - +hipError_t hipStreamCreate(hipStream_t *stream); /** * @brief Create an asynchronous stream. * - * @param[in, out] stream Valid pointer to hipStream_t. This function writes the memory with the newly created stream. + * @param[in, out] stream Pointer to new stream + * @param[in ] flags to control stream creation. * @return #hipSuccess, #hipErrorInvalidValue * * Create a new asynchronous stream. @p stream returns an opaque handle that can be used to reference the newly * created stream in subsequent hipStream* commands. The stream is allocated on the heap and will remain allocated * even if the handle goes out-of-scope. To release the memory used by the stream, applicaiton must call hipStreamDestroy. + * Flags controls behavior of the stream. See #hipStreamDefault, #hipStreamNonBlocking. * * - * @see hipStreamDestroy - * - * @return - * + * @see hipStreamCreate, hipStreamSynchronize, hipStreamWaitEvent, hipStreamDestroy */ -hipError_t hipStreamCreate(hipStream_t *stream); + +hipError_t hipStreamCreateWithFlags(hipStream_t *stream, unsigned int flags); /** - * @brief Make the specified compute stream wait for an event + * @brief Destroys the specified stream. * - * @param[in] stream stream to make wait. - * @param[in] event event to wait on - * @param[in] flags control operation [must be 0] + * @param[in, out] stream Valid pointer to hipStream_t. This function writes the memory with the newly created stream. + * @return #hipSuccess #hipErrorInvalidResourceHandle * - * @return #hipSuccess, #hipErrorInvalidResourceHandle + * Destroys the specified stream. * - * This function inserts a wait operation into the specified stream. - * All future work submitted to @p stream will wait until @p event reports completion before beginning execution. 
- * This function is host-asynchronous and the function may return before the wait has completed. + * If commands are still executing on the specified stream, some may complete execution before the queue is deleted. * + * The queue may be destroyed while some commands are still inflight, or may wait for all commands queued to the stream + * before destroying it. * + * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamQuery, hipStreamWaitEvent, hipStreamSynchronize */ -hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags); +hipError_t hipStreamDestroy(hipStream_t stream); /** @@ -522,57 +525,65 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int * * @param[in] stream stream to query * - * @return #hipSuccess, #hipErrorNotReady + * @return #hipSuccess, #hipErrorNotReady, #hipErrorInvalidResourceHandle * * This is thread-safe and returns a snapshot of the current state of the queue. However, if other host threads are sending work to the stream, * the status may change immediately after the function is called. It is typically used for debug. + * + * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamWaitEvent, hipStreamSynchronize, hipStreamDestroy */ hipError_t hipStreamQuery(hipStream_t stream); - /** * @brief Wait for all commands in stream to complete. * - * If the null stream is specified, this command blocks until all + * @param[in] stream stream identifier. + * + * @return #hipSuccess, #hipErrorInvalidResourceHandle * + * If the null stream is specified, this command blocks until all * This command honors the hipDeviceLaunchBlocking flag, which controls whether the wait is active or blocking. - * * This command is host-synchronous : the host will block until the stream is empty. * - * TODO + * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamWaitEvent, hipStreamDestroy + * */ hipError_t hipStreamSynchronize(hipStream_t stream); /** - * @brief Destroys the specified stream. 
+ * @brief Make the specified compute stream wait for an event * - * @param[in, out] stream Valid pointer to hipStream_t. This function writes the memory with the newly created stream. - * @return #hipSuccess + * @param[in] stream stream to make wait. + * @param[in] event event to wait on + * @param[in] flags control operation [must be 0] * - * Destroys the specified stream. + * @return #hipSuccess, #hipErrorInvalidResourceHandle * - * If commands are still executing on the specified stream, some may complete execution before the queue is deleted. + * This function inserts a wait operation into the specified stream. + * All future work submitted to @p stream will wait until @p event reports completion before beginning execution. + * This function is host-asynchronous and the function may return before the wait has completed. + * + * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamSynchronize, hipStreamDestroy * - * The queue may be destroyed while some commands are still inflight, or may wait for all commands queued to the stream - * before destroying it. */ -hipError_t hipStreamDestroy(hipStream_t stream); +hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags); + /** * @brief Return flags associated with this stream. * - * @param[in] stream - * @param[in,out] flags + * @param[in] stream stream to be queried + * @param[in,out] flags Pointer to an unsigned integer in which the stream's flags are returned * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidResourceHandle * - * Return flags associated with this stream in *@p flags. + * @returns #hipSuccess #hipErrorInvalidValue #hipErrorInvalidResourceHandle * - * @see hipStreamCreateWithFlags + * Return flags associated with this stream in *@p flags. 
* - * @returns #hipSuccess + * @see hipStreamCreateWithFlags */ hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int *flags); @@ -803,7 +814,7 @@ hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags) __attribute * @param[out] dstPtr Device Pointer mapped to passed host pointer * @param[in] hstPtr Host Pointer allocated through hipHostAlloc * @param[in] flags Flags to be passed for extension - * + * * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryAllocation * * @see hipSetDeviceFlags, hipHostAlloc @@ -850,7 +861,7 @@ hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr) ; * from the other registered memory region. * * @return #hipSuccess, #hipErrorMemoryAllocation - * + * * @see hipHostUnregister, hipHostGetFlags, hipHostGetDevicePointer */ hipError_t hipHostRegister(void* hostPtr, size_t sizeBytes, unsigned int flags) ; @@ -876,7 +887,7 @@ hipError_t hipHostUnregister(void* hostPtr) ; * @param[in] width Requested pitched allocation width (in bytes) * @param[in] height Requested pitched allocation height * @return Error code - * + * * @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipMallocHost, hipFreeHost, hipMalloc3D, hipMalloc3DArray, hipHostAlloc */ From a5e951e3d9ef3559d7057afd3752780a21de74ef Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 22 Sep 2016 12:27:07 +0530 Subject: [PATCH 08/66] Expose HIP_VERSION_* to HIP source and HIP applications Change-Id: I6fb0d6711642c4fad2cd89b98707ee4b548b7243 --- CMakeLists.txt | 4 ++++ bin/hipcc | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1aa5d3f342..0092746e15 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -162,6 +162,10 @@ if(HIP_PLATFORM STREQUAL "hcc") set(CMAKE_CXX_COMPILER "${HCC_HOME}/bin/hcc") set(CMAKE_C_COMPILER "${HCC_HOME}/bin/hcc") + # Add HIP_VERSION to CMAKE__FLAGS + set(CMAKE_CXX_FLAGS " -DHIP_VERSION_MAJOR=${HIP_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_VERSION_MINOR} 
-DHIP_VERSION_PATCH=${HIP_VERSION_MINOR} ${CMAKE_CXX_FLAGS}") + set(CMAKE_C_FLAGS " -DHIP_VERSION_MAJOR=${HIP_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_VERSION_MINOR} -DHIP_VERSION_PATCH=${HIP_VERSION_MINOR} ${CMAKE_C_FLAGS}") + # Set HIP_HCC so we know this is HIP compile, some files are shared with HCC (staging_buffer). set(CMAKE_CXX_FLAGS " -fPIC -hc -I${HCC_HOME}/include -I${HSA_PATH}/include -I/opt/rocm/libhsakmt/include/libhsakmt -stdlib=libc++ -DHIP_HCC ${CMAKE_CXX_FLAGS}") set(CMAKE_C_FLAGS " -fPIC -hc -I${HCC_HOME}/include -I${HSA_PATH}/include -I/opt/rocm/libhsakmt/include/libhsakmt -stdlib=libc++ -DHIP_HCC ${CMAKE_C_FLAGS}") diff --git a/bin/hipcc b/bin/hipcc index b1f2bab552..5a75d2c7c7 100755 --- a/bin/hipcc +++ b/bin/hipcc @@ -60,6 +60,7 @@ parse_config_file("$HIP_PATH/lib/.buildInfo", \%hipConfig); #HIP_PLATFORM controls whether to use NVCC or HCC for compilation: $HIP_PLATFORM= `$HIP_PATH/bin/hipconfig --platform` // "hcc"; $HIP_VERSION= `$HIP_PATH/bin/hipconfig --version`; +($HIP_VERSION_MAJOR, $HIP_VERSION_MINOR, $HIP_VERSION_PATCH) = split(/\./, $HIP_VERSION); if ($verbose & 0x2) { print ("HIP_PATH=$HIP_PATH\n"); @@ -158,7 +159,7 @@ if ($HIP_PLATFORM eq "hcc") { } # Add paths to common HIP includes: -$HIPCXXFLAGS .= " -I$HIP_PATH/include" ; +$HIPCXXFLAGS .= " -I$HIP_PATH/include -DHIP_VERSION_MAJOR=$HIP_VERSION_MAJOR -DHIP_VERSION_MINOR=$HIP_VERSION_MINOR -DHIP_VERSION_PATCH=$HIP_VERSION_PATCH" ; my $compileOnly = 0; my $needCXXFLAGS = 0; # need to add CXX flags to compile step From bfc033cc3e3c30d07ee651a7432419e351e807ff Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Thu, 22 Sep 2016 15:21:23 +0530 Subject: [PATCH 09/66] Added hipRuntimeGetVersion function Change-Id: I59ec2beacb5a94439deed0dcc8eb37d6de1cc900 --- include/hcc_detail/hip_runtime_api.h | 19 +++++++++++++++++++ include/nvcc_detail/hip_runtime_api.h | 5 +++++ src/hip_context.cpp | 23 ++++++++++++++++++++--- 3 files changed, 44 insertions(+), 3 deletions(-) diff --git 
a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h index 8a2802e600..26c4f8ba32 100644 --- a/include/hcc_detail/hip_runtime_api.h +++ b/include/hcc_detail/hip_runtime_api.h @@ -1468,6 +1468,10 @@ hipError_t hipDeviceTotalMem (size_t *bytes,hipDevice_t device); /** * @brief Returns the approximate HIP driver version. + * + * @param [out] driverVersion + * + * @returns #hipSuccess, #hipErrorInvalidValue * * @warning The HIP feature set does not correspond to an exact CUDA SDK driver revision. * This function always set *driverVersion to 4 as an approximation though HIP supports @@ -1475,9 +1479,24 @@ hipError_t hipDeviceTotalMem (size_t *bytes,hipDevice_t device); * HIP apps code should not rely on the driver revision number here and should * use arch feature flags to test device capabilities or conditional compilation. * + * @see hipRuntimeGetVersion */ hipError_t hipDriverGetVersion(int *driverVersion) ; +/** + * @brief Returns the approximate HIP Runtime version. + * + * @param [out] runtimeVersion + * + * @returns #hipSuccess, #hipErrorInvalidValue + * + * @warning On HIP/HCC path this function returns HIP runtime patch version however on + * HIP/NVCC path this function returns CUDA runtime version. 
+ * + * @see hipDriverGetVersion + */ +hipError_t hipRuntimeGetVersion(int *runtimeVersion) ; + /** * @brief Loads code object from file into a hipModule_t * diff --git a/include/nvcc_detail/hip_runtime_api.h b/include/nvcc_detail/hip_runtime_api.h index c90c7cbf29..518c96ecd8 100644 --- a/include/nvcc_detail/hip_runtime_api.h +++ b/include/nvcc_detail/hip_runtime_api.h @@ -546,6 +546,11 @@ inline static hipError_t hipDriverGetVersion(int *driverVersion) return hipCUDAErrorTohipError(err); } +inline static hipError_t hipRuntimeGetVersion(int *runtimeVersion) +{ + return hipCUDAErrorTohipError(cudaRuntimeGetVersion(runtimeVersion)); +} + inline static hipError_t hipDeviceCanAccessPeer ( int* canAccessPeer, int device, int peerDevice ) { return hipCUDAErrorTohipError(cudaDeviceCanAccessPeer(canAccessPeer, device, peerDevice)); diff --git a/src/hip_context.cpp b/src/hip_context.cpp index dc0a30bd86..fa19f3a8a3 100644 --- a/src/hip_context.cpp +++ b/src/hip_context.cpp @@ -76,15 +76,32 @@ hipError_t hipDeviceGet(hipDevice_t *device, int deviceId) return ihipLogStatus(e); }; -hipError_t hipDriverGetVersion(int *driverVersion) +pError_t hipDriverGetVersion(int *driverVersion) { HIP_INIT_API(driverVersion); - + hipError_t e = hipSuccess; if (driverVersion) { *driverVersion = 4; } + else { + e = hipErrorInvalidValue; + } - return ihipLogStatus(hipSuccess); + return ihipLogStatus(e); +} + +hipError_t hipRuntimeGetVersion(int *runtimeVersion) +{ + HIP_INIT_API(runtimeVersion); + hipError_t e = hipSuccess; + if (runtimeVersion) { + *runtimeVersion = HIP_VERSION_PATCH; + } + else { + e = hipErrorInvalidValue; + } + + return ihipLogStatus(e); } hipError_t hipCtxDestroy(hipCtx_t ctx) From de402d7372b65f4f54ca271f4620a2121dbbbe3d Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 22 Sep 2016 16:32:05 +0530 Subject: [PATCH 10/66] Fix bug in defining HIP_VERSION_* Change-Id: I52fcd78386ee4b5035aba4c9bced3372b7894650 --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 
2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0092746e15..31c80af74d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -163,8 +163,8 @@ if(HIP_PLATFORM STREQUAL "hcc") set(CMAKE_C_COMPILER "${HCC_HOME}/bin/hcc") # Add HIP_VERSION to CMAKE__FLAGS - set(CMAKE_CXX_FLAGS " -DHIP_VERSION_MAJOR=${HIP_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_VERSION_MINOR} -DHIP_VERSION_PATCH=${HIP_VERSION_MINOR} ${CMAKE_CXX_FLAGS}") - set(CMAKE_C_FLAGS " -DHIP_VERSION_MAJOR=${HIP_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_VERSION_MINOR} -DHIP_VERSION_PATCH=${HIP_VERSION_MINOR} ${CMAKE_C_FLAGS}") + set(CMAKE_CXX_FLAGS " -DHIP_VERSION_MAJOR=${HIP_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_VERSION_MINOR} -DHIP_VERSION_PATCH=${HIP_VERSION_PATCH} ${CMAKE_CXX_FLAGS}") + set(CMAKE_C_FLAGS " -DHIP_VERSION_MAJOR=${HIP_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_VERSION_MINOR} -DHIP_VERSION_PATCH=${HIP_VERSION_PATCH} ${CMAKE_C_FLAGS}") # Set HIP_HCC so we know this is HIP compile, some files are shared with HCC (staging_buffer). 
set(CMAKE_CXX_FLAGS " -fPIC -hc -I${HCC_HOME}/include -I${HSA_PATH}/include -I/opt/rocm/libhsakmt/include/libhsakmt -stdlib=libc++ -DHIP_HCC ${CMAKE_CXX_FLAGS}") From fb03e1c943bf2fb205b9d2dd7bcb51e26b040591 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 22 Sep 2016 09:30:05 -0500 Subject: [PATCH 11/66] Fixed typo in function return Change-Id: I8905bfdbc162815ac10c926e77a9bab432932c9c --- src/hip_context.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hip_context.cpp b/src/hip_context.cpp index fa19f3a8a3..d0ee129358 100644 --- a/src/hip_context.cpp +++ b/src/hip_context.cpp @@ -76,7 +76,7 @@ hipError_t hipDeviceGet(hipDevice_t *device, int deviceId) return ihipLogStatus(e); }; -pError_t hipDriverGetVersion(int *driverVersion) +hipError_t hipDriverGetVersion(int *driverVersion) { HIP_INIT_API(driverVersion); hipError_t e = hipSuccess; From 50b78439afa7860a260d5d27e672dce984de6427 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 22 Sep 2016 12:55:41 -0500 Subject: [PATCH 12/66] added ipc runtime api for nvcc backend Change-Id: I3297c4c9db34e7bc3267fec64a6757f9ebf91905 --- include/nvcc_detail/hip_runtime_api.h | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/include/nvcc_detail/hip_runtime_api.h b/include/nvcc_detail/hip_runtime_api.h index 518c96ecd8..9fea77a904 100644 --- a/include/nvcc_detail/hip_runtime_api.h +++ b/include/nvcc_detail/hip_runtime_api.h @@ -64,6 +64,8 @@ hipMemcpyHostToHost typedef cudaEvent_t hipEvent_t; typedef cudaStream_t hipStream_t; +typedef cudaIpcEventHandle_t hipIpcEventHandle_t; +typedef cudaIpcMemHandle_t hipIpcMemHandle_t; typedef CUcontext hipCtx_t; typedef CUsharedconfig hipSharedMemConfig; typedef CUfunc_cache hipFuncCache; @@ -306,6 +308,26 @@ inline static hipError_t hipGetDevice(int * device){ return hipCUDAErrorTohipError(cudaGetDevice(device)); } +inline static hipError_t hipIpcCloseMemHandle(void *devPtr){ + return 
hipCUDAErrorTohipError(cudaIpcCloseMemHandle(devPtr)); +} + +inline static hipError_t hipIpcGetEventHandle(hipIpcEventHandle_t* handle, hipEvent_t event){ + return hipCUDAErrorTohipError(cudaIpcGetEventHandle(handle, event)); +} + +inline static hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr){ + return hipCUDAErrorTohipError(cudaIpcGetMemHandle(handle, devPtr)); +} + +inline static hipError_t hipIpcOpenEventHandle(hipEvent_t* event, hipIpcEventHandle_t handle){ + return hipCUDAErrorTohipError(cudaIpcOpenEventHandle(event, handle)); +} + +inline static hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned int flags){ + return hipCUDAErrorTohipError(cudaIpcOpenMemHandle(devPtr, handle, flags)); +} + inline static hipError_t hipMemset(void* devPtr,int value, size_t count) { return hipCUDAErrorTohipError(cudaMemset(devPtr, value, count)); } From 83140f8423c51f0e20082d0eb065106e87a27112 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 17 Sep 2016 08:40:47 -0500 Subject: [PATCH 13/66] Updates docs for hipHcc* functions, move to header --- include/hcc_detail/hcc_acc.h | 4 ++++ src/hip_hcc.cpp | 6 ------ src/hip_memory.cpp | 1 + 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/include/hcc_detail/hcc_acc.h b/include/hcc_detail/hcc_acc.h index d0d605d1c9..371a0a23a4 100644 --- a/include/hcc_detail/hcc_acc.h +++ b/include/hcc_detail/hcc_acc.h @@ -7,11 +7,15 @@ #include /** * @brief Return hc::accelerator associated with the specified deviceId + * @return #hipSuccess, #hipErrorInvalidDevice */ hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator *acc); /** * @brief Return hc::accelerator_view associated with the specified stream + * + * If stream is 0, the accelerator_view for the default stream is returned. 
+ * @return #hipSuccess */ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **av); #endif diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 0f0b03d115..f14b4612b8 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -1973,9 +1973,6 @@ void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsig //------------------------------------------------------------------------------------------------- // HCC-specific accessor functions: -/** - * @return #hipSuccess, #hipErrorInvalidDevice - */ //--- hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator *acc) { @@ -1993,9 +1990,6 @@ hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator *acc) } -/** - * @return #hipSuccess - */ //--- hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **av) { diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 0cc53f7754..668852bbfe 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -613,6 +613,7 @@ hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t sizeBytes, h return ihipLogStatus(e); } +// TODO - review and optimize hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind) { From d317d8b755fe0ff3c3057b9ce54185e1b5a54a44 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 17 Sep 2016 08:41:25 -0500 Subject: [PATCH 14/66] Doc update: release reminder, compare hip against other porting tools. --- RELEASE.md | 1 + docs/markdown/hip_faq.md | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/RELEASE.md b/RELEASE.md index da689f42ea..d1f79bc3c6 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -7,6 +7,7 @@ Upcoming: number of commands (>1K) without synchronizing. - Register keyword now silently ignored on HCC (previously would emit warning). - Doc updates: Add some more frequently asked questions to FAQ, fix TOC in some files, review. +- Cookbook. 
=================================================================================================== diff --git a/docs/markdown/hip_faq.md b/docs/markdown/hip_faq.md index e21fc984e2..13dd20d874 100644 --- a/docs/markdown/hip_faq.md +++ b/docs/markdown/hip_faq.md @@ -102,7 +102,7 @@ However, we can provide a rough summary of the features included in each CUDA SD - CUDA 7.5 - float16 (under development) - CUDA 8.0 - - No new language features. + - TBD. ### What libraries does HIP support? HIP includes growing support for the 4 key math libraries using hcBlas, hcFft, hcrng, and hcsparse). @@ -127,6 +127,19 @@ HIP offers several benefits over OpenCL: - HIP provides device-level control over memory allocation and placement. - HIP offers an offline compilation model. +### How does porting CUDA to HIP compare to porting CUDA to OpenCL? +Both HIP and CUDA are dialects of C++, and thus porting between them is relatively straightforward. +Both dialects support templates, classes, lambdas, and other C++ constructs. +As one example, the hipify tool was originally a perl script that used simple text conversions from CUDA to HIP. +HIP and CUDA provide similar math library calls as well. In summary, the HIP philosophy was to make the HIP language close enough to CUDA that the porting effort is relatively simple. +This reduces the potential for error, and also makes it easy to automate the translation. HIP's goal is to quickly get the ported program running on both platforms with little manual intervention, +so that the programmer can focus on performance optimizations. + +There have been several tools that have attempted to convert CUDA into OpenCL, such as CU2CL. OpenCL is a C99-based kernel language (rather than C++) and also does not support single-source compilation. +As a result, the OpenCL syntax is quite different than CUDA, and the porting tools have to perform some heroic transformations to bridge this gap. 
+The tools also struggle with more complex CUDA applications, in particular those that use templates, classes, or other C++ features inside the kernel. + + ### What hardware does HIP support? - For AMD platforms, HIP runs on the same hardware that the HCC "hc" mode supports. See the ROCM documentation for the list of supported platforms. - For Nvidia platforms, HIP requires Unified Memory and should run on a device which runs the CUDA SDK 6.0 or newer. We have tested the Nvidia Titan and K40. From fb55db8711201395739ec70fa9d7b4ca17deeb93 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 22 Sep 2016 12:24:34 -0500 Subject: [PATCH 15/66] Doc update for faq. --- docs/markdown/hip_faq.md | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/docs/markdown/hip_faq.md b/docs/markdown/hip_faq.md index 13dd20d874..31a032469b 100644 --- a/docs/markdown/hip_faq.md +++ b/docs/markdown/hip_faq.md @@ -10,6 +10,7 @@ - [What specific version of CUDA does HIP support?](#what-specific-version-of-cuda-does-hip-support) - [What libraries does HIP support?](#what-libraries-does-hip-support) - [How does HIP compare with OpenCL?](#how-does-hip-compare-with-opencl) +- [How does porting CUDA to HIP compare to porting CUDA to OpenCL?](#how-does-porting-cuda-to-hip-compare-to-porting-cuda-to-opencl) - [What hardware does HIP support?](#what-hardware-does-hip-support) - [Does Hipify automatically convert all source code?](#does-hipify-automatically-convert-all-source-code) - [What is NVCC?](#what-is-nvcc) @@ -185,7 +186,9 @@ A C++ dialect, hc is supported by the AMD HCC compiler. It provides C++ run time ### On HCC, can I link HIP code with host code compiled with another compiler such as gcc, icc, or clang ? Yes! HIP/HCC generates the object code which conforms to the GCC ABI, and also links with libstdc++. This means you can compile host code with the compiler of your choice and link this with GPU code compiler with HIP. 
Larger projects often contain a mixture of accelerator code (initially written in CUDA with nvcc) plus host code (compiled with gcc, icc, or clang). These projects -can convert the accelerator code to HIP, compile that code with hipcc, and link with object code from the preferred compiler. +can convert the accelerator code to HIP, compile that code with hipcc, and link with object code from the preferred compiler. + + ### HIP detected my platform (hcc vs nvcc) incorrectly - what should I do? @@ -202,6 +205,29 @@ If you see issues related to incorrect platform detection, please file an issue ### Can I install both CUDA SDK and HCC on same machine? Yes. You can use HIP_PLATFORM to choose which path hipcc targets. This configuration can be useful when using HIP to develop an application which is portable to both AMD and NVIDIA. + +### On CUDA, can I mix CUDA code with HIP code? +Yes. Most HIP data structures (hipStream_t, hipEvent_t) are typedefs to CUDA equivalents and can be intermixed. Both CUDA and HIP use integer device ids. +One notable exception is that hipError_t is a new type, and cannot be used where a cudaError_t is expected. In these cases, refactor the code to remove the expectation. Alternatively, hip_runtime_api.h defines functions which convert between the error code spaces: + +hipErrorToCudaError +hipCUDAErrorTohipError +hipCUResultTohipError + +If platform portability is important, use #ifdef __HIP_PLATFORM_NVCC__ to guard the CUDA-specific code. + +### On HCC, can I use HC functionality with HIP? +Yes. +The code can include hc.hpp and use HC functions inside the kernel. A typical use case is to use AMD-specific hardware features such as the permute, swizzle, or DPP operations. +The "-stdlib=libc++" must be passed to hipcc in order to compile hc.hpp. See the 'bit_extract' sample for an example. 
+ +Also these functions can be used to extract HCC accelerator and accelerator_view structures from the HIP deviceId and hipStream_t: +hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator *acc); +hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **av); + +If platform portability is important, use #ifdef __HIP_PLATFORM_HCC__ to guard the HCC-specific code. + + ### How do I trace HIP application flow? #### Using CodeXL markers for HIP Functions HIP can generate markers at function being/end which are displayed on the CodeXL timeline view. From 1160cefc6de506fd3a620f1af3b8f0b53fd5bb60 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 22 Sep 2016 12:24:55 -0500 Subject: [PATCH 16/66] Sample improvements. - Enable -O3 for hipDispatchLatency. - Use nearly-null kernel to prevent it from being optimized away. - Formatting for hipDispatchLatency. - Formatting for hipInfo. --- samples/1_Utils/hipDispatchLatency/Makefile | 8 ++-- .../hipDispatchLatency/ResultDatabase.cpp | 14 ++++--- .../hipDispatchLatency/hipDispatchLatency.cpp | 39 +++++++++++-------- samples/1_Utils/hipInfo/hipInfo.cpp | 2 +- 4 files changed, 38 insertions(+), 25 deletions(-) diff --git a/samples/1_Utils/hipDispatchLatency/Makefile b/samples/1_Utils/hipDispatchLatency/Makefile index 387cb9aac6..3b69c4a335 100644 --- a/samples/1_Utils/hipDispatchLatency/Makefile +++ b/samples/1_Utils/hipDispatchLatency/Makefile @@ -6,10 +6,12 @@ HIPCC=$(HIP_PATH)/bin/hipcc EXE=hipDispatchLatency -all: install +CXXFLAGS = -O3 -$(EXE): hipDispatchLatency.cpp - $(HIPCC) hipDispatchLatency.cpp ResultDatabase.cpp -o $@ +all: ${EXE} + +$(EXE): hipDispatchLatency.cpp ResultDatabase.cpp + $(HIPCC) $(CXXFLAGS) hipDispatchLatency.cpp ResultDatabase.cpp -o $@ install: $(EXE) cp $(EXE) $(HIP_PATH)/bin diff --git a/samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp b/samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp index 2ec686f260..d207154e39 100644 --- 
a/samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp +++ b/samples/1_Utils/hipDispatchLatency/ResultDatabase.cpp @@ -253,10 +253,12 @@ void ResultDatabase::DumpDetailed(ostream &out) out << endl; } - out << endl - << "Note: Any results marked with (*) had missing values." << endl - << " This can occur on systems with a mixture of" << endl - << " device types or architectural capabilities." << endl; + if (0) { + out << endl + << "Note: Any results marked with (*) had missing values." << endl + << " This can occur on systems with a mixture of" << endl + << " device types or architectural capabilities." << endl; + } } @@ -330,9 +332,11 @@ void ResultDatabase::DumpSummary(ostream &out) out << endl; } - out << endl + if (0) { + out << endl << "Note: results marked with (*) had missing values such as" << endl << "might occur with a mixture of architectural capabilities." << endl; + } } // **************************************************************************** diff --git a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp index 1c15ab51d7..65e8603a4e 100644 --- a/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp +++ b/samples/1_Utils/hipDispatchLatency/hipDispatchLatency.cpp @@ -30,15 +30,22 @@ if(status != hipSuccess){ \ #define LEN 1024*1024 #define SIZE LEN * sizeof(float) -#define ITER 5120 +#define ITER 10120 -__global__ void One(hipLaunchParm lp, float* Ad){ + +// HCC optimizes away fully NULL kernel calls, so run one that is nearly null: +__global__ void NearlyNull(hipLaunchParm lp, float* Ad){ + if (Ad) { + Ad[0] = 42; + } } + int main(){ hipError_t err; - float *A, *Ad; + float *A; + float *Ad = NULL; A = new float[LEN]; @@ -50,11 +57,10 @@ int main(){ err = hipStreamCreate(&stream); check("Creating stream",err); - err = hipMalloc(&Ad, SIZE); - check("Allocating Ad memory on device", err); - - err = hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice); - check("Doing memory copy from A 
to Ad", err); + //err = hipMalloc(&Ad, SIZE); + //check("Allocating Ad memory on device", err); + //err = hipMemcpy(Ad, A, SIZE, hipMemcpyHostToDevice); + //check("Doing memory copy from A to Ad", err); float mS = 0; hipEvent_t start, stop; @@ -63,15 +69,16 @@ int main(){ ResultDatabase resultDB[8]; + hipEventRecord(start); - hipLaunchKernel(HIP_KERNEL_NAME(One), dim3(LEN/512), dim3(512), 0, 0, Ad); + hipLaunchKernel(NearlyNull, dim3(LEN/512), dim3(512), 0, 0, Ad); hipEventRecord(stop); hipEventElapsedTime(&mS, start, stop); resultDB[0].AddResult(std::string("First Kernel Launch"), "", "uS", mS*1000); // std::cout<<"First Kernel Launch: \t\t"< Date: Fri, 9 Sep 2016 10:33:00 -0500 Subject: [PATCH 17/66] Refactor staging buffer CopyHostToDevice. - Move algorithm selection inside Unpinned class. - Refactor function names. - Use size_t for size threshholds. Change-Id: Iac4de652ac9d49acbf527aa0849e388b8ecd8486 --- include/hcc_detail/unpinned_copy_engine.h | 21 ++-- src/hip_hcc.cpp | 18 ++-- src/unpinned_copy_engine.cpp | 124 +++++++++++++--------- 3 files changed, 93 insertions(+), 70 deletions(-) diff --git a/include/hcc_detail/unpinned_copy_engine.h b/include/hcc_detail/unpinned_copy_engine.h index 2dd7e15d28..f50ff54b55 100644 --- a/include/hcc_detail/unpinned_copy_engine.h +++ b/include/hcc_detail/unpinned_copy_engine.h @@ -39,15 +39,20 @@ THE SOFTWARE. // Staging buffer provides thread-safe access via a mutex. 
struct UnpinnedCopyEngine { + enum CopyMode {ChooseBest, UsePinInPlace, UseStaging, UseMemcpy} ; + static const int _max_buffers = 4; UnpinnedCopyEngine(hsa_agent_t hsaAgent,hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers,int thresholdH2D_directStaging,int thresholdH2D_stagingPinInPlace,int thresholdD2H) ; ~UnpinnedCopyEngine(); - void CopyHostToDevice(int tempIndex,int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + /* Use hueristic to choose best copy algorithm */ + + void CopyHostToDeviceBest(CopyMode copyMode, int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + void CopyHostToDeviceStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); void CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); - void CopyDeviceToHost (int tempIndex,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + void CopyDeviceToHost(int tempIndex,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); void CopyDeviceToHostPinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); void CopyPeerToPeer( void* dst, hsa_agent_t dstAgent, const void* src, hsa_agent_t srcAgent, size_t sizeBytes, hsa_signal_t *waitFor); @@ -60,12 +65,12 @@ struct UnpinnedCopyEngine { int _numBuffers; char *_pinnedStagingBuffer[_max_buffers]; - hsa_signal_t _completion_signal[_max_buffers]; - hsa_signal_t _completion_signal2[_max_buffers]; // P2P needs another set of signals. - std::mutex _copy_lock; // provide thread-safe access - int _hipH2DTransferThresholdDirectOrStaging; - int _hipH2DTransferThresholdStagingOrPininplace; - int _hipD2HTransferThreshold; + hsa_signal_t _completionSignal[_max_buffers]; + hsa_signal_t _completionSignal2[_max_buffers]; // P2P needs another set of signals. 
+ std::mutex _copyLock; // provide thread-safe access + size_t _hipH2DTransferThresholdDirectOrStaging; + size_t _hipH2DTransferThresholdStagingOrPininplace; + size_t _hipD2HTransferThreshold; }; #endif diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index f14b4612b8..99fe80753d 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -1436,7 +1436,6 @@ void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, c } } -// TODO - data-up to data-down: // Called just before a kernel is launched from hipLaunchKernel. // Allows runtime to track some information about the stream. hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_launch_parm *lp, const char *kernelNameStr) @@ -1733,22 +1732,17 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const #endif } + if (kind == hipMemcpyHostToDevice) { int depSignalCnt = preCopyCommand(crit, NULL, &depSignal, ihipCommandCopyH2D); if(!srcTracked){ if (HIP_STAGING_BUFFERS) { tprintf(DB_COPY1, "D2H && !dstTracked: staged copy H2D dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); - if(HIP_OPTIMAL_MEM_TRANSFER) - { - device->_stagingBuffer[0]->CopyHostToDevice(1,device->_isLargeBar,dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); - } - else { - if (HIP_PININPLACE) { - device->_stagingBuffer[0]->CopyHostToDevicePinInPlace(dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); - } else { - device->_stagingBuffer[0]->CopyHostToDevice(0,0,dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); - } - } + UnpinnedCopyEngine::CopyMode copyMode = UnpinnedCopyEngine::ChooseBest; + if (HIP_PININPLACE) { + copyMode = UnpinnedCopyEngine::UsePinInPlace; + } + device->_stagingBuffer[0]->CopyHostToDeviceBest(copyMode, device->_isLargeBar, dst, src, sizeBytes, depSignalCnt ? 
&depSignal : NULL); // The copy waits for inputs and then completes before returning so can reset queue to empty: this->wait(crit, true); } diff --git a/src/unpinned_copy_engine.cpp b/src/unpinned_copy_engine.cpp index 5501c66f9d..820dda1a02 100644 --- a/src/unpinned_copy_engine.cpp +++ b/src/unpinned_copy_engine.cpp @@ -85,8 +85,8 @@ UnpinnedCopyEngine::UnpinnedCopyEngine(hsa_agent_t hsaAgent, hsa_agent_t cpuAgen err = hsa_amd_agents_allow_access(1, &hsaAgent, NULL, _pinnedStagingBuffer[i]); ErrorCheck(err); - hsa_signal_create(0, 0, NULL, &_completion_signal[i]); - hsa_signal_create(0, 0, NULL, &_completion_signal2[i]); + hsa_signal_create(0, 0, NULL, &_completionSignal[i]); + hsa_signal_create(0, 0, NULL, &_completionSignal2[i]); } }; @@ -100,8 +100,8 @@ UnpinnedCopyEngine::~UnpinnedCopyEngine() hsa_amd_memory_pool_free(_pinnedStagingBuffer[i]); _pinnedStagingBuffer[i] = NULL; } - hsa_signal_destroy(_completion_signal[i]); - hsa_signal_destroy(_completion_signal2[i]); + hsa_signal_destroy(_completionSignal[i]); + hsa_signal_destroy(_completionSignal2[i]); } } @@ -114,13 +114,13 @@ UnpinnedCopyEngine::~UnpinnedCopyEngine() //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. void UnpinnedCopyEngine::CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { - std::lock_guard l (_copy_lock); + std::lock_guard l (_copyLock); const char *srcp = static_cast (src); char *dstp = static_cast (dst); for (int i=0; i<_numBuffers; i++) { - hsa_signal_store_relaxed(_completion_signal[i], 0); + hsa_signal_store_relaxed(_completionSignal[i], 0); } if (sizeBytes >= UINT64_MAX/2) { @@ -129,8 +129,8 @@ void UnpinnedCopyEngine::CopyHostToDevicePinInPlace(void* dst, const void* src, int bufferIndex = 0; size_t theseBytes= sizeBytes; - //tprintf (DB_COPY2, "H2D: waiting... 
on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); - //hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + //tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completionSignal[bufferIndex].handle); + //hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); //void * masked_srcp = (void*) ((uintptr_t)srcp & (uintptr_t)(~0x3f)) ; // TODO void *locked_srcp; @@ -143,22 +143,54 @@ void UnpinnedCopyEngine::CopyHostToDevicePinInPlace(void* dst, const void* src, THROW_ERROR (hipErrorRuntimeMemory); } - hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); + hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1); - hsa_status = hsa_amd_memory_async_copy(dstp, _hsaAgent, locked_srcp, _cpuAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_status = hsa_amd_memory_async_copy(dstp, _hsaAgent, locked_srcp, _cpuAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]); //tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory); } - tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); - hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + tprintf (DB_COPY2, "H2D: waiting... 
on completion signal handle=%lu\n", _completionSignal[bufferIndex].handle); + hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); hsa_amd_memory_unlock(const_cast (srcp)); // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 waitFor = NULL; } +void UnpinnedCopyEngine::CopyHostToDeviceBest(UnpinnedCopyEngine::CopyMode copyMode, int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +{ + if (copyMode == ChooseBest) { + if (isLargeBar && (sizeBytes < _hipH2DTransferThresholdDirectOrStaging)) { + copyMode = UseMemcpy; + } else if (sizeBytes > _hipH2DTransferThresholdStagingOrPininplace) { + copyMode = UsePinInPlace; + } else { + copyMode = UseStaging; + } + } + + if (copyMode == UseMemcpy) { + + if (!isLargeBar) { + THROW_ERROR (hipErrorInvalidValue); + } + + memcpy(dst,src,sizeBytes); + std::atomic_thread_fence(std::memory_order_release); + + } else if (copyMode == UsePinInPlace) { + CopyHostToDevicePinInPlace(dst, src, sizeBytes, waitFor); + + } else if (copyMode == UseStaging) { + CopyHostToDeviceStaging(dst, src, sizeBytes, waitFor); + + } else { + // Unknown copy mode. + THROW_ERROR(hipErrorInvalidValue); + } +} //--- @@ -166,24 +198,16 @@ void UnpinnedCopyEngine::CopyHostToDevicePinInPlace(void* dst, const void* src, //IN: dst - dest pointer - must be accessible from host CPU. //IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _hsaAgent) //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. 
-void UnpinnedCopyEngine::CopyHostToDevice(int tempIndex,int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +void UnpinnedCopyEngine::CopyHostToDeviceStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { - if((tempIndex==1)&&(isLargeBar)&&(sizeBytes < _hipH2DTransferThresholdDirectOrStaging)){ - memcpy(dst,src,sizeBytes); - std::atomic_thread_fence(std::memory_order_release); - } - else if((tempIndex==1) && (sizeBytes > _hipH2DTransferThresholdStagingOrPininplace)){ - CopyHostToDevicePinInPlace(dst, src, sizeBytes, waitFor); - } - else { - std::lock_guard l (_copy_lock); + std::lock_guard l (_copyLock); const char *srcp = static_cast (src); char *dstp = static_cast (dst); for (int i=0; i<_numBuffers; i++) { - hsa_signal_store_relaxed(_completion_signal[i], 0); + hsa_signal_store_relaxed(_completionSignal[i], 0); } if (sizeBytes >= UINT64_MAX/2) { @@ -194,16 +218,16 @@ void UnpinnedCopyEngine::CopyHostToDevice(int tempIndex,int isLargeBar,void* dst size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining; - tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); - hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completionSignal[bufferIndex].handle); + hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: copy %zu bytes %p to stagingBuf[%d]:%p\n", bytesRemaining, theseBytes, srcp, bufferIndex, _pinnedStagingBuffer[bufferIndex]); // TODO - use uncached memcpy, someday. 
memcpy(_pinnedStagingBuffer[bufferIndex], srcp, theseBytes); - hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _hsaAgent, _pinnedStagingBuffer[bufferIndex], _cpuAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _hsaAgent, _pinnedStagingBuffer[bufferIndex], _cpuAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]); tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR ((hipErrorRuntimeMemory)); @@ -221,7 +245,7 @@ void UnpinnedCopyEngine::CopyHostToDevice(int tempIndex,int isLargeBar,void* dst for (int i=0; i<_numBuffers; i++) { - hsa_signal_wait_acquire(_completion_signal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_acquire(_completionSignal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); } } } @@ -229,13 +253,13 @@ void UnpinnedCopyEngine::CopyHostToDevice(int tempIndex,int isLargeBar,void* dst void UnpinnedCopyEngine::CopyDeviceToHostPinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { - std::lock_guard l (_copy_lock); + std::lock_guard l (_copyLock); const char *srcp = static_cast (src); char *dstp = static_cast (dst); for (int i=0; i<_numBuffers; i++) { - hsa_signal_store_relaxed(_completion_signal[i], 0); + hsa_signal_store_relaxed(_completionSignal[i], 0); } if (sizeBytes >= UINT64_MAX/2) { @@ -252,15 +276,15 @@ void UnpinnedCopyEngine::CopyDeviceToHostPinInPlace(void* dst, const void* src, THROW_ERROR (hipErrorRuntimeMemory); } - hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); + hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1); - 
hsa_status = hsa_amd_memory_async_copy(locked_destp,_cpuAgent , srcp, _hsaAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_status = hsa_amd_memory_async_copy(locked_destp,_cpuAgent , srcp, _hsaAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory); } - tprintf (DB_COPY2, "D2H: waiting... on completion signal handle=%lu\n", _completion_signal[bufferIndex].handle); - hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + tprintf (DB_COPY2, "D2H: waiting... on completion signal handle=%lu\n", _completionSignal[bufferIndex].handle); + hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); hsa_amd_memory_unlock(const_cast (dstp)); // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 @@ -279,13 +303,13 @@ void UnpinnedCopyEngine::CopyDeviceToHost(int tempIndex,void* dst, const void* s } else { - std::lock_guard l (_copy_lock); + std::lock_guard l (_copyLock); const char *srcp0 = static_cast (src); char *dstp1 = static_cast (dst); for (int i=0; i<_numBuffers; i++) { - hsa_signal_store_relaxed(_completion_signal[i], 0); + hsa_signal_store_relaxed(_completionSignal[i], 0); } if (sizeBytes >= UINT64_MAX/2) { @@ -303,8 +327,8 @@ void UnpinnedCopyEngine::CopyDeviceToHost(int tempIndex,void* dst, const void* s size_t theseBytes = (bytesRemaining0 > _bufferSize) ? 
_bufferSize : bytesRemaining0; tprintf (DB_COPY2, "D2H: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); - hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpuAgent, srcp0, _hsaAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpuAgent, srcp0, _hsaAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory); } @@ -322,7 +346,7 @@ void UnpinnedCopyEngine::CopyDeviceToHost(int tempIndex,void* dst, const void* s size_t theseBytes = (bytesRemaining1 > _bufferSize) ? _bufferSize : bytesRemaining1; tprintf (DB_COPY2, "D2H: wait_completion[%d] bytesRemaining=%zu\n", bufferIndex, bytesRemaining1); - hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); tprintf (DB_COPY2, "D2H: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to dst:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); memcpy(dstp1, _pinnedStagingBuffer[bufferIndex], theseBytes); @@ -341,14 +365,14 @@ void UnpinnedCopyEngine::CopyDeviceToHost(int tempIndex,void* dst, const void* s //IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. 
void UnpinnedCopyEngine::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void* src, hsa_agent_t srcAgent, size_t sizeBytes, hsa_signal_t *waitFor) { - std::lock_guard l (_copy_lock); + std::lock_guard l (_copyLock); const char *srcp0 = static_cast (src); char *dstp1 = static_cast (dst); for (int i=0; i<_numBuffers; i++) { - hsa_signal_store_relaxed(_completion_signal[i], 0); - hsa_signal_store_relaxed(_completion_signal2[i], 0); + hsa_signal_store_relaxed(_completionSignal[i], 0); + hsa_signal_store_relaxed(_completionSignal2[i], 0); } if (sizeBytes >= UINT64_MAX/2) { @@ -365,11 +389,11 @@ void UnpinnedCopyEngine::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const v size_t theseBytes = (bytesRemaining0 > _bufferSize) ? _bufferSize : bytesRemaining0; // Wait to make sure we are not overwriting a buffer before it has been drained: - hsa_signal_wait_acquire(_completion_signal2[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_acquire(_completionSignal2[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); tprintf (DB_COPY2, "P2P: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); - hsa_signal_store_relaxed(_completion_signal[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpuAgent, srcp0, srcAgent, theseBytes, waitFor ? 1:0, waitFor, _completion_signal[bufferIndex]); + hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1); + hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpuAgent, srcp0, srcAgent, theseBytes, waitFor ? 
1:0, waitFor, _completionSignal[bufferIndex]); if (hsa_status != HSA_STATUS_SUCCESS) { THROW_ERROR (hipErrorRuntimeMemory); } @@ -392,14 +416,14 @@ void UnpinnedCopyEngine::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const v if (hostWait) { // Host-side wait, should not be necessary: - hsa_signal_wait_acquire(_completion_signal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); } tprintf (DB_COPY2, "P2P: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to device:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); - hsa_signal_store_relaxed(_completion_signal2[bufferIndex], 1); + hsa_signal_store_relaxed(_completionSignal2[bufferIndex], 1); hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp1, dstAgent, _pinnedStagingBuffer[bufferIndex], _cpuAgent /*not used*/, theseBytes, - hostWait ? 0:1, hostWait ? NULL : &_completion_signal[bufferIndex], - _completion_signal2[bufferIndex]); + hostWait ? 0:1, hostWait ? NULL : &_completionSignal[bufferIndex], + _completionSignal2[bufferIndex]); dstp1 += theseBytes; } @@ -408,6 +432,6 @@ void UnpinnedCopyEngine::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const v // Wait for the staging-buffer to dest copies to complete: for (int i=0; i<_numBuffers; i++) { - hsa_signal_wait_acquire(_completion_signal2[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); + hsa_signal_wait_acquire(_completionSignal2[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); } } From e300cb4405faffc08866757ac224a2bac1ca7981 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 9 Sep 2016 14:56:51 -0500 Subject: [PATCH 18/66] Refactor Staging Buffer CopyDeviceToHost Use copyMode. Embed algorithm selection inside the unpinned class. 
Change-Id: Ic75fd5931717a3160904402794bbed3ccd445112 --- include/hcc_detail/unpinned_copy_engine.h | 14 +++++-- src/hip_hcc.cpp | 31 ++++++++------- src/unpinned_copy_engine.cpp | 48 +++++++++++++++++------ 3 files changed, 65 insertions(+), 28 deletions(-) diff --git a/include/hcc_detail/unpinned_copy_engine.h b/include/hcc_detail/unpinned_copy_engine.h index f50ff54b55..653beb89ee 100644 --- a/include/hcc_detail/unpinned_copy_engine.h +++ b/include/hcc_detail/unpinned_copy_engine.h @@ -46,15 +46,23 @@ struct UnpinnedCopyEngine { UnpinnedCopyEngine(hsa_agent_t hsaAgent,hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers,int thresholdH2D_directStaging,int thresholdH2D_stagingPinInPlace,int thresholdD2H) ; ~UnpinnedCopyEngine(); - /* Use hueristic to choose best copy algorithm */ + // Use hueristic to choose best copy algorithm + void CopyHostToDevice(CopyMode copyMode, int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + void CopyDeviceToHost(CopyMode copyMode, void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); - void CopyHostToDeviceBest(CopyMode copyMode, int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + + // Specific H2D copy algorithm implementations: void CopyHostToDeviceStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); void CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + void CopyHostToDeviceMemcpy(int isLargeBar, void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + - void CopyDeviceToHost(int tempIndex,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + // Specific D2H copy algorithm implementations: + void CopyDeviceToHostStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); void CopyDeviceToHostPinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + + // P2P Copy implementation: void 
CopyPeerToPeer( void* dst, hsa_agent_t dstAgent, const void* src, hsa_agent_t srcAgent, size_t sizeBytes, hsa_signal_t *waitFor); diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 99fe80753d..78285cefa0 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -1272,7 +1272,7 @@ void ihipInit() READ_ENV_I(release, HIP_ATP_MARKER, 0, "Add HIP function begin/end to ATP file generated with CodeXL"); READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of each staging buffer (in KB)" ); READ_ENV_I(release, HIP_STAGING_BUFFERS, 0, "Number of staging buffers to use in each direction. 0=use hsa_memory_copy."); - READ_ENV_I(release, HIP_PININPLACE, 0, "For unpinned transfers, pin the memory in-place in chunks before doing the copy. Under development."); + READ_ENV_I(release, HIP_PININPLACE, 0, "For unpinned transfers, pin the memory in-place in chunks before doing the copy."); READ_ENV_I(release, HIP_OPTIMAL_MEM_TRANSFER, 0, "For optimal memory transfers for unpinned memory.Under testing."); READ_ENV_I(release, HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING, 0, "Threshold value for H2D unpinned memory transfer decision between direct copy or staging buffer usage,Under testing."); READ_ENV_I(release, HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE, 0, "Threshold value for H2D unpinned memory transfer decision between staging buffer usage or pininplace usage .Under testing."); @@ -1738,11 +1738,14 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const if(!srcTracked){ if (HIP_STAGING_BUFFERS) { tprintf(DB_COPY1, "D2H && !dstTracked: staged copy H2D dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); - UnpinnedCopyEngine::CopyMode copyMode = UnpinnedCopyEngine::ChooseBest; - if (HIP_PININPLACE) { + UnpinnedCopyEngine::CopyMode copyMode = UnpinnedCopyEngine::UseStaging; + + if (HIP_OPTIMAL_MEM_TRANSFER) { + copyMode = UnpinnedCopyEngine::ChooseBest; + } else if (HIP_PININPLACE) { copyMode = UnpinnedCopyEngine::UsePinInPlace; - } - 
device->_stagingBuffer[0]->CopyHostToDeviceBest(copyMode, device->_isLargeBar, dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + } + device->_stagingBuffer[0]->CopyHostToDevice(copyMode, device->_isLargeBar, dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); // The copy waits for inputs and then completes before returning so can reset queue to empty: this->wait(crit, true); } @@ -1781,16 +1784,16 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const if (!dstTracked){ if (HIP_STAGING_BUFFERS) { tprintf(DB_COPY1, "D2H && !dstTracked: staged copy D2H dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); - //printf ("staged-copy- read dep signals\n"); - if(HIP_OPTIMAL_MEM_TRANSFER) - { - //printf ("staged-copy- read dep signals\n"); - device->_stagingBuffer[1]->CopyDeviceToHost(1,dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); - } - else - { - device->_stagingBuffer[1]->CopyDeviceToHost(0,dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + UnpinnedCopyEngine::CopyMode copyMode = UnpinnedCopyEngine::UseStaging; + + if (HIP_OPTIMAL_MEM_TRANSFER) { + copyMode = UnpinnedCopyEngine::ChooseBest; + } else if (HIP_PININPLACE) { + copyMode = UnpinnedCopyEngine::UsePinInPlace; } + + device->_stagingBuffer[1]->CopyDeviceToHost(copyMode, dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + // The copy completes before returning so can reset queue to empty: this->wait(crit, true); diff --git a/src/unpinned_copy_engine.cpp b/src/unpinned_copy_engine.cpp index 820dda1a02..f446220e7a 100644 --- a/src/unpinned_copy_engine.cpp +++ b/src/unpinned_copy_engine.cpp @@ -159,7 +159,20 @@ void UnpinnedCopyEngine::CopyHostToDevicePinInPlace(void* dst, const void* src, } -void UnpinnedCopyEngine::CopyHostToDeviceBest(UnpinnedCopyEngine::CopyMode copyMode, int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +// Copy using simple memcpy. Only works on large-bar systems. 
+void UnpinnedCopyEngine::CopyHostToDeviceMemcpy(int isLargeBar, void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +{ + if (!isLargeBar) { + THROW_ERROR (hipErrorInvalidValue); + } + + memcpy(dst,src,sizeBytes); + std::atomic_thread_fence(std::memory_order_release); +}; + + + +void UnpinnedCopyEngine::CopyHostToDevice(UnpinnedCopyEngine::CopyMode copyMode, int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { if (copyMode == ChooseBest) { if (isLargeBar && (sizeBytes < _hipH2DTransferThresholdDirectOrStaging)) { @@ -173,12 +186,7 @@ void UnpinnedCopyEngine::CopyHostToDeviceBest(UnpinnedCopyEngine::CopyMode copyM if (copyMode == UseMemcpy) { - if (!isLargeBar) { - THROW_ERROR (hipErrorInvalidValue); - } - memcpy(dst,src,sizeBytes); - std::atomic_thread_fence(std::memory_order_release); } else if (copyMode == UsePinInPlace) { CopyHostToDevicePinInPlace(dst, src, sizeBytes, waitFor); @@ -291,17 +299,35 @@ void UnpinnedCopyEngine::CopyDeviceToHostPinInPlace(void* dst, const void* src, waitFor = NULL; } + +void UnpinnedCopyEngine::CopyDeviceToHost(CopyMode copyMode ,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +{ + if (copyMode == ChooseBest) { + if (sizeBytes > _hipD2HTransferThreshold) { + copyMode = UsePinInPlace; + } else { + copyMode = UseStaging; + } + } + + + if (copyMode == UsePinInPlace) { + CopyDeviceToHostPinInPlace(dst, src, sizeBytes, waitFor); + } if (copyMode == UseStaging) { + CopyDeviceToHostStaging(dst, src, sizeBytes, waitFor); + } else { + // Unknown copy mode. + THROW_ERROR(hipErrorInvalidValue); + } +} + //--- //Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy //IN: dst - dest pointer - must be accessible from agent this buffer is associated with (via _hsaAgent). //IN: src - src pointer for copy. Must be accessible from host CPU. 
//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. -void UnpinnedCopyEngine::CopyDeviceToHost(int tempIndex,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +void UnpinnedCopyEngine::CopyDeviceToHostStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { - if((tempIndex==1) && (sizeBytes> _hipD2HTransferThreshold)){ - CopyDeviceToHostPinInPlace(dst, src, sizeBytes, waitFor); - } - else { std::lock_guard l (_copyLock); From 442d74f027b6809c106ee3b4f5ae7905c46983ce Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sun, 11 Sep 2016 06:50:20 -0500 Subject: [PATCH 19/66] Move isLargeBar to UnpinnedCopyEngine constructor. Change-Id: I7a7d3a40b1d4e0c6ec856658a6a70e5e70d287ce --- include/hcc_detail/unpinned_copy_engine.h | 12 ++++++++---- src/hip_hcc.cpp | 16 ++++++++++++---- src/unpinned_copy_engine.cpp | 15 +++++++++------ 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/include/hcc_detail/unpinned_copy_engine.h b/include/hcc_detail/unpinned_copy_engine.h index 653beb89ee..678d714981 100644 --- a/include/hcc_detail/unpinned_copy_engine.h +++ b/include/hcc_detail/unpinned_copy_engine.h @@ -21,7 +21,7 @@ THE SOFTWARE. 
#ifndef STAGING_BUFFER_H #define STAGING_BUFFER_H -#include "hsa.h" +#include "hsa/hsa.h" //------------------------------------------------------------------------------------------------- @@ -43,18 +43,19 @@ struct UnpinnedCopyEngine { static const int _max_buffers = 4; - UnpinnedCopyEngine(hsa_agent_t hsaAgent,hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers,int thresholdH2D_directStaging,int thresholdH2D_stagingPinInPlace,int thresholdD2H) ; + UnpinnedCopyEngine(hsa_agent_t hsaAgent,hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers, + bool isLargeBar, int thresholdH2D_directStaging, int thresholdH2D_stagingPinInPlace, int thresholdD2H) ; ~UnpinnedCopyEngine(); // Use hueristic to choose best copy algorithm - void CopyHostToDevice(CopyMode copyMode, int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + void CopyHostToDevice(CopyMode copyMode, void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); void CopyDeviceToHost(CopyMode copyMode, void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); // Specific H2D copy algorithm implementations: void CopyHostToDeviceStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); void CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); - void CopyHostToDeviceMemcpy(int isLargeBar, void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); + void CopyHostToDeviceMemcpy(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); // Specific D2H copy algorithm implementations: @@ -72,6 +73,9 @@ struct UnpinnedCopyEngine { size_t _bufferSize; // Size of the buffers. int _numBuffers; + // True if system supports large-bar and thus can benefit from CPU directly performing copy operation. 
+ bool _isLargeBar; + char *_pinnedStagingBuffer[_max_buffers]; hsa_signal_t _completionSignal[_max_buffers]; hsa_signal_t _completionSignal2[_max_buffers]; // P2P needs another set of signals. diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 78285cefa0..294f8d8e89 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -675,8 +675,16 @@ ihipDevice_t::ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerato initProperties(&_props); - _stagingBuffer[0] = new UnpinnedCopyEngine(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS,HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING,HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE,HIP_D2H_MEM_TRANSFER_THRESHOLD); - _stagingBuffer[1] = new UnpinnedCopyEngine(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS,HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING,HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE,HIP_D2H_MEM_TRANSFER_THRESHOLD); + _stagingBuffer[0] = new UnpinnedCopyEngine(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS, + _isLargeBar, + HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING, + HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE, + HIP_D2H_MEM_TRANSFER_THRESHOLD); + _stagingBuffer[1] = new UnpinnedCopyEngine(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS, + _isLargeBar, + HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING, + HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE, + HIP_D2H_MEM_TRANSFER_THRESHOLD); _primaryCtx = new ihipCtx_t(this, deviceCnt, hipDeviceMapHost); } @@ -925,7 +933,7 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) FindDevicePool(); int access=checkAccess(g_cpu_agent, gpu_pool_); - if (0!= access){ + if (0 != access){ _isLargeBar= 1; } else { _isLargeBar=0; @@ -1745,7 +1753,7 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const } else if (HIP_PININPLACE) { copyMode = UnpinnedCopyEngine::UsePinInPlace; } - 
device->_stagingBuffer[0]->CopyHostToDevice(copyMode, device->_isLargeBar, dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + device->_stagingBuffer[0]->CopyHostToDevice(copyMode, dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); // The copy waits for inputs and then completes before returning so can reset queue to empty: this->wait(crit, true); } diff --git a/src/unpinned_copy_engine.cpp b/src/unpinned_copy_engine.cpp index f446220e7a..abeb5910e4 100644 --- a/src/unpinned_copy_engine.cpp +++ b/src/unpinned_copy_engine.cpp @@ -19,7 +19,7 @@ THE SOFTWARE. #include -#include "hsa_ext_amd.h" +#include #include "hcc_detail/unpinned_copy_engine.h" @@ -62,11 +62,14 @@ hsa_status_t findGlobalPool(hsa_amd_memory_pool_t pool, void* data) { } //------------------------------------------------------------------------------------------------- -UnpinnedCopyEngine::UnpinnedCopyEngine(hsa_agent_t hsaAgent, hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers, int thresholdH2DDirectStaging,int thresholdH2DStagingPinInPlace,int thresholdD2H) : +UnpinnedCopyEngine::UnpinnedCopyEngine(hsa_agent_t hsaAgent, hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers, + bool isLargeBar, int thresholdH2DDirectStaging, + int thresholdH2DStagingPinInPlace, int thresholdD2H) : _hsaAgent(hsaAgent), _cpuAgent(cpuAgent), _bufferSize(bufferSize), _numBuffers(numBuffers > _max_buffers ? _max_buffers : numBuffers), + _isLargeBar(isLargeBar), _hipH2DTransferThresholdDirectOrStaging(thresholdH2DDirectStaging), _hipH2DTransferThresholdStagingOrPininplace(thresholdH2DStagingPinInPlace), _hipD2HTransferThreshold(thresholdD2H) @@ -160,9 +163,9 @@ void UnpinnedCopyEngine::CopyHostToDevicePinInPlace(void* dst, const void* src, // Copy using simple memcpy. Only works on large-bar systems. 
-void UnpinnedCopyEngine::CopyHostToDeviceMemcpy(int isLargeBar, void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +void UnpinnedCopyEngine::CopyHostToDeviceMemcpy(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { - if (!isLargeBar) { + if (!_isLargeBar) { THROW_ERROR (hipErrorInvalidValue); } @@ -172,10 +175,10 @@ void UnpinnedCopyEngine::CopyHostToDeviceMemcpy(int isLargeBar, void* dst, const -void UnpinnedCopyEngine::CopyHostToDevice(UnpinnedCopyEngine::CopyMode copyMode, int isLargeBar,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) +void UnpinnedCopyEngine::CopyHostToDevice(UnpinnedCopyEngine::CopyMode copyMode, void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) { if (copyMode == ChooseBest) { - if (isLargeBar && (sizeBytes < _hipH2DTransferThresholdDirectOrStaging)) { + if (_isLargeBar && (sizeBytes < _hipH2DTransferThresholdDirectOrStaging)) { copyMode = UseMemcpy; } else if (sizeBytes > _hipH2DTransferThresholdStagingOrPininplace) { copyMode = UsePinInPlace; From a352ee923e7cd42d00510b223761d4bb42b3e920 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sun, 11 Sep 2016 10:29:18 -0500 Subject: [PATCH 20/66] Fix bugs for different CopyModes Change-Id: Ie59fb3b36a9ff2de178307a6d5756b5c71e0306b --- src/unpinned_copy_engine.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/unpinned_copy_engine.cpp b/src/unpinned_copy_engine.cpp index abeb5910e4..4ae6990180 100644 --- a/src/unpinned_copy_engine.cpp +++ b/src/unpinned_copy_engine.cpp @@ -189,6 +189,7 @@ void UnpinnedCopyEngine::CopyHostToDevice(UnpinnedCopyEngine::CopyMode copyMode, if (copyMode == UseMemcpy) { + CopyHostToDeviceMemcpy(dst, src, sizeBytes, waitFor); } else if (copyMode == UsePinInPlace) { @@ -316,7 +317,7 @@ void UnpinnedCopyEngine::CopyDeviceToHost(CopyMode copyMode ,void* dst, const vo if (copyMode == UsePinInPlace) { CopyDeviceToHostPinInPlace(dst, src, sizeBytes, waitFor); - } if (copyMode == 
UseStaging) { + } else if (copyMode == UseStaging) { CopyDeviceToHostStaging(dst, src, sizeBytes, waitFor); } else { // Unknown copy mode. From ccc1bbe6b1e96cb83e4bf328133f3fbc877b6b16 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sun, 11 Sep 2016 11:02:33 -0500 Subject: [PATCH 21/66] Remove HIP_STAGING_BUFFER Code simplification/cleanup: Remove stale fallback paths that uses something besides the unpinned engine. Remove HIP_STAGING_BUFFER env var - now is const 2, 0 no longer has special meaning. Change-Id: I7d24cdd1067dd0c244e87b6a83897cb135d307e7 --- include/hcc_detail/hip_hcc.h | 1 - src/hip_hcc.cpp | 86 ++++++++++++------------------------ 2 files changed, 29 insertions(+), 58 deletions(-) diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h index 94df169e07..3ce2e7ed2d 100644 --- a/include/hcc_detail/hip_hcc.h +++ b/include/hcc_detail/hip_hcc.h @@ -55,7 +55,6 @@ extern int HIP_ATP_MARKER; extern int HIP_ATP; extern int HIP_DB; extern int HIP_STAGING_SIZE; /* size of staging buffers, in KB */ -extern int HIP_STAGING_BUFFERS; // TODO - remove, two buffers should be enough. extern int HIP_PININPLACE; extern int HIP_STREAM_SIGNALS; /* number of signals to allocate at stream creation */ extern int HIP_VISIBLE_DEVICES; /* Contains a comma-separated sequence of GPU identifiers */ diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 294f8d8e89..09fbe545cc 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -68,7 +68,7 @@ std::string HIP_TRACE_API_COLOR("green"); int HIP_ATP_MARKER= 0; int HIP_DB= 0; int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ -int HIP_STAGING_BUFFERS = 2; // TODO - remove, two buffers should be enough. 
+static const int HIP_STAGING_BUFFERS = 2; int HIP_PININPLACE = 0; int HIP_OPTIMAL_MEM_TRANSFER = 0; //ENV Variable to test different memory transfer logics int HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING = 0; @@ -1279,7 +1279,6 @@ void ihipInit() READ_ENV_S(release, HIP_TRACE_API_COLOR, 0, "Color to use for HIP_API. None/Red/Green/Yellow/Blue/Magenta/Cyan/White"); READ_ENV_I(release, HIP_ATP_MARKER, 0, "Add HIP function begin/end to ATP file generated with CodeXL"); READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of each staging buffer (in KB)" ); - READ_ENV_I(release, HIP_STAGING_BUFFERS, 0, "Number of staging buffers to use in each direction. 0=use hsa_memory_copy."); READ_ENV_I(release, HIP_PININPLACE, 0, "For unpinned transfers, pin the memory in-place in chunks before doing the copy."); READ_ENV_I(release, HIP_OPTIMAL_MEM_TRANSFER, 0, "For optimal memory transfers for unpinned memory.Under testing."); READ_ENV_I(release, HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING, 0, "Threshold value for H2D unpinned memory transfer decision between direct copy or staging buffer usage,Under testing."); @@ -1744,28 +1743,17 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const if (kind == hipMemcpyHostToDevice) { int depSignalCnt = preCopyCommand(crit, NULL, &depSignal, ihipCommandCopyH2D); if(!srcTracked){ - if (HIP_STAGING_BUFFERS) { - tprintf(DB_COPY1, "D2H && !dstTracked: staged copy H2D dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); - UnpinnedCopyEngine::CopyMode copyMode = UnpinnedCopyEngine::UseStaging; - - if (HIP_OPTIMAL_MEM_TRANSFER) { - copyMode = UnpinnedCopyEngine::ChooseBest; - } else if (HIP_PININPLACE) { - copyMode = UnpinnedCopyEngine::UsePinInPlace; - } - device->_stagingBuffer[0]->CopyHostToDevice(copyMode, dst, src, sizeBytes, depSignalCnt ? 
&depSignal : NULL); - // The copy waits for inputs and then completes before returning so can reset queue to empty: - this->wait(crit, true); - } - else { - // TODO - remove, slow path. - tprintf(DB_COPY1, "H2D && ! srcTracked: am_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); -#if USE_AV_COPY - crit->_av.copy(src,dst,sizeBytes); -#else - hc::am_copy(dst, src, sizeBytes); -#endif + tprintf(DB_COPY1, "D2H && !dstTracked: staged copy H2D dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); + UnpinnedCopyEngine::CopyMode copyMode = UnpinnedCopyEngine::UseStaging; + + if (HIP_OPTIMAL_MEM_TRANSFER) { + copyMode = UnpinnedCopyEngine::ChooseBest; + } else if (HIP_PININPLACE) { + copyMode = UnpinnedCopyEngine::UsePinInPlace; } + device->_stagingBuffer[0]->CopyHostToDevice(copyMode, dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + // The copy waits for inputs and then completes before returning so can reset queue to empty: + this->wait(crit, true); } else { // This is H2D copy, and source is pinned host memory : we can copy directly w/o using staging buffer. 
hsa_agent_t dstAgent = *(static_cast(dstPtrInfo._acc.get_hsa_agent())); @@ -1790,30 +1778,19 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const } else if (kind == hipMemcpyDeviceToHost) { int depSignalCnt = preCopyCommand(crit, NULL, &depSignal, ihipCommandCopyD2H); if (!dstTracked){ - if (HIP_STAGING_BUFFERS) { - tprintf(DB_COPY1, "D2H && !dstTracked: staged copy D2H dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); - UnpinnedCopyEngine::CopyMode copyMode = UnpinnedCopyEngine::UseStaging; - - if (HIP_OPTIMAL_MEM_TRANSFER) { - copyMode = UnpinnedCopyEngine::ChooseBest; - } else if (HIP_PININPLACE) { - copyMode = UnpinnedCopyEngine::UsePinInPlace; - } + tprintf(DB_COPY1, "D2H && !dstTracked: staged copy D2H dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); + UnpinnedCopyEngine::CopyMode copyMode = UnpinnedCopyEngine::UseStaging; - device->_stagingBuffer[1]->CopyDeviceToHost(copyMode, dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); + if (HIP_OPTIMAL_MEM_TRANSFER) { + copyMode = UnpinnedCopyEngine::ChooseBest; + } else if (HIP_PININPLACE) { + copyMode = UnpinnedCopyEngine::UsePinInPlace; + } - // The copy completes before returning so can reset queue to empty: - this->wait(crit, true); + device->_stagingBuffer[1]->CopyDeviceToHost(copyMode, dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); - } else { - // TODO - remove, slow path. - tprintf(DB_COPY1, "D2H && !dstTracked: am_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); -#if USE_AV_COPY - crit->_av.copy(src, dst, sizeBytes); -#else - hc::am_copy(dst, src, sizeBytes); -#endif - } + // The copy completes before returning so can reset queue to empty: + this->wait(crit, true); } else { // This is D2H copy, and destination is pinned host memory : we can copy directly w/o using staging buffer. 
hsa_agent_t dstAgent = *(static_cast(dstPtrInfo._acc.get_hsa_agent())); @@ -1845,21 +1822,15 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const memcpy(dst, src, sizeBytes); } else if ((kind == hipMemcpyDeviceToDevice) && !copyEngineCanSeeSrcAndDest) { int depSignalCnt = preCopyCommand(crit, NULL, &depSignal, ihipCommandCopyP2P); - if (HIP_STAGING_BUFFERS) { - tprintf(DB_COPY1, "P2P but engine can't see both pointers: staged copy P2P dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); - //printf ("staged-copy- read dep signals\n"); - hsa_agent_t dstAgent = * (static_cast (dstPtrInfo._acc.get_hsa_agent())); - hsa_agent_t srcAgent = * (static_cast (srcPtrInfo._acc.get_hsa_agent())); + tprintf(DB_COPY1, "P2P but engine can't see both pointers: staged copy P2P dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); + //printf ("staged-copy- read dep signals\n"); + hsa_agent_t dstAgent = * (static_cast (dstPtrInfo._acc.get_hsa_agent())); + hsa_agent_t srcAgent = * (static_cast (srcPtrInfo._acc.get_hsa_agent())); - device->_stagingBuffer[1]->CopyPeerToPeer(dst, dstAgent, src, srcAgent, sizeBytes, depSignalCnt ? &depSignal : NULL); - - // The copy completes before returning so can reset queue to empty: - this->wait(crit, true); - - } else { - assert(0); // currently no fallback for this path. - } + device->_stagingBuffer[1]->CopyPeerToPeer(dst, dstAgent, src, srcAgent, sizeBytes, depSignalCnt ? 
&depSignal : NULL); + // The copy completes before returning so can reset queue to empty: + this->wait(crit, true); } else { // If not special case - these can all be handled by the hsa async copy: ihipCommand_t commandType; @@ -1896,6 +1867,7 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, copySync(crit, dst, src, sizeBytes, kind, resolveOn); } + void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind) { LockedAccessor_StreamCrit_t crit(_criticalData); From e843d8cb51bc18e3c4c872854f5616aa93f5b9ac Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sun, 11 Sep 2016 11:11:17 -0500 Subject: [PATCH 22/66] Remove USE_AV_COPY, USE_PEER_TO_PEER fallback paths. Change-Id: I9c20173e62029c4caebabc98784c6d7697758e4f --- include/hcc_detail/hip_hcc.h | 7 ------- src/hip_hcc.cpp | 4 ---- src/hip_peer.cpp | 12 ------------ src/hip_stream.cpp | 9 --------- 4 files changed, 32 deletions(-) diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h index 3ce2e7ed2d..fcf5c558c6 100644 --- a/include/hcc_detail/hip_hcc.h +++ b/include/hcc_detail/hip_hcc.h @@ -31,13 +31,6 @@ THE SOFTWARE. // #define USE_MEMCPYTOSYMBOL // -//Use the new HCC accelerator_view::copy instead of am_copy -#define USE_AV_COPY (__hcc_workweek__ >= 16351) - -// Compile peer-to-peer support. -// >= 2 : use HCC hc:accelerator::get_is_peer -// >= 3 : use hc::am_memtracker_update_peers(...) 
-#define USE_PEER_TO_PEER 3 //--- diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 09fbe545cc..f5f028603e 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -293,7 +293,6 @@ void ihipStream_t::locked_wait(bool assertQueueEmpty) }; -#if USE_AV_COPY // Causes current stream to wait for specified event to complete: void ihipStream_t::locked_waitEvent(hipEvent_t event) { @@ -302,7 +301,6 @@ void ihipStream_t::locked_waitEvent(hipEvent_t event) // TODO - check state of event here: crit->_av.create_blocking_marker(event->_marker); } -#endif // Create a marker in this stream. // Save state in the event so it can track the status of the event. @@ -1728,7 +1726,6 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const bool copyEngineCanSeeSrcAndDest = false; if (kind == hipMemcpyDeviceToDevice) { -#if USE_PEER_TO_PEER>=2 // Lock to prevent another thread from modifying peer list while we are trying to look at it. LockedAccessor_CtxCrit_t dcrit(ctx->criticalData()); // FIXME - this assumes peer access only from primary context. 
@@ -1736,7 +1733,6 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const if (dcrit->isPeer(ihipGetPrimaryCtx(dstPtrInfo._appId)) && (dcrit->isPeer(ihipGetPrimaryCtx(srcPtrInfo._appId)))) { copyEngineCanSeeSrcAndDest = true; } -#endif } diff --git a/src/hip_peer.cpp b/src/hip_peer.cpp index 8f75d99918..731b872c74 100644 --- a/src/hip_peer.cpp +++ b/src/hip_peer.cpp @@ -47,11 +47,7 @@ hipError_t hipDeviceCanAccessPeer (int* canAccessPeer, hipCtx_t thisCtx, hipCtx_ if (thisCtx == peerCtx) { *canAccessPeer = 0; } else { -#if USE_PEER_TO_PEER>=2 *canAccessPeer = peerCtx->getDevice()->_acc.get_is_peer(thisCtx->getDevice()->_acc); -#else - *canAccessPeer = 0; -#endif } } else { @@ -75,12 +71,8 @@ hipError_t ihipDisablePeerAccess (hipCtx_t peerCtx) auto thisCtx = ihipGetTlsDefaultCtx(); if ((thisCtx != NULL) && (peerCtx != NULL)) { -#if USE_PEER_TO_PEER>=2 // Return true if thisCtx can access peerCtx's memory: bool canAccessPeer = peerCtx->getDevice()->_acc.get_is_peer(thisCtx->getDevice()->_acc); -#else - bool canAccessPeer = 0; -#endif if (! canAccessPeer) { err = hipErrorInvalidDevice; // P2P not allowed between these devices. @@ -90,10 +82,8 @@ hipError_t ihipDisablePeerAccess (hipCtx_t peerCtx) LockedAccessor_CtxCrit_t peerCrit(peerCtx->criticalData()); bool changed = peerCrit->removePeer(thisCtx); if (changed) { -#if USE_PEER_TO_PEER>=3 // Update the peers for all memory already saved in the tracker: am_memtracker_update_peers(peerCtx->getDevice()->_acc, peerCrit->peerCnt(), peerCrit->peerAgents()); -#endif } else { err = hipErrorPeerAccessNotEnabled; // never enabled P2P access. 
} @@ -124,9 +114,7 @@ hipError_t ihipEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags) LockedAccessor_CtxCrit_t peerCrit(peerCtx->criticalData()); bool isNewPeer = peerCrit->addPeer(thisCtx); if (isNewPeer) { -#if USE_PEER_TO_PEER>=3 am_memtracker_update_peers(peerCtx->getDevice()->_acc, peerCrit->peerCnt(), peerCrit->peerAgents()); -#endif } else { err = hipErrorPeerAccessAlreadyEnabled; } diff --git a/src/hip_stream.cpp b/src/hip_stream.cpp index 7b3dc31f07..9475e61b21 100644 --- a/src/hip_stream.cpp +++ b/src/hip_stream.cpp @@ -74,12 +74,6 @@ hipError_t hipStreamCreate(hipStream_t *stream) } -#if USE_AV_COPY==0 -//--- -/** - * @bug This function conservatively waits for all work in the specified stream to complete. - */ -#endif hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags) { HIP_INIT_API(stream, event, flags); @@ -93,14 +87,11 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int bool fastWait = false; -#if USE_AV_COPY if (stream != hipStreamNull) { stream->locked_waitEvent(event); fastWait = true; // don't use the slow host-side synchronization. } - // TODO - clean up if/else logic when USE_AV_COPY enabled. -#endif if (!fastWait) { // TODO-hcc Convert to use create_blocking_marker(...) functionality. From da44f3f90746792b334b34d39b37dcfc5429cd3c Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 19 Sep 2016 11:20:51 -0500 Subject: [PATCH 23/66] Use HCC's synchronous accelerator_view::copy Replace large block of HIP code with a call to HCC av::copy(). 
Change-Id: Ic32e1801cf8d4cd116ac02b72c41b1a1e4b6065c --- src/hip_hcc.cpp | 148 +----------------------------------------------- 1 file changed, 2 insertions(+), 146 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index f5f028603e..8bd1e0be3c 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -541,7 +541,7 @@ void ihipStream_t::launchModuleKernel( uint32_t gridDimY, uint32_t gridDimZ, uint32_t groupSegmentSize, - uint32_t privateSegmentSize, + uint32_t privateSegmentSize, void *kernarg, size_t kernSize, uint64_t kernel){ @@ -1708,151 +1708,7 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const throw ihipException(hipErrorInvalidDevice); } - hc::accelerator acc; - hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); - hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); - - bool dstTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) == AM_SUCCESS); - bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS); - bool srcInDeviceMem = srcPtrInfo._isInDeviceMem; - bool dstInDeviceMem = dstPtrInfo._isInDeviceMem; - - // Resolve default to a specific Kind so we know which algorithm to use: - if (kind == hipMemcpyDefault && resolveOn) { - kind = resolveMemcpyDirection(srcTracked, dstTracked, srcInDeviceMem, dstInDeviceMem); - }; - - hsa_signal_t depSignal; - - bool copyEngineCanSeeSrcAndDest = false; - if (kind == hipMemcpyDeviceToDevice) { - // Lock to prevent another thread from modifying peer list while we are trying to look at it. - LockedAccessor_CtxCrit_t dcrit(ctx->criticalData()); - // FIXME - this assumes peer access only from primary context. - // Would need to change the tracker to store a void * parameter that we could map to the ctx where the pointer is allocated. 
- if (dcrit->isPeer(ihipGetPrimaryCtx(dstPtrInfo._appId)) && (dcrit->isPeer(ihipGetPrimaryCtx(srcPtrInfo._appId)))) { - copyEngineCanSeeSrcAndDest = true; - } - } - - - if (kind == hipMemcpyHostToDevice) { - int depSignalCnt = preCopyCommand(crit, NULL, &depSignal, ihipCommandCopyH2D); - if(!srcTracked){ - tprintf(DB_COPY1, "D2H && !dstTracked: staged copy H2D dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); - UnpinnedCopyEngine::CopyMode copyMode = UnpinnedCopyEngine::UseStaging; - - if (HIP_OPTIMAL_MEM_TRANSFER) { - copyMode = UnpinnedCopyEngine::ChooseBest; - } else if (HIP_PININPLACE) { - copyMode = UnpinnedCopyEngine::UsePinInPlace; - } - device->_stagingBuffer[0]->CopyHostToDevice(copyMode, dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); - // The copy waits for inputs and then completes before returning so can reset queue to empty: - this->wait(crit, true); - } else { - // This is H2D copy, and source is pinned host memory : we can copy directly w/o using staging buffer. - hsa_agent_t dstAgent = *(static_cast(dstPtrInfo._acc.get_hsa_agent())); - hsa_agent_t srcAgent = *(static_cast(srcPtrInfo._acc.get_hsa_agent())); - - ihipSignal_t *ihipSignal = allocSignal(crit); - hsa_signal_t copyCompleteSignal = ihipSignal->_hsaSignal; - - hsa_signal_store_relaxed(copyCompleteSignal, 1); - void *devPtrSrc = srcPtrInfo._devicePointer; - tprintf(DB_COPY1, "HSA Async_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); - - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, dstAgent, devPtrSrc, g_cpu_agent, sizeBytes, depSignalCnt, depSignalCnt ? &depSignal:0x0, copyCompleteSignal); - - // This is sync copy, so let's wait for copy right here: - if (hsa_status == HSA_STATUS_SUCCESS) { - waitCopy(crit, ihipSignal); // wait for copy, and return to pool. 
- } else { - throw ihipException(hipErrorInvalidValue); - } - } - } else if (kind == hipMemcpyDeviceToHost) { - int depSignalCnt = preCopyCommand(crit, NULL, &depSignal, ihipCommandCopyD2H); - if (!dstTracked){ - tprintf(DB_COPY1, "D2H && !dstTracked: staged copy D2H dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); - UnpinnedCopyEngine::CopyMode copyMode = UnpinnedCopyEngine::UseStaging; - - if (HIP_OPTIMAL_MEM_TRANSFER) { - copyMode = UnpinnedCopyEngine::ChooseBest; - } else if (HIP_PININPLACE) { - copyMode = UnpinnedCopyEngine::UsePinInPlace; - } - - device->_stagingBuffer[1]->CopyDeviceToHost(copyMode, dst, src, sizeBytes, depSignalCnt ? &depSignal : NULL); - - // The copy completes before returning so can reset queue to empty: - this->wait(crit, true); - } else { - // This is D2H copy, and destination is pinned host memory : we can copy directly w/o using staging buffer. - hsa_agent_t dstAgent = *(static_cast(dstPtrInfo._acc.get_hsa_agent())); - hsa_agent_t srcAgent = *(static_cast(srcPtrInfo._acc.get_hsa_agent())); - - ihipSignal_t *ihipSignal = allocSignal(crit); - hsa_signal_t copyCompleteSignal = ihipSignal->_hsaSignal; - - hsa_signal_store_relaxed(copyCompleteSignal, 1); - void *devPtrDst = dstPtrInfo._devicePointer; - tprintf(DB_COPY1, "HSA Async_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); - - hsa_status_t hsa_status = hsa_amd_memory_async_copy(devPtrDst, g_cpu_agent, src, srcAgent, sizeBytes, depSignalCnt, depSignalCnt ? &depSignal:0x0, copyCompleteSignal); - - // This is sync copy, so let's wait for copy right here: - if (hsa_status == HSA_STATUS_SUCCESS) { - waitCopy(crit, ihipSignal); // wait for copy, and return to pool. - } else { - throw ihipException(hipErrorInvalidValue); - } - } - } else if (kind == hipMemcpyHostToHost) { - int depSignalCnt = preCopyCommand(crit, NULL, &depSignal, ihipCommandCopyH2H); - - if (depSignalCnt) { - // host waits before doing host memory copy. 
- hsa_signal_wait_acquire(depSignal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - } - memcpy(dst, src, sizeBytes); - } else if ((kind == hipMemcpyDeviceToDevice) && !copyEngineCanSeeSrcAndDest) { - int depSignalCnt = preCopyCommand(crit, NULL, &depSignal, ihipCommandCopyP2P); - tprintf(DB_COPY1, "P2P but engine can't see both pointers: staged copy P2P dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); - //printf ("staged-copy- read dep signals\n"); - hsa_agent_t dstAgent = * (static_cast (dstPtrInfo._acc.get_hsa_agent())); - hsa_agent_t srcAgent = * (static_cast (srcPtrInfo._acc.get_hsa_agent())); - - device->_stagingBuffer[1]->CopyPeerToPeer(dst, dstAgent, src, srcAgent, sizeBytes, depSignalCnt ? &depSignal : NULL); - - // The copy completes before returning so can reset queue to empty: - this->wait(crit, true); - } else { - // If not special case - these can all be handled by the hsa async copy: - ihipCommand_t commandType; - hsa_agent_t srcAgent, dstAgent; - setAsyncCopyAgents(kind, &commandType, &srcAgent, &dstAgent); - - int depSignalCnt = preCopyCommand(crit, NULL, &depSignal, commandType); - - // Get a completion signal: - ihipSignal_t *ihipSignal = allocSignal(crit); - hsa_signal_t copyCompleteSignal = ihipSignal->_hsaSignal; - - hsa_signal_store_relaxed(copyCompleteSignal, 1); - - tprintf(DB_COPY1, "HSA Async_copy dst=%p src=%p sz=%zu\n", dst, src, sizeBytes); - - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, sizeBytes, depSignalCnt, depSignalCnt ? &depSignal:0x0, copyCompleteSignal); - - // This is sync copy, so let's wait for copy right here: - if (hsa_status == HSA_STATUS_SUCCESS) { - waitCopy(crit, ihipSignal); // wait for copy, and return to pool. 
- } else { - throw ihipException(hipErrorInvalidValue); - } - } - + crit->_av.copy(src, dst, sizeBytes); } From 9c9b0ab555efa8abef8fbb9fe3f57d91584ca753 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 19 Sep 2016 11:45:07 -0500 Subject: [PATCH 24/66] Change HIP async copy to call av::copy_async. Change-Id: I4274b63ced3940d5249c32bd9d156296529c70e8 --- src/hip_hcc.cpp | 48 +++++++++++++++++------------------------------- 1 file changed, 17 insertions(+), 31 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 8bd1e0be3c..e4365dab6f 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -1757,42 +1757,28 @@ void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsig trueAsync = false; } - if (kind == hipMemcpyDefault) { - bool srcInDeviceMem = (srcTracked && srcPtrInfo._isInDeviceMem); - bool dstInDeviceMem = (dstTracked && dstPtrInfo._isInDeviceMem); - kind = resolveMemcpyDirection(srcTracked, dstTracked, srcPtrInfo._isInDeviceMem, dstPtrInfo._isInDeviceMem); - } - - - ihipSignal_t *ihip_signal = allocSignal(crit); - hsa_signal_store_relaxed(ihip_signal->_hsaSignal, 1); - - - if(trueAsync == true){ - ihipCommand_t commandType; - hsa_agent_t srcAgent, dstAgent; - setAsyncCopyAgents(kind, &commandType, &srcAgent, &dstAgent); - hsa_signal_t depSignal; - int depSignalCnt = preCopyCommand(crit, ihip_signal, &depSignal, commandType); + if (trueAsync == true) { + // Perform a syncrhonous copy: + try { + crit->_av.copy_async(src, dst, sizeBytes); + } catch (Kalmar::runtime_exception) { + throw ihipException(hipErrorRuntimeOther); + }; - tprintf (DB_SYNC, " copy-async, waitFor=%lu completion=#%lu(%lu)\n", depSignalCnt? depSignal.handle:0x0, ihip_signal->_sigId, ihip_signal->_hsaSignal.handle); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent, sizeBytes, depSignalCnt, depSignalCnt ? 
&depSignal:0x0, ihip_signal->_hsaSignal); - - - if (hsa_status == HSA_STATUS_SUCCESS) { - if (HIP_LAUNCH_BLOCKING) { - tprintf(DB_SYNC, "LAUNCH_BLOCKING for completion of hipMemcpyAsync(%zu)\n", sizeBytes); - this->wait(crit); - } - } else { - // This path can be hit if src or dst point to unpinned host memory. - // TODO-stream - does async-copy fall back to sync if input pointers are not pinned? - throw ihipException(hipErrorInvalidValue); - } + if (HIP_LAUNCH_BLOCKING) { + tprintf(DB_SYNC, "LAUNCH_BLOCKING for completion of hipMemcpyAsync(%zu)\n", sizeBytes); + this->wait(crit); + } } else { + // Perform a syncrhonous copy: + if (kind == hipMemcpyDefault) { + bool srcInDeviceMem = (srcTracked && srcPtrInfo._isInDeviceMem); + bool dstInDeviceMem = (dstTracked && dstPtrInfo._isInDeviceMem); + kind = resolveMemcpyDirection(srcTracked, dstTracked, srcPtrInfo._isInDeviceMem, dstPtrInfo._isInDeviceMem); + } copySync(crit, dst, src, sizeBytes, kind); } } From 8c4cecf367bf2404f4dab739ac2aa49b7d7648b6 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 19 Sep 2016 16:34:10 -0500 Subject: [PATCH 25/66] Cleanup, remove preCopyCommand. 
Change-Id: I3768d3789a99be8136b43179d4152fa1875665cb --- include/hcc_detail/hip_hcc.h | 11 -------- src/hip_hcc.cpp | 50 ------------------------------------ 2 files changed, 61 deletions(-) diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h index fcf5c558c6..01f1b01313 100644 --- a/include/hcc_detail/hip_hcc.h +++ b/include/hcc_detail/hip_hcc.h @@ -56,7 +56,6 @@ extern int HIP_VISIBLE_DEVICES; /* Contains a comma-separated sequence of GPU id //--- // Chicken bits for disabling functionality to work around potential issues: extern int HIP_DISABLE_HW_KERNEL_DEP; -extern int HIP_DISABLE_HW_COPY_DEP; //--- //Extern tls @@ -90,15 +89,6 @@ extern const char *API_COLOR_END; #define CTX_THREAD_SAFE 1 -// If FORCE_COPY_DEP=1 , HIP runtime will add -// synchronization for copy commands in the same stream, regardless of command type. -// If FORCE_COPY_DEP=0 data copies of the same kind (H2H, H2D, D2H, D2D) are assumed to be implicitly ordered. -// ROCR runtime implementation currently provides this guarantee when using SDMA queues but not -// when using shader queues. -// TODO - measure if this matters for performance, in particular for back-to-back small copies. -// If not, we can simplify the copy dependency tracking by collapsing to a single Copy type, and always forcing dependencies for copy commands. -#define FORCE_SAMEDIR_COPY_DEP 1 - // Compile debug trace mode - this prints debug messages to stderr when env var HIP_DB is set. // May be set to 0 to remove debug if checks - possible code size and performance difference? @@ -447,7 +437,6 @@ typedef uint64_t SeqNum_t ; void copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind); - int preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *lastCopy, hsa_signal_t *waitSignal, ihipCommand_t copyType); //--- // Member functions that begin with locked_ are thread-safe accessors - these acquire / release the critical mutex. 
diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index e4365dab6f..ad76dbce77 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -81,7 +81,6 @@ int HIP_VISIBLE_DEVICES = 0; /* Contains a comma-separated sequence of GPU ident //--- // Chicken bits for disabling functionality to work around potential issues: int HIP_DISABLE_HW_KERNEL_DEP = 0; -int HIP_DISABLE_HW_COPY_DEP = 0; @@ -481,54 +480,6 @@ void ihipStream_t::lockclose_postKernelCommand(hc::completion_future &kernelFutu -//--- -// Called whenever a copy command is set to the stream. -// Examines the last command sent to this stream and returns a signal to wait on, if required. -int ihipStream_t::preCopyCommand(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *copyCompletionSignal, hsa_signal_t *waitSignal, ihipCommand_t copyType) -{ - int needSync = 0; - - waitSignal->handle = 0; - - // If switching command types, we need to add a barrier packet to synchronize things. - if (FORCE_SAMEDIR_COPY_DEP || (crit->_last_command_type != copyType)) { - - - if (crit->_last_command_type == ihipCommandKernel) { - tprintf (DB_SYNC, "stream %p switch %s to %s (async copy dep on prev kernel)\n", - this, ihipCommandName[crit->_last_command_type], ihipCommandName[copyType]); - needSync = 1; - ihipSignal_t *depSignal = allocSignal(crit); - hsa_signal_store_relaxed(depSignal->_hsaSignal,1); - this->enqueueBarrier(static_cast(crit->_av.get_hsa_queue()), NULL, depSignal); - *waitSignal = depSignal->_hsaSignal; - } else if (crit->_last_copy_signal) { - needSync = 1; - tprintf (DB_SYNC, "stream %p switch %s to %s (async copy dep on other copy #%lu)\n", - this, ihipCommandName[crit->_last_command_type], ihipCommandName[copyType], crit->_last_copy_signal->_sigId); - *waitSignal = crit->_last_copy_signal->_hsaSignal; - } - - if (HIP_DISABLE_HW_COPY_DEP && needSync) { - if (HIP_DISABLE_HW_COPY_DEP == -1) { - tprintf (DB_SYNC, "IGNORE copy dependency\n") - - } else { - tprintf (DB_SYNC, "HOST-wait for copy dependency\n") - // do the wait 
here on the host, and disable the device-side command resolution. - hsa_signal_wait_acquire(*waitSignal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - needSync = 0; - } - } - - crit->_last_command_type = copyType; - } - - crit->_last_copy_signal = copyCompletionSignal; - - return needSync; -} - // Precursor: the stream is already locked,specifically so this routine can enqueue work into the specified av. void ihipStream_t::launchModuleKernel( @@ -1286,7 +1237,6 @@ void ihipInit() READ_ENV_I(release, HIP_VISIBLE_DEVICES, CUDA_VISIBLE_DEVICES, "Only devices whose index is present in the secquence are visible to HIP applications and they are enumerated in the order of secquence" ); READ_ENV_I(release, HIP_DISABLE_HW_KERNEL_DEP, 0, "Disable HW dependencies before kernel commands - instead wait for dependency on host. -1 means ignore these dependencies. (debug mode)"); - READ_ENV_I(release, HIP_DISABLE_HW_COPY_DEP, 0, "Disable HW dependencies before copy commands - instead wait for dependency on host. -1 means ifnore these dependencies (debug mode)"); READ_ENV_I(release, HIP_NUM_KERNELS_INFLIGHT, 128, "Number of kernels per stream "); From 7530fa6dbea6bdbc3df9c39d52235e80cd644aed Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 19 Sep 2016 17:09:50 -0500 Subject: [PATCH 26/66] Remove HIP command dependency tracking. 
Change-Id: I991c13bc5108193959ba70f9f6f9c692c9ad3a5b --- include/hcc_detail/hip_hcc.h | 15 ---------- src/hip_event.cpp | 2 -- src/hip_hcc.cpp | 58 ++---------------------------------- 3 files changed, 3 insertions(+), 72 deletions(-) diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h index 01f1b01313..d853c59c3f 100644 --- a/include/hcc_detail/hip_hcc.h +++ b/include/hcc_detail/hip_hcc.h @@ -373,8 +373,6 @@ class ihipStreamCriticalBase_t : public LockedBase { public: ihipStreamCriticalBase_t(hc::accelerator_view av) : - _last_command_type(ihipCommandCopyH2H), - _last_copy_signal(NULL), _signalCursor(0), _oldest_live_sig_id(1), _streamSigId(0), @@ -392,15 +390,6 @@ class ihipStreamCriticalBase_t : public LockedBase ihipStreamCriticalBase_t * mlock() { LockedBase::lock(); return this;}; public: - // Critical Data: - ihipCommand_t _last_command_type; // type of the last command - - // signal of last copy command sent to the stream. - // May be NULL, indicating the previous command has completley finished and future commands don't need to create a dependency. - // Copy can be either H2D or D2H. - ihipSignal_t *_last_copy_signal; - - hc::completion_future _last_kernel_future; // Completion future of last kernel command sent to GPU. // Signal pool: int _signalCursor; @@ -444,7 +433,6 @@ typedef uint64_t SeqNum_t ; void lockclose_postKernelCommand(hc::completion_future &kernel_future); - void locked_reclaimSignals(SIGSEQNUM sigNum); void locked_wait(bool assertQueueEmpty=false); hc::accelerator_view* locked_getAv() { LockedAccessor_StreamCrit_t crit(_criticalData); return &(crit->_av); }; @@ -467,7 +455,6 @@ typedef uint64_t SeqNum_t ; void *kernarg, size_t kernSize, uint64_t kernel); // Non-threadsafe accessors - must be protected by high-level stream lock with accessor passed to function. - SIGSEQNUM lastCopySeqId (LockedAccessor_StreamCrit_t &crit) const { return crit->_last_copy_signal ? 
crit->_last_copy_signal->_sigId : 0; }; ihipSignal_t * allocSignal (LockedAccessor_StreamCrit_t &crit); @@ -526,8 +513,6 @@ struct ihipEvent_t { hc::completion_future _marker; uint64_t _timestamp; // store timestamp, may be set on host or by marker. - - SIGSEQNUM _copySeqId; } ; diff --git a/src/hip_event.cpp b/src/hip_event.cpp index 77d33cb6c2..52b25fc19b 100644 --- a/src/hip_event.cpp +++ b/src/hip_event.cpp @@ -39,7 +39,6 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags) eh->_stream = NULL; eh->_flags = flags; eh->_timestamp = 0; - eh->_copySeqId = 0; *event = eh; // TODO - allocat the event directly, no copy needed. } else { e = hipErrorInvalidValue; @@ -123,7 +122,6 @@ hipError_t hipEventSynchronize(hipEvent_t event) return ihipLogStatus(hipSuccess); } else { event->_marker.wait((event->_flags & hipEventBlockingSync) ? hc::hcWaitModeBlocked : hc::hcWaitModeActive); - event->_stream->locked_reclaimSignals(event->_copySeqId); return ihipLogStatus(hipSuccess); } diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index ad76dbce77..995efb584c 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -212,20 +212,6 @@ ihipStream_t::~ihipStream_t() } -//--- -//TODO - this function is dangerous since it does not propertly account -//for younger commands which may be depending on the signals we are reclaiming. -//Will fix when we move to HCC management of copy signals. -void ihipStream_t::locked_reclaimSignals(SIGSEQNUM sigNum) -{ - LockedAccessor_StreamCrit_t crit(_criticalData); - - tprintf(DB_SIGNAL, "reclaim signal #%lu\n", sigNum); - // Mark all signals older and including this one as available for re-allocation. 
- crit->_oldest_live_sig_id = sigNum+1; -} - - //--- void ihipStream_t::waitCopy(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *signal) { @@ -253,17 +239,7 @@ void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty waitOnAllCFs(crit); } - if (crit->_last_copy_signal) { - tprintf (DB_SYNC, "stream %p wait for lastCopy:#%lu...\n", this, lastCopySeqId(crit) ); - this->waitCopy(crit, crit->_last_copy_signal); - } - crit->_kernelCnt = 0; - - // Reset the stream to "empty" - next command will not set up an inpute dependency on any older signal. - crit->_last_command_type = ihipCommandCopyH2D; - crit->_last_copy_signal = NULL; -// crit->_signalCnt = 0; } void ihipStream_t::addCFtoStream(LockedAccessor_StreamCrit_t &crit, hc::completion_future *cf) @@ -309,7 +285,6 @@ void ihipStream_t::locked_recordEvent(hipEvent_t event) LockedAccessor_StreamCrit_t crit(_criticalData); event->_marker = crit->_av.create_marker(); - event->_copySeqId = lastCopySeqId(crit); } //============================================================================= @@ -422,42 +397,16 @@ void ihipStream_t::enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal, i int HIP_NUM_KERNELS_INFLIGHT = 128; //-- -//When the commands in a stream change types (ie kernel command follows a data command, -//or data command follows a kernel command), then we need to add a barrier packet -//into the stream to mimic CUDA stream semantics. (some hardware uses separate -//queues for data commands and kernel commands, and no implicit ordering is provided). -// +// Lock the stream to prevent other threads from intervening. 
LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand() { LockedAccessor_StreamCrit_t crit(_criticalData, false/*no unlock at destruction*/); - if(crit->_kernelCnt > HIP_NUM_KERNELS_INFLIGHT){ - this->wait(crit); + this->wait(crit); crit->_kernelCnt = 0; } crit->_kernelCnt++; - // If switching command types, we need to add a barrier packet to synchronize things. - if (crit->_last_command_type != ihipCommandKernel) { - if (crit->_last_copy_signal) { - - hsa_queue_t * q = (hsa_queue_t*) (crit->_av.get_hsa_queue()); - if (HIP_DISABLE_HW_KERNEL_DEP == 0) { - this->enqueueBarrier(q, crit->_last_copy_signal, NULL); - tprintf (DB_SYNC, "stream %p switch %s to %s (barrier pkt inserted with wait on #%lu)\n", - this, ihipCommandName[crit->_last_command_type], ihipCommandName[ihipCommandKernel], crit->_last_copy_signal->_sigId) - - } else if (HIP_DISABLE_HW_KERNEL_DEP>0) { - tprintf (DB_SYNC, "stream %p switch %s to %s (HOST wait for previous...)\n", - this, ihipCommandName[crit->_last_command_type], ihipCommandName[ihipCommandKernel]); - this->waitCopy(crit, crit->_last_copy_signal); - } else if (HIP_DISABLE_HW_KERNEL_DEP==-1) { - tprintf (DB_SYNC, "stream %p switch %s to %s (IGNORE dependency)\n", - this, ihipCommandName[crit->_last_command_type], ihipCommandName[ihipCommandKernel]); - } - } - crit->_last_command_type = ihipCommandKernel; - } return crit; } @@ -467,8 +416,6 @@ LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand() // Must be called after kernel finishes, this releases the lock on the stream so other commands can submit. 
void ihipStream_t::lockclose_postKernelCommand(hc::completion_future &kernelFuture) { - // We locked _criticalData in the lockopen_preKernelCommand() so OK to access here: - _criticalData._last_kernel_future = kernelFuture; if (HIP_LAUNCH_BLOCKING) { kernelFuture.wait(); @@ -1217,6 +1164,7 @@ void ihipInit() READ_ENV_I(release, HIP_PRINT_ENV, 0, "Print HIP environment variables."); //-- READ HIP_PRINT_ENV env first, since it has impact on later env var reading + // TODO: In HIP/hcc, this variable blocks after both kernel commmands and data transfer. Maybe should be bit-mask for each command type? READ_ENV_I(release, HIP_LAUNCH_BLOCKING, CUDA_LAUNCH_BLOCKING, "Make HIP APIs 'host-synchronous', so they block until any kernel launches or data copy commands complete. Alias: CUDA_LAUNCH_BLOCKING." ); READ_ENV_I(release, HIP_DB, 0, "Print various debug info. Bitmask, see hip_hcc.cpp for more information."); if ((HIP_DB & (1< Date: Mon, 19 Sep 2016 17:30:15 -0500 Subject: [PATCH 27/66] Cleanup: Remove HIP signal pool. 
Change-Id: Icebfd0509d12396cc5933d5556d68b53e1be36e0 --- include/hcc_detail/hip_hcc.h | 18 +-------- src/hip_hcc.cpp | 76 ------------------------------------ 2 files changed, 1 insertion(+), 93 deletions(-) diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h index d853c59c3f..3d1b43994e 100644 --- a/include/hcc_detail/hip_hcc.h +++ b/include/hcc_detail/hip_hcc.h @@ -48,7 +48,6 @@ extern int HIP_ATP_MARKER; extern int HIP_ATP; extern int HIP_DB; extern int HIP_STAGING_SIZE; /* size of staging buffers, in KB */ -extern int HIP_PININPLACE; extern int HIP_STREAM_SIGNALS; /* number of signals to allocate at stream creation */ extern int HIP_VISIBLE_DEVICES; /* Contains a comma-separated sequence of GPU identifiers */ @@ -373,33 +372,20 @@ class ihipStreamCriticalBase_t : public LockedBase { public: ihipStreamCriticalBase_t(hc::accelerator_view av) : - _signalCursor(0), - _oldest_live_sig_id(1), - _streamSigId(0), _kernelCnt(0), - _signalCnt(0), _av(av) { - _signalPool.resize(HIP_STREAM_SIGNALS > 0 ? HIP_STREAM_SIGNALS : 1); }; ~ihipStreamCriticalBase_t() { - _signalPool.clear(); } ihipStreamCriticalBase_t * mlock() { LockedBase::lock(); return this;}; public: + // TODO - remove _kernelCnt mechanism: - // Signal pool: - int _signalCursor; - SIGSEQNUM _oldest_live_sig_id; // oldest live seq_id, anything < this can be allocated. - std::deque _signalPool; // Pool of signals for use by this stream. - uint32_t _signalCnt; // Count of inflight commands using signals from the signal pool. - // Each copy may use 1-2 signals depending on command transitions: - // 2 are required if a barrier packet is inserted. uint32_t _kernelCnt; // Count of inflight kernels in this stream. Reset at ::wait(). - SIGSEQNUM _streamSigId; // Monotonically increasing unique signal id. 
hc::accelerator_view _av; @@ -454,8 +440,6 @@ typedef uint64_t SeqNum_t ; uint32_t groupSegmentSize, uint32_t sharedMemBytes, void *kernarg, size_t kernSize, uint64_t kernel); - // Non-threadsafe accessors - must be protected by high-level stream lock with accessor passed to function. - ihipSignal_t * allocSignal (LockedAccessor_StreamCrit_t &crit); //-- Non-racy accessors: diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 995efb584c..36146868db 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -69,8 +69,6 @@ int HIP_ATP_MARKER= 0; int HIP_DB= 0; int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ static const int HIP_STAGING_BUFFERS = 2; -int HIP_PININPLACE = 0; -int HIP_OPTIMAL_MEM_TRANSFER = 0; //ENV Variable to test different memory transfer logics int HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING = 0; int HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE = 0; int HIP_D2H_MEM_TRANSFER_THRESHOLD = 0; @@ -222,11 +220,6 @@ void ihipStream_t::waitCopy(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *sig tprintf(DB_SIGNAL, "waitCopy reclaim signal #%lu\n", sigNum); - // Mark all signals older and including this one as available for reclaim - if (sigNum > crit->_oldest_live_sig_id) { - crit->_oldest_live_sig_id = sigNum+1; // TODO, +1 here seems dangerous. - } - } //Wait for all kernel and data copy commands in this stream to complete. @@ -311,59 +304,6 @@ ihipCtx_t * ihipStream_t::getCtx() const #define HIP_NUM_SIGNALS_PER_STREAM 32 -//--- -// Allocate a new signal from the signal pool. -// Returned signals have value of 0. -// Signals are intended for use in this stream and are always reclaimed "in-order". 
-ihipSignal_t *ihipStream_t::allocSignal(LockedAccessor_StreamCrit_t &crit) -{ - int numToScan = crit->_signalPool.size(); - crit->_signalCnt++; - if(crit->_signalCnt == HIP_STREAM_SIGNALS){ - this->wait(crit); - crit->_signalCnt = 0; - } - - return &crit->_signalPool[crit->_signalCnt]; - - do { - auto thisCursor = crit->_signalCursor; - - if (++crit->_signalCursor == crit->_signalPool.size()) { - crit->_signalCursor = 0; - } - - if (crit->_signalPool[thisCursor]._sigId < crit->_oldest_live_sig_id) { - SIGSEQNUM oldSigId = crit->_signalPool[thisCursor]._sigId; - crit->_signalPool[thisCursor]._index = thisCursor; - crit->_signalPool[thisCursor]._sigId = ++crit->_streamSigId; // allocate it. - tprintf(DB_SIGNAL, "allocatSignal #%lu at pos:%i (old sigId:%lu < oldest_live:%lu)\n", - crit->_signalPool[thisCursor]._sigId, - thisCursor, oldSigId, crit->_oldest_live_sig_id); - - - - return &crit->_signalPool[thisCursor]; - } - - } while (--numToScan) ; - - assert(numToScan == 0); - - // Have to grow the pool: - crit->_signalCursor = crit->_signalPool.size(); // set to the beginning of the new entries: - if (crit->_signalCursor > 10000) { - fprintf (stderr, "warning: signal pool size=%d, may indicate runaway number of inflight commands\n", crit->_signalCursor); - } - crit->_signalPool.resize(crit->_signalPool.size() * 2); - tprintf (DB_SIGNAL, "grow signal pool to %zu entries, cursor=%d\n", crit->_signalPool.size(), crit->_signalCursor); - return allocSignal(crit); // try again, - - // Should never reach here. - assert(0); -} - - //--- void ihipStream_t::enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal, ihipSignal_t *completionSignal) { @@ -1176,8 +1116,6 @@ void ihipInit() READ_ENV_S(release, HIP_TRACE_API_COLOR, 0, "Color to use for HIP_API. 
None/Red/Green/Yellow/Blue/Magenta/Cyan/White"); READ_ENV_I(release, HIP_ATP_MARKER, 0, "Add HIP function begin/end to ATP file generated with CodeXL"); READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of each staging buffer (in KB)" ); - READ_ENV_I(release, HIP_PININPLACE, 0, "For unpinned transfers, pin the memory in-place in chunks before doing the copy."); - READ_ENV_I(release, HIP_OPTIMAL_MEM_TRANSFER, 0, "For optimal memory transfers for unpinned memory.Under testing."); READ_ENV_I(release, HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING, 0, "Threshold value for H2D unpinned memory transfer decision between direct copy or staging buffer usage,Under testing."); READ_ENV_I(release, HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE, 0, "Threshold value for H2D unpinned memory transfer decision between staging buffer usage or pininplace usage .Under testing."); READ_ENV_I(release, HIP_D2H_MEM_TRANSFER_THRESHOLD, 0, "Threshold value for D2H unpinned memory transfer decision between staging buffer usage or pininplace usage .Under testing."); @@ -1188,20 +1126,6 @@ void ihipInit() READ_ENV_I(release, HIP_NUM_KERNELS_INFLIGHT, 128, "Number of kernels per stream "); - if (HIP_OPTIMAL_MEM_TRANSFER && !HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING) { - HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING= MEMCPY_H2D_DIRECT_VS_STAGING_COPY_THRESHOLD; - fprintf (stderr, "warning: env var HIP_OPTIMAL_MEM_TRANSFER=0x%x but HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING=0.Using default value for this.\n", HIP_OPTIMAL_MEM_TRANSFER); - } - - if (HIP_OPTIMAL_MEM_TRANSFER && !HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE) { - HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE= MEMCPY_H2D_STAGING_VS_PININPLACE_COPY_THRESHOLD; - fprintf (stderr, "warning: env var HIP_OPTIMAL_MEM_TRANSFER=0x%x but HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE=0.Using default value for this.\n", HIP_OPTIMAL_MEM_TRANSFER); - } - - if (HIP_OPTIMAL_MEM_TRANSFER && 
!HIP_D2H_MEM_TRANSFER_THRESHOLD) { - HIP_D2H_MEM_TRANSFER_THRESHOLD= MEMCPY_D2H_STAGING_VS_PININPLACE_COPY_THRESHOLD; - fprintf (stderr, "warning: env var HIP_OPTIMAL_MEM_TRANSFER=0x%x but HIP_D2H_MEM_TRANSFER_THRESHOLD=0.Using default value for this.\n", HIP_OPTIMAL_MEM_TRANSFER); - } // Some flags have both compile-time and runtime flags - generate a warning if user enables the runtime flag but the compile-time flag is disabled. if (HIP_DB && !COMPILE_HIP_DB) { fprintf (stderr, "warning: env var HIP_DB=0x%x but COMPILE_HIP_DB=0. (perhaps enable COMPILE_HIP_DB in src code before compiling?)", HIP_DB); From e0ce1d395412bea66ee23962de11e0f8e994fa50 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 19 Sep 2016 17:49:25 -0500 Subject: [PATCH 28/66] Cleanup. Remove cfs, ihipSignal_t, staging buffer calls. Change-Id: I8bb67c484e3a65be06a03665f059217930da2bed --- include/hcc_detail/hip_hcc.h | 48 +--------- src/hip_hcc.cpp | 173 +---------------------------------- 2 files changed, 7 insertions(+), 214 deletions(-) diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h index 3d1b43994e..39c04a1afc 100644 --- a/include/hcc_detail/hip_hcc.h +++ b/include/hcc_detail/hip_hcc.h @@ -227,40 +227,6 @@ extern "C" { const hipStream_t hipStreamNull = 0x0; -enum ihipCommand_t { - ihipCommandCopyH2H, - ihipCommandCopyH2D, - ihipCommandCopyD2H, - ihipCommandCopyD2D, - ihipCommandCopyP2P, - ihipCommandKernel, -}; - -static const char* ihipCommandName[] = { - "CopyH2H", "CopyH2D", "CopyD2H", "CopyD2D", "CopyP2P", "Kernel" -}; - - - -typedef uint64_t SIGSEQNUM; - -//--- -// Small wrapper around signals. -// Designed to be used from stream. -// TODO-someday refactor this class so it can be stored in a vector<> -// we already store the index here so we can use for garbage collection. -struct ihipSignal_t { - hsa_signal_t _hsaSignal; // hsa signal handle - int _index; // Index in pool, used for garbage collection. - SIGSEQNUM _sigId; // unique sequentially increasing ID. 
- - ihipSignal_t(); - ~ihipSignal_t(); - - void release(); -}; - - // Used to remove lock, for performance or stimulating bugs. class FakeMutex { @@ -384,13 +350,8 @@ class ihipStreamCriticalBase_t : public LockedBase public: // TODO - remove _kernelCnt mechanism: - uint32_t _kernelCnt; // Count of inflight kernels in this stream. Reset at ::wait(). - hc::accelerator_view _av; - - std::vector _cfs; - }; @@ -426,8 +387,6 @@ typedef uint64_t SeqNum_t ; void locked_waitEvent(hipEvent_t event); void locked_recordEvent(hipEvent_t event); - void addCFtoStream(LockedAccessor_StreamCrit_t &crit, hc::completion_future* cf); - void waitOnAllCFs(LockedAccessor_StreamCrit_t &crit); //--- @@ -456,17 +415,14 @@ typedef uint64_t SeqNum_t ; private: - void enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal, ihipSignal_t *completionSignal); - void waitCopy(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *signal); // The unsigned return is hipMemcpyKind unsigned resolveMemcpyDirection(bool srcTracked, bool dstTracked, bool srcInDeviceMem, bool dstInDeviceMem); - void setAsyncCopyAgents(unsigned kind, ihipCommand_t *commandType, hsa_agent_t *srcAgent, hsa_agent_t *dstAgent); private: // Data - // Critical Data. THis MUST be accessed through LockedAccessor_StreamCrit_t + // Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t ihipStreamCritical_t _criticalData; ihipCtx_t *_ctx; // parent context that owns this stream. @@ -525,7 +481,7 @@ class ihipDevice_t unsigned _computeUnits; hipDeviceProp_t _props; // saved device properties. - UnpinnedCopyEngine *_stagingBuffer[2]; // one buffer for each direction. + // TODO - report this through device properties, base on HCC API call. 
int _isLargeBar; ihipCtx_t *_primaryCtx; diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 36146868db..c3624c5fe3 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -67,21 +67,8 @@ int HIP_TRACE_API= 0; std::string HIP_TRACE_API_COLOR("green"); int HIP_ATP_MARKER= 0; int HIP_DB= 0; -int HIP_STAGING_SIZE = 64; /* size of staging buffers, in KB */ -static const int HIP_STAGING_BUFFERS = 2; -int HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING = 0; -int HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE = 0; -int HIP_D2H_MEM_TRANSFER_THRESHOLD = 0; -int HIP_STREAM_SIGNALS = 32; /* number of signals to allocate at stream creation */ int HIP_VISIBLE_DEVICES = 0; /* Contains a comma-separated sequence of GPU identifiers */ - - -//--- -// Chicken bits for disabling functionality to work around potential issues: -int HIP_DISABLE_HW_KERNEL_DEP = 0; - - - +int HIP_NUM_KERNELS_INFLIGHT = 128; std::once_flag hip_initialized; @@ -96,9 +83,6 @@ std::vector g_hip_visible_devices; hsa_agent_t g_cpu_agent; -// TODO, remove these if possible: -hsa_agent_t gpu_agent_; -hsa_amd_memory_pool_t gpu_pool_; //================================================================================================= // Thread-local storage: @@ -166,29 +150,6 @@ hipError_t ihipSynchronize(void) return (hipSuccess); } -//================================================================================================= -// ihipSignal_t: -//================================================================================================= -// -//--- -ihipSignal_t::ihipSignal_t() : _sigId(0) -{ - if (hsa_signal_create(0/*value*/, 0, NULL, &_hsaSignal) != HSA_STATUS_SUCCESS) { - throw ihipException(hipErrorRuntimeMemory); - } - //tprintf (DB_SIGNAL, " allocated hsa_signal=%lu\n", (_hsaSignal.handle)); -} - -//--- -ihipSignal_t::~ihipSignal_t() -{ - tprintf (DB_SIGNAL, " destroy hsa_signal #%lu (#%lu)\n", (_hsaSignal.handle), _sigId); - if (hsa_signal_destroy(_hsaSignal) != HSA_STATUS_SUCCESS) { - 
throw ihipException(hipErrorRuntimeOther); - } -}; - - //================================================================================================= // ihipStream_t: @@ -210,47 +171,18 @@ ihipStream_t::~ihipStream_t() } -//--- -void ihipStream_t::waitCopy(LockedAccessor_StreamCrit_t &crit, ihipSignal_t *signal) -{ - SIGSEQNUM sigNum = signal->_sigId; - tprintf(DB_SYNC, "waitCopy signal:#%lu\n", sigNum); - - hsa_signal_wait_acquire(signal->_hsaSignal, HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - - - tprintf(DB_SIGNAL, "waitCopy reclaim signal #%lu\n", sigNum); -} - //Wait for all kernel and data copy commands in this stream to complete. //This signature should be used in routines that already have locked the stream mutex void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty) { if (! assertQueueEmpty) { tprintf (DB_SYNC, "stream %p wait for queue-empty..\n", this); -// crit->_av.wait(); - waitOnAllCFs(crit); + crit->_av.wait(); } crit->_kernelCnt = 0; } -void ihipStream_t::addCFtoStream(LockedAccessor_StreamCrit_t &crit, hc::completion_future *cf) -{ - crit->_cfs.push_back(cf); -} - -void ihipStream_t::waitOnAllCFs(LockedAccessor_StreamCrit_t &crit) -{ - for(uint32_t i=0;i_cfs.size();i++){ - if(crit->_cfs[i] != NULL){ - crit->_cfs[i]->wait(); - delete crit->_cfs[i]; - } - } - crit->_cfs.clear(); -} - //--- //Wait for all kernel and data copy commands in this stream to complete. 
void ihipStream_t::locked_wait(bool assertQueueEmpty) @@ -301,40 +233,7 @@ ihipCtx_t * ihipStream_t::getCtx() const }; -#define HIP_NUM_SIGNALS_PER_STREAM 32 - - -//--- -void ihipStream_t::enqueueBarrier(hsa_queue_t* queue, ihipSignal_t *depSignal, ihipSignal_t *completionSignal) -{ - - // Obtain the write index for the command queue - uint64_t index = hsa_queue_load_write_index_relaxed(queue); - const uint32_t queueMask = queue->size - 1; - - // Define the barrier packet to be at the calculated queue index address - hsa_barrier_and_packet_t* barrier = &(((hsa_barrier_and_packet_t*)(queue->base_address))[index&queueMask]); - memset(barrier, 0, sizeof(hsa_barrier_and_packet_t)); - - // setup header - uint16_t header = HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE; - header |= 1 << HSA_PACKET_HEADER_BARRIER; - //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; - //header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; - barrier->header = header; - - barrier->dep_signal[0].handle = depSignal ? depSignal->_hsaSignal.handle: 0; - - barrier->completion_signal.handle = completionSignal ? completionSignal->_hsaSignal.handle : 0; - - // TODO - check queue overflow, return error: - // Increment write index and ring doorbell to dispatch the kernel - hsa_queue_store_write_index_relaxed(queue, index+1); - hsa_signal_store_relaxed(queue->doorbell_signal, index); -} - -int HIP_NUM_KERNELS_INFLIGHT = 128; //-- // Lock the stream to prevent other threads from intervening. 
@@ -511,16 +410,6 @@ ihipDevice_t::ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerato initProperties(&_props); - _stagingBuffer[0] = new UnpinnedCopyEngine(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS, - _isLargeBar, - HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING, - HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE, - HIP_D2H_MEM_TRANSFER_THRESHOLD); - _stagingBuffer[1] = new UnpinnedCopyEngine(_hsaAgent,g_cpu_agent, HIP_STAGING_SIZE*1024, HIP_STAGING_BUFFERS, - _isLargeBar, - HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING, - HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE, - HIP_D2H_MEM_TRANSFER_THRESHOLD); _primaryCtx = new ihipCtx_t(this, deviceCnt, hipDeviceMapHost); } @@ -528,12 +417,8 @@ ihipDevice_t::ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerato ihipDevice_t::~ihipDevice_t() { - for (int i=0; i<2; i++) { - if (_stagingBuffer[i]) { - delete _stagingBuffer[i]; - _stagingBuffer[i] = NULL; - } - } + delete _primaryCtx; + _primaryCtx = NULL; } @@ -607,14 +492,6 @@ hsa_status_t GetDevicePool(hsa_amd_memory_pool_t pool, void* data) { return HSA_STATUS_SUCCESS; } -void FindDevicePool() -{ - hsa_status_t err = hsa_iterate_agents(FindGpuDevice, &gpu_agent_); - ErrorCheck(err); - - err = hsa_amd_agent_iterate_memory_pools(gpu_agent_, GetDevicePool, &gpu_pool_); - ErrorCheck(err); -} int checkAccess(hsa_agent_t agent, hsa_amd_memory_pool_t pool) { @@ -767,14 +644,7 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) /* Computemode for HSA Devices is always : cudaComputeModeDefault */ prop->computeMode = 0; - FindDevicePool(); - int access=checkAccess(g_cpu_agent, gpu_pool_); - if (0 != access){ - _isLargeBar= 1; - } else { - _isLargeBar=0; - } - + _isLargeBar = _acc.has_cpu_accessible_am(); // Get Max Threads Per Multiprocessor @@ -1115,14 +985,8 @@ void ihipInit() READ_ENV_I(release, HIP_TRACE_API, 0, "Trace each HIP API call. 
Print function name and return code to stderr as program executes."); READ_ENV_S(release, HIP_TRACE_API_COLOR, 0, "Color to use for HIP_API. None/Red/Green/Yellow/Blue/Magenta/Cyan/White"); READ_ENV_I(release, HIP_ATP_MARKER, 0, "Add HIP function begin/end to ATP file generated with CodeXL"); - READ_ENV_I(release, HIP_STAGING_SIZE, 0, "Size of each staging buffer (in KB)" ); - READ_ENV_I(release, HIP_H2D_MEM_TRANSFER_THRESHOLD_DIRECT_OR_STAGING, 0, "Threshold value for H2D unpinned memory transfer decision between direct copy or staging buffer usage,Under testing."); - READ_ENV_I(release, HIP_H2D_MEM_TRANSFER_THRESHOLD_STAGING_OR_PININPLACE, 0, "Threshold value for H2D unpinned memory transfer decision between staging buffer usage or pininplace usage .Under testing."); - READ_ENV_I(release, HIP_D2H_MEM_TRANSFER_THRESHOLD, 0, "Threshold value for D2H unpinned memory transfer decision between staging buffer usage or pininplace usage .Under testing."); - READ_ENV_I(release, HIP_STREAM_SIGNALS, 0, "Number of signals to allocate when new stream is created (signal pool will grow on demand)"); READ_ENV_I(release, HIP_VISIBLE_DEVICES, CUDA_VISIBLE_DEVICES, "Only devices whose index is present in the secquence are visible to HIP applications and they are enumerated in the order of secquence" ); - READ_ENV_I(release, HIP_DISABLE_HW_KERNEL_DEP, 0, "Disable HW dependencies before kernel commands - instead wait for dependency on host. -1 means ignore these dependencies. 
(debug mode)"); READ_ENV_I(release, HIP_NUM_KERNELS_INFLIGHT, 128, "Number of kernels per stream "); @@ -1281,7 +1145,6 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_ auto crit = stream->lockopen_preKernelCommand(); lp->av = &(crit->_av); lp->cf = new hc::completion_future; - stream->addCFtoStream(crit, lp->cf); ihipPrintKernelLaunch(kernelNameStr, lp, stream); return (stream); @@ -1304,7 +1167,6 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, gri auto crit = stream->lockopen_preKernelCommand(); lp->av = &(crit->_av); lp->cf = new hc::completion_future; - stream->addCFtoStream(crit, lp->cf); ihipPrintKernelLaunch(kernelNameStr, lp, stream); return (stream); } @@ -1326,7 +1188,6 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, gri auto crit = stream->lockopen_preKernelCommand(); lp->av = &(crit->_av); lp->cf = new hc::completion_future; - stream->addCFtoStream(crit, lp->cf); ihipPrintKernelLaunch(kernelNameStr, lp, stream); return (stream); } @@ -1349,7 +1210,6 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, g lp->av = &(crit->_av); lp->cf = new hc::completion_future; // TODO, is this necessary? - stream->addCFtoStream(crit, lp->cf); ihipPrintKernelLaunch(kernelNameStr, lp, stream); return (stream); @@ -1498,29 +1358,6 @@ unsigned ihipStream_t::resolveMemcpyDirection(bool srcTracked, bool dstTracked, } -// Setup the copyCommandType and the copy agents (for hsa_amd_memory_async_copy) -// srcPhysAcc is the physical location of the src data. For many copies this is the -void ihipStream_t::setAsyncCopyAgents(unsigned kind, ihipCommand_t *commandType, hsa_agent_t *srcAgent, hsa_agent_t *dstAgent) -{ - // current* represents the device associated with the specified stream. 
- const ihipDevice_t *streamDevice = this->getDevice(); - hsa_agent_t streamAgent = streamDevice->_hsaAgent; - - // ROCR runtime logic is : - // - If both src and dst are cpu agent, launch thread and memcpy. We want to avoid this. - // - If either/both src or dst is a gpu agent, use the first gpu agent’s DMA engine to perform the copy. - - switch (kind) { - //case hipMemcpyHostToHost : *commandType = ihipCommandCopyH2H; *srcAgent=streamAgent; *dstAgent=streamAgent; break; // TODO - enable me, for async copy use SDMA. - case hipMemcpyHostToHost : *commandType = ihipCommandCopyH2H; *srcAgent=g_cpu_agent; *dstAgent=g_cpu_agent; break; - case hipMemcpyHostToDevice : *commandType = ihipCommandCopyH2D; *srcAgent=g_cpu_agent; *dstAgent=streamAgent; break; - case hipMemcpyDeviceToHost : *commandType = ihipCommandCopyD2H; *srcAgent=streamAgent; *dstAgent=g_cpu_agent; break; - case hipMemcpyDeviceToDevice : *commandType = ihipCommandCopyD2D; *srcAgent=streamAgent; *dstAgent=streamAgent; break; - default: throw ihipException(hipErrorInvalidMemcpyDirection); - }; -} - - void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn) { ihipCtx_t *ctx = this->getCtx(); From c645e53fddb72896232a56944e7efde904a9773e Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 19 Sep 2016 18:04:44 -0500 Subject: [PATCH 29/66] Remove unpinned_copy code. Other cleanup. 
Change-Id: Ie3f71439cf1ba729ef223d078917c403d3de879a --- CMakeLists.txt | 1 - include/hcc_detail/hip_hcc.h | 6 +- include/hcc_detail/unpinned_copy_engine.h | 88 ---- src/hip_hcc.cpp | 12 - src/unpinned_copy_engine.cpp | 467 ---------------------- 5 files changed, 1 insertion(+), 573 deletions(-) delete mode 100644 include/hcc_detail/unpinned_copy_engine.h delete mode 100644 src/unpinned_copy_engine.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 31c80af74d..5ee852611d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,7 +181,6 @@ if(HIP_PLATFORM STREQUAL "hcc") src/hip_peer.cpp src/hip_stream.cpp src/hip_fp16.cpp - src/unpinned_copy_engine.cpp src/hip_module.cpp) if(${HIP_LIB_TYPE} EQUAL 0) diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h index 39c04a1afc..2f85f83851 100644 --- a/include/hcc_detail/hip_hcc.h +++ b/include/hcc_detail/hip_hcc.h @@ -21,8 +21,8 @@ THE SOFTWARE. #define HIP_HCC_H #include +#include #include "hip/hcc_detail/hip_util.h" -#include "hip/hcc_detail/unpinned_copy_engine.h" #if defined(__HCC__) && (__hcc_workweek__ < 16354) @@ -109,10 +109,6 @@ extern const char *API_COLOR_END; #endif -// #include CPP files to produce one object file -#define ONE_OBJECT_FILE 0 - - // Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function boundary. // TODO - currently we print the trace message at the beginning. if we waited, we could also include return codes, and any values returned // through ptr-to-args (ie the pointers allocated by hipMalloc). diff --git a/include/hcc_detail/unpinned_copy_engine.h b/include/hcc_detail/unpinned_copy_engine.h deleted file mode 100644 index 678d714981..0000000000 --- a/include/hcc_detail/unpinned_copy_engine.h +++ /dev/null @@ -1,88 +0,0 @@ -/* -Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
-Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR -IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -//#pragma once -#ifndef STAGING_BUFFER_H -#define STAGING_BUFFER_H - -#include "hsa/hsa.h" - - -//------------------------------------------------------------------------------------------------- -// An optimized "staging buffer" used to implement Host-To-Device and Device-To-Host copies. -// Some GPUs may not be able to directly access host memory, and in these cases we need to -// stage the copy through a pinned staging buffer. For example, the CopyHostToDevice -// uses the CPU to copy to a pinned "staging buffer", and then use the GPU DMA engine to copy -// from the staging buffer to the final destination. The copy is broken into buffer-sized chunks -// to limit the size of the buffer and also to provide better performance by overlapping the CPU copies -// with the DMA copies. -// -// PinInPlace is another algorithm which pins the host memory "in-place", and copies it with the DMA -// engine. This routine is under development. 
-// -// Staging buffer provides thread-safe access via a mutex. -struct UnpinnedCopyEngine { - - enum CopyMode {ChooseBest, UsePinInPlace, UseStaging, UseMemcpy} ; - - static const int _max_buffers = 4; - - UnpinnedCopyEngine(hsa_agent_t hsaAgent,hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers, - bool isLargeBar, int thresholdH2D_directStaging, int thresholdH2D_stagingPinInPlace, int thresholdD2H) ; - ~UnpinnedCopyEngine(); - - // Use hueristic to choose best copy algorithm - void CopyHostToDevice(CopyMode copyMode, void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); - void CopyDeviceToHost(CopyMode copyMode, void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); - - - // Specific H2D copy algorithm implementations: - void CopyHostToDeviceStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); - void CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); - void CopyHostToDeviceMemcpy(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); - - - // Specific D2H copy algorithm implementations: - void CopyDeviceToHostStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); - void CopyDeviceToHostPinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor); - - - // P2P Copy implementation: - void CopyPeerToPeer( void* dst, hsa_agent_t dstAgent, const void* src, hsa_agent_t srcAgent, size_t sizeBytes, hsa_signal_t *waitFor); - - -private: - hsa_agent_t _hsaAgent; - hsa_agent_t _cpuAgent; - size_t _bufferSize; // Size of the buffers. - int _numBuffers; - - // True if system supports large-bar and thus can benefit from CPU directly performing copy operation. - bool _isLargeBar; - - char *_pinnedStagingBuffer[_max_buffers]; - hsa_signal_t _completionSignal[_max_buffers]; - hsa_signal_t _completionSignal2[_max_buffers]; // P2P needs another set of signals. 
- std::mutex _copyLock; // provide thread-safe access - size_t _hipH2DTransferThresholdDirectOrStaging; - size_t _hipH2DTransferThresholdStagingOrPininplace; - size_t _hipD2HTransferThreshold; -}; - -#endif diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index c3624c5fe3..8f15b25a60 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -1480,16 +1480,4 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a return ihipLogStatus(err); } -// TODO - review signal / error reporting code. -// TODO - describe naming convention. ihip _. No accessors. No early returns from functions. Set status to success at top, only set error codes in implementation. No tabs. -// Caps convention _ or camelCase -// if { } -// Should use ihip* data structures inside code rather than app-facing hip. For example, use ihipCtx_t (rather than hipDevice_t), ihipStream_t (rather than hipStream_t). -// locked_ -// TODO - describe MT strategy -// //// TODO - add identifier numbers for streams and devices to help with debugging. - -#if ONE_OBJECT_FILE -#include "unpinned_copy_engine.cpp" -#endif diff --git a/src/unpinned_copy_engine.cpp b/src/unpinned_copy_engine.cpp deleted file mode 100644 index 4ae6990180..0000000000 --- a/src/unpinned_copy_engine.cpp +++ /dev/null @@ -1,467 +0,0 @@ -/* -Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. 
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR -IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include - -#include - -#include "hcc_detail/unpinned_copy_engine.h" - -#ifdef HIP_HCC -#include "hcc_detail/hip_runtime.h" -#include "hcc_detail/hip_hcc.h" -#define THROW_ERROR(e) throw ihipException(e) -#else -#define THROW_ERROR(e) throw -#define tprintf(trace_level, ...) -#endif - -void errorCheck(hsa_status_t hsa_error_code, int line_num, std::string str) { - if ((hsa_error_code != HSA_STATUS_SUCCESS)&& (hsa_error_code != HSA_STATUS_INFO_BREAK)) { - printf("HSA reported error!\n In file: %s\nAt line: %d\n", str.c_str(),line_num); - } -} - -#define ErrorCheck(x) errorCheck(x, __LINE__, __FILE__) -hsa_amd_memory_pool_t sys_pool_; - -hsa_status_t findGlobalPool(hsa_amd_memory_pool_t pool, void* data) { - if (NULL == data) { - return HSA_STATUS_ERROR_INVALID_ARGUMENT; - } - - hsa_status_t err; - hsa_amd_segment_t segment; - uint32_t flag; - err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &segment); - ErrorCheck(err); - - err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &flag); - ErrorCheck(err); - if ((HSA_AMD_SEGMENT_GLOBAL == segment) && - (flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)) { - *((hsa_amd_memory_pool_t*)data) = pool; - } - return HSA_STATUS_SUCCESS; -} - -//------------------------------------------------------------------------------------------------- -UnpinnedCopyEngine::UnpinnedCopyEngine(hsa_agent_t hsaAgent, hsa_agent_t cpuAgent, size_t bufferSize, int numBuffers, - bool isLargeBar, int 
thresholdH2DDirectStaging, - int thresholdH2DStagingPinInPlace, int thresholdD2H) : - _hsaAgent(hsaAgent), - _cpuAgent(cpuAgent), - _bufferSize(bufferSize), - _numBuffers(numBuffers > _max_buffers ? _max_buffers : numBuffers), - _isLargeBar(isLargeBar), - _hipH2DTransferThresholdDirectOrStaging(thresholdH2DDirectStaging), - _hipH2DTransferThresholdStagingOrPininplace(thresholdH2DStagingPinInPlace), - _hipD2HTransferThreshold(thresholdD2H) -{ - hsa_status_t err = hsa_amd_agent_iterate_memory_pools(_cpuAgent, findGlobalPool, &sys_pool_); - ErrorCheck(err); - for (int i=0; i<_numBuffers; i++) { - // TODO - experiment with alignment here. - err = hsa_amd_memory_pool_allocate(sys_pool_, _bufferSize, 0, (void**)(&_pinnedStagingBuffer[i])); - ErrorCheck(err); - - if ((err != HSA_STATUS_SUCCESS) || (_pinnedStagingBuffer[i] == NULL)) { - THROW_ERROR(hipErrorMemoryAllocation); - } - - err = hsa_amd_agents_allow_access(1, &hsaAgent, NULL, _pinnedStagingBuffer[i]); - ErrorCheck(err); - - hsa_signal_create(0, 0, NULL, &_completionSignal[i]); - hsa_signal_create(0, 0, NULL, &_completionSignal2[i]); - } - -}; - - -//--- -UnpinnedCopyEngine::~UnpinnedCopyEngine() -{ - for (int i=0; i<_numBuffers; i++) { - if (_pinnedStagingBuffer[i]) { - hsa_amd_memory_pool_free(_pinnedStagingBuffer[i]); - _pinnedStagingBuffer[i] = NULL; - } - hsa_signal_destroy(_completionSignal[i]); - hsa_signal_destroy(_completionSignal2[i]); - } -} - - - -//--- -//Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy -//IN: dst - dest pointer - must be accessible from host CPU. -//IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _hsaAgent) -//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. 
-void UnpinnedCopyEngine::CopyHostToDevicePinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) -{ - std::lock_guard l (_copyLock); - - const char *srcp = static_cast (src); - char *dstp = static_cast (dst); - - for (int i=0; i<_numBuffers; i++) { - hsa_signal_store_relaxed(_completionSignal[i], 0); - } - - if (sizeBytes >= UINT64_MAX/2) { - THROW_ERROR (hipErrorInvalidValue); - } - int bufferIndex = 0; - - size_t theseBytes= sizeBytes; - //tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completionSignal[bufferIndex].handle); - //hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - - //void * masked_srcp = (void*) ((uintptr_t)srcp & (uintptr_t)(~0x3f)) ; // TODO - void *locked_srcp; - //hsa_status_t hsa_status = hsa_amd_memory_lock(masked_srcp, theseBytes, &_hsaAgent, 1, &locked_srcp); - hsa_status_t hsa_status = hsa_amd_memory_lock(const_cast (srcp), theseBytes, &_hsaAgent, 1, &locked_srcp); - //tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: pin-in-place:%p+%zu bufferIndex[%d]\n", bytesRemaining, srcp, theseBytes, bufferIndex); - //printf ("status=%x srcp=%p, masked_srcp=%p, locked_srcp=%p\n", hsa_status, srcp, masked_srcp, locked_srcp); - - if (hsa_status != HSA_STATUS_SUCCESS) { - THROW_ERROR (hipErrorRuntimeMemory); - } - - hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1); - - hsa_status = hsa_amd_memory_async_copy(dstp, _hsaAgent, locked_srcp, _cpuAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]); - //tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); - - if (hsa_status != HSA_STATUS_SUCCESS) { - THROW_ERROR (hipErrorRuntimeMemory); - } - tprintf (DB_COPY2, "H2D: waiting... 
on completion signal handle=%lu\n", _completionSignal[bufferIndex].handle); - hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - hsa_amd_memory_unlock(const_cast (srcp)); - // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 - waitFor = NULL; -} - - -// Copy using simple memcpy. Only works on large-bar systems. -void UnpinnedCopyEngine::CopyHostToDeviceMemcpy(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) -{ - if (!_isLargeBar) { - THROW_ERROR (hipErrorInvalidValue); - } - - memcpy(dst,src,sizeBytes); - std::atomic_thread_fence(std::memory_order_release); -}; - - - -void UnpinnedCopyEngine::CopyHostToDevice(UnpinnedCopyEngine::CopyMode copyMode, void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) -{ - if (copyMode == ChooseBest) { - if (_isLargeBar && (sizeBytes < _hipH2DTransferThresholdDirectOrStaging)) { - copyMode = UseMemcpy; - } else if (sizeBytes > _hipH2DTransferThresholdStagingOrPininplace) { - copyMode = UsePinInPlace; - } else { - copyMode = UseStaging; - } - } - - if (copyMode == UseMemcpy) { - - CopyHostToDeviceMemcpy(dst, src, sizeBytes, waitFor); - - - } else if (copyMode == UsePinInPlace) { - CopyHostToDevicePinInPlace(dst, src, sizeBytes, waitFor); - - } else if (copyMode == UseStaging) { - CopyHostToDeviceStaging(dst, src, sizeBytes, waitFor); - - } else { - // Unknown copy mode. - THROW_ERROR(hipErrorInvalidValue); - } -} - - -//--- -//Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy -//IN: dst - dest pointer - must be accessible from host CPU. -//IN: src - src pointer for copy. Must be accessible from agent this buffer is associated with (via _hsaAgent) -//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. 
-void UnpinnedCopyEngine::CopyHostToDeviceStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) -{ - { - std::lock_guard l (_copyLock); - - const char *srcp = static_cast (src); - char *dstp = static_cast (dst); - - for (int i=0; i<_numBuffers; i++) { - hsa_signal_store_relaxed(_completionSignal[i], 0); - } - - if (sizeBytes >= UINT64_MAX/2) { - THROW_ERROR (hipErrorInvalidValue); - } - int bufferIndex = 0; - for (int64_t bytesRemaining=sizeBytes; bytesRemaining>0 ; bytesRemaining -= _bufferSize) { - - size_t theseBytes = (bytesRemaining > _bufferSize) ? _bufferSize : bytesRemaining; - - tprintf (DB_COPY2, "H2D: waiting... on completion signal handle=%lu\n", _completionSignal[bufferIndex].handle); - hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - - tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: copy %zu bytes %p to stagingBuf[%d]:%p\n", bytesRemaining, theseBytes, srcp, bufferIndex, _pinnedStagingBuffer[bufferIndex]); - // TODO - use uncached memcpy, someday. - memcpy(_pinnedStagingBuffer[bufferIndex], srcp, theseBytes); - - - hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp, _hsaAgent, _pinnedStagingBuffer[bufferIndex], _cpuAgent, theseBytes, waitFor ? 
1:0, waitFor, _completionSignal[bufferIndex]); - tprintf (DB_COPY2, "H2D: bytesRemaining=%zu: async_copy %zu bytes %p to %p status=%x\n", bytesRemaining, theseBytes, _pinnedStagingBuffer[bufferIndex], dstp, hsa_status); - if (hsa_status != HSA_STATUS_SUCCESS) { - THROW_ERROR ((hipErrorRuntimeMemory)); - } - - srcp += theseBytes; - dstp += theseBytes; - if (++bufferIndex >= _numBuffers) { - bufferIndex = 0; - } - - // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 - waitFor = NULL; - } - - - for (int i=0; i<_numBuffers; i++) { - hsa_signal_wait_acquire(_completionSignal[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - } - } -} - - -void UnpinnedCopyEngine::CopyDeviceToHostPinInPlace(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) -{ - std::lock_guard l (_copyLock); - - const char *srcp = static_cast (src); - char *dstp = static_cast (dst); - - for (int i=0; i<_numBuffers; i++) { - hsa_signal_store_relaxed(_completionSignal[i], 0); - } - - if (sizeBytes >= UINT64_MAX/2) { - THROW_ERROR (hipErrorInvalidValue); - } - int bufferIndex = 0; - size_t theseBytes= sizeBytes; - void *locked_destp; - - hsa_status_t hsa_status = hsa_amd_memory_lock(const_cast (dstp), theseBytes, &_hsaAgent, 1, &locked_destp); - - - if (hsa_status != HSA_STATUS_SUCCESS) { - THROW_ERROR (hipErrorRuntimeMemory); - } - - hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1); - - hsa_status = hsa_amd_memory_async_copy(locked_destp,_cpuAgent , srcp, _hsaAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]); - - if (hsa_status != HSA_STATUS_SUCCESS) { - THROW_ERROR (hipErrorRuntimeMemory); - } - tprintf (DB_COPY2, "D2H: waiting... 
on completion signal handle=%lu\n", _completionSignal[bufferIndex].handle); - hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - hsa_amd_memory_unlock(const_cast (dstp)); - - // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 - waitFor = NULL; -} - - -void UnpinnedCopyEngine::CopyDeviceToHost(CopyMode copyMode ,void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) -{ - if (copyMode == ChooseBest) { - if (sizeBytes > _hipD2HTransferThreshold) { - copyMode = UsePinInPlace; - } else { - copyMode = UseStaging; - } - } - - - if (copyMode == UsePinInPlace) { - CopyDeviceToHostPinInPlace(dst, src, sizeBytes, waitFor); - } else if (copyMode == UseStaging) { - CopyDeviceToHostStaging(dst, src, sizeBytes, waitFor); - } else { - // Unknown copy mode. - THROW_ERROR(hipErrorInvalidValue); - } -} - -//--- -//Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy -//IN: dst - dest pointer - must be accessible from agent this buffer is associated with (via _hsaAgent). -//IN: src - src pointer for copy. Must be accessible from host CPU. -//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. -void UnpinnedCopyEngine::CopyDeviceToHostStaging(void* dst, const void* src, size_t sizeBytes, hsa_signal_t *waitFor) -{ - { - std::lock_guard l (_copyLock); - - const char *srcp0 = static_cast (src); - char *dstp1 = static_cast (dst); - - for (int i=0; i<_numBuffers; i++) { - hsa_signal_store_relaxed(_completionSignal[i], 0); - } - - if (sizeBytes >= UINT64_MAX/2) { - THROW_ERROR (hipErrorInvalidValue); - } - - int64_t bytesRemaining0 = sizeBytes; // bytes to copy from dest into staging buffer. 
- int64_t bytesRemaining1 = sizeBytes; // bytes to copy from staging buffer into final dest - - while (bytesRemaining1 > 0) - { - // First launch the async copies to copy from dest to host - for (int bufferIndex = 0; (bytesRemaining0>0) && (bufferIndex < _numBuffers); bytesRemaining0 -= _bufferSize, bufferIndex++) { - - size_t theseBytes = (bytesRemaining0 > _bufferSize) ? _bufferSize : bytesRemaining0; - - tprintf (DB_COPY2, "D2H: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); - hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpuAgent, srcp0, _hsaAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]); - if (hsa_status != HSA_STATUS_SUCCESS) { - THROW_ERROR (hipErrorRuntimeMemory); - } - - srcp0 += theseBytes; - - - // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 - waitFor = NULL; - } - - // Now unload the staging buffers: - for (int bufferIndex=0; (bytesRemaining1>0) && (bufferIndex < _numBuffers); bytesRemaining1 -= _bufferSize, bufferIndex++) { - - size_t theseBytes = (bytesRemaining1 > _bufferSize) ? 
_bufferSize : bytesRemaining1; - - tprintf (DB_COPY2, "D2H: wait_completion[%d] bytesRemaining=%zu\n", bufferIndex, bytesRemaining1); - hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - - tprintf (DB_COPY2, "D2H: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to dst:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); - memcpy(dstp1, _pinnedStagingBuffer[bufferIndex], theseBytes); - - dstp1 += theseBytes; - } - } - } -} - - -//--- -//Copies sizeBytes from src to dst, using either a copy to a staging buffer or a staged pin-in-place strategy -//IN: dst - dest pointer - must be accessible from agent this buffer is associated with (via _hsaAgent). -//IN: src - src pointer for copy. Must be accessible from host CPU. -//IN: waitFor - hsaSignal to wait for - the copy will begin only when the specified dependency is resolved. May be NULL indicating no dependency. -void UnpinnedCopyEngine::CopyPeerToPeer(void* dst, hsa_agent_t dstAgent, const void* src, hsa_agent_t srcAgent, size_t sizeBytes, hsa_signal_t *waitFor) -{ - std::lock_guard l (_copyLock); - - const char *srcp0 = static_cast (src); - char *dstp1 = static_cast (dst); - - for (int i=0; i<_numBuffers; i++) { - hsa_signal_store_relaxed(_completionSignal[i], 0); - hsa_signal_store_relaxed(_completionSignal2[i], 0); - } - - if (sizeBytes >= UINT64_MAX/2) { - THROW_ERROR (hipErrorInvalidValue); - } - - int64_t bytesRemaining0 = sizeBytes; // bytes to copy from dest into staging buffer. - int64_t bytesRemaining1 = sizeBytes; // bytes to copy from staging buffer into final dest - - while (bytesRemaining1 > 0) { - // First launch the async copies to copy from dest to host - for (int bufferIndex = 0; (bytesRemaining0>0) && (bufferIndex < _numBuffers); bytesRemaining0 -= _bufferSize, bufferIndex++) { - - size_t theseBytes = (bytesRemaining0 > _bufferSize) ? 
_bufferSize : bytesRemaining0; - - // Wait to make sure we are not overwriting a buffer before it has been drained: - hsa_signal_wait_acquire(_completionSignal2[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - - tprintf (DB_COPY2, "P2P: bytesRemaining0=%zu async_copy %zu bytes src:%p to staging:%p\n", bytesRemaining0, theseBytes, srcp0, _pinnedStagingBuffer[bufferIndex]); - hsa_signal_store_relaxed(_completionSignal[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(_pinnedStagingBuffer[bufferIndex], _cpuAgent, srcp0, srcAgent, theseBytes, waitFor ? 1:0, waitFor, _completionSignal[bufferIndex]); - if (hsa_status != HSA_STATUS_SUCCESS) { - THROW_ERROR (hipErrorRuntimeMemory); - } - - srcp0 += theseBytes; - - - // Assume subsequent commands are dependent on previous and don't need dependency after first copy submitted, HIP_ONESHOT_COPY_DEP=1 - waitFor = NULL; - } - - // Now unload the staging buffers: - for (int bufferIndex=0; (bytesRemaining1>0) && (bufferIndex < _numBuffers); bytesRemaining1 -= _bufferSize, bufferIndex++) { - - size_t theseBytes = (bytesRemaining1 > _bufferSize) ? _bufferSize : bytesRemaining1; - - tprintf (DB_COPY2, "P2P: wait_completion[%d] bytesRemaining=%zu\n", bufferIndex, bytesRemaining1); - - bool hostWait = 0; // TODO - remove me - - if (hostWait) { - // Host-side wait, should not be necessary: - hsa_signal_wait_acquire(_completionSignal[bufferIndex], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - } - - tprintf (DB_COPY2, "P2P: bytesRemaining1=%zu copy %zu bytes stagingBuf[%d]:%p to device:%p\n", bytesRemaining1, theseBytes, bufferIndex, _pinnedStagingBuffer[bufferIndex], dstp1); - hsa_signal_store_relaxed(_completionSignal2[bufferIndex], 1); - hsa_status_t hsa_status = hsa_amd_memory_async_copy(dstp1, dstAgent, _pinnedStagingBuffer[bufferIndex], _cpuAgent /*not used*/, theseBytes, - hostWait ? 0:1, hostWait ? 
NULL : &_completionSignal[bufferIndex], - _completionSignal2[bufferIndex]); - - dstp1 += theseBytes; - } - } - - - // Wait for the staging-buffer to dest copies to complete: - for (int i=0; i<_numBuffers; i++) { - hsa_signal_wait_acquire(_completionSignal2[i], HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX, HSA_WAIT_STATE_ACTIVE); - } -} From ff28ff15099549a57d07a7ceb089c56682360768 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 22 Sep 2016 10:37:54 -0500 Subject: [PATCH 30/66] Fix HIP_INIT_API and ihipLogStatus calls --- src/hip_peer.cpp | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/hip_peer.cpp b/src/hip_peer.cpp index 731b872c74..63ac902a23 100644 --- a/src/hip_peer.cpp +++ b/src/hip_peer.cpp @@ -65,13 +65,10 @@ hipError_t hipDeviceCanAccessPeer (int* canAccessPeer, hipCtx_t thisCtx, hipCtx_ // Remove this device from peer device peerlist. hipError_t ihipDisablePeerAccess (hipCtx_t peerCtx) { - HIP_INIT_API(peerCtx); - hipError_t err = hipSuccess; auto thisCtx = ihipGetTlsDefaultCtx(); if ((thisCtx != NULL) && (peerCtx != NULL)) { - // Return true if thisCtx can access peerCtx's memory: bool canAccessPeer = peerCtx->getDevice()->_acc.get_is_peer(thisCtx->getDevice()->_acc); if (! canAccessPeer) { @@ -92,16 +89,15 @@ hipError_t ihipDisablePeerAccess (hipCtx_t peerCtx) err = hipErrorInvalidDevice; } - return ihipLogStatus(err); + return err; }; //--- -// Allow the current device to see all memory allocated on peerDevice. +// Allow the current device to see all memory allocated on peerCtx. // This should add this device to the peer-device peer list. hipError_t ihipEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags) { - HIP_INIT_API(peerCtx, flags); hipError_t err = hipSuccess; if (flags != 0) { @@ -112,6 +108,7 @@ hipError_t ihipEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags) err = hipErrorInvalidDevice; // Can't enable peer access to self. 
} else if ((thisCtx != NULL) && (peerCtx != NULL)) { LockedAccessor_CtxCrit_t peerCrit(peerCtx->criticalData()); + // Add thisCtx to peerCtx's access list so that new allocations on peer will be made visible to this device: bool isNewPeer = peerCrit->addPeer(thisCtx); if (isNewPeer) { am_memtracker_update_peers(peerCtx->getDevice()->_acc, peerCrit->peerCnt(), peerCrit->peerAgents()); @@ -123,7 +120,7 @@ hipError_t ihipEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags) } } - return ihipLogStatus(err); + return err; } @@ -132,6 +129,7 @@ hipError_t hipMemcpyPeer (void* dst, hipCtx_t dstCtx, const void* src, hipCtx_t { HIP_INIT_API(dst, dstCtx, src, srcCtx, sizeBytes); + // TODO - move to ihip memory copy implementation. // HCC has a unified memory architecture so device specifiers are not required. return hipMemcpy(dst, src, sizeBytes, hipMemcpyDefault); }; @@ -141,6 +139,8 @@ hipError_t hipMemcpyPeer (void* dst, hipCtx_t dstCtx, const void* src, hipCtx_t hipError_t hipMemcpyPeerAsync (void* dst, hipCtx_t dstDevice, const void* src, hipCtx_t srcDevice, size_t sizeBytes, hipStream_t stream) { HIP_INIT_API(dst, dstDevice, src, srcDevice, sizeBytes, stream); + + // TODO - move to ihip memory copy implementation. // HCC has a unified memory architecture so device specifiers are not required. 
return hipMemcpyAsync(dst, src, sizeBytes, hipMemcpyDefault, stream); }; @@ -163,7 +163,7 @@ hipError_t hipDeviceDisablePeerAccess (int peerDeviceId) { HIP_INIT_API(peerDeviceId); - return ihipDisablePeerAccess(ihipGetPrimaryCtx(peerDeviceId)); + return ihipLogStatus(ihipDisablePeerAccess(ihipGetPrimaryCtx(peerDeviceId))); } @@ -171,7 +171,7 @@ hipError_t hipDeviceEnablePeerAccess (int peerDeviceId, unsigned int flags) { HIP_INIT_API(peerDeviceId, flags); - return ihipEnablePeerAccess(ihipGetPrimaryCtx(peerDeviceId), flags); + return ihipLogStatus(ihipEnablePeerAccess(ihipGetPrimaryCtx(peerDeviceId), flags)); } @@ -192,12 +192,12 @@ hipError_t hipCtxEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags) { HIP_INIT_API(peerCtx, flags); - return ihipEnablePeerAccess(peerCtx, flags); + return ihipLogStatus(ihipEnablePeerAccess(peerCtx, flags)); } hipError_t hipCtxDisablePeerAccess (hipCtx_t peerCtx) { HIP_INIT_API(peerCtx); - return ihipDisablePeerAccess(peerCtx); + return ihipLogStatus(ihipDisablePeerAccess(peerCtx)); } From 0d47e62cb03dbcf5906c5981c8b796396e2c9c89 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 22 Sep 2016 10:38:52 -0500 Subject: [PATCH 31/66] Add kind translation for hipMemcpyDeviceToDevice --- include/nvcc_detail/hip_runtime_api.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/include/nvcc_detail/hip_runtime_api.h b/include/nvcc_detail/hip_runtime_api.h index 9fea77a904..eff2aafe17 100644 --- a/include/nvcc_detail/hip_runtime_api.h +++ b/include/nvcc_detail/hip_runtime_api.h @@ -140,15 +140,17 @@ switch(hError) { } inline static cudaMemcpyKind hipMemcpyKindToCudaMemcpyKind(hipMemcpyKind kind) { -switch(kind) { -case hipMemcpyHostToHost: - return cudaMemcpyHostToHost; -case hipMemcpyHostToDevice: - return cudaMemcpyHostToDevice; -case hipMemcpyDeviceToHost: - return cudaMemcpyDeviceToHost; -default: - return cudaMemcpyDefault; + switch(kind) { + case hipMemcpyHostToHost: + return cudaMemcpyHostToHost; 
+ case hipMemcpyHostToDevice: + return cudaMemcpyHostToDevice; + case hipMemcpyDeviceToHost: + return cudaMemcpyDeviceToHost; + case hipMemcpyDeviceToDevice: + return cudaMemcpyDeviceToDevice; + default: + return cudaMemcpyDefault; } } From c769abcbeb7b6d2c4a547a2a512ebec1818e6d36 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 22 Sep 2016 10:39:17 -0500 Subject: [PATCH 32/66] Peer-to-Peer improvements. - Bug fix for peer visibility. Now contexts correctly detect when they can use SDMA for P2P vs staging buffers. - Interface to new HCC copy_ext function. - Improve context and peer print /debug options. - Add comments and usage to hipPeerToPeer_simple test. --- include/hcc_detail/hip_hcc.h | 15 ++++- src/hip_hcc.cpp | 99 +++++++++++++++++++++++++++--- src/hip_memory.cpp | 2 +- tests/src/hipPeerToPeer_simple.cpp | 18 +++++- 4 files changed, 124 insertions(+), 10 deletions(-) diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h index 2f85f83851..6c061b01a9 100644 --- a/include/hcc_detail/hip_hcc.h +++ b/include/hcc_detail/hip_hcc.h @@ -416,6 +416,8 @@ typedef uint64_t SeqNum_t ; // The unsigned return is hipMemcpyKind unsigned resolveMemcpyDirection(bool srcTracked, bool dstTracked, bool srcInDeviceMem, bool dstInDeviceMem); + bool canSeePeerMemory(const ihipCtx_t *thisCtx, ihipCtx_t *dstCtx, ihipCtx_t *srcCtx); + private: // Data // Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t @@ -516,10 +518,11 @@ class ihipCtxCriticalBase_t : LockedBase // Peer Accessor classes: - bool isPeer(const ihipCtx_t *peer); // returns Trus if peer has access to memory physically located on this device. + bool isPeer(const ihipCtx_t *peer); // returns True if peer has access to memory physically located on this device. 
bool addPeer(ihipCtx_t *peer); bool removePeer(ihipCtx_t *peer); void resetPeers(ihipCtx_t *thisDevice); + void printPeers(FILE *f) const; uint32_t peerCnt() const { return _peerCnt; }; hsa_agent_t *peerAgents() const { return _peerAgents; }; @@ -535,6 +538,7 @@ class ihipCtxCriticalBase_t : LockedBase //--- Peer Tracker: // These reflect the currently Enabled set of peers for this GPU: // Enabled peers have permissions to access the memory physically allocated on this device. + // Note the peers always contain the self agent for easy interfacing with HSA APIs. std::list _peers; // list of enabled peer devices. uint32_t _peerCnt; // number of enabled peers hsa_agent_t *_peerAgents; // efficient packed array of enabled agents (to use for allocations.) @@ -578,6 +582,8 @@ class ihipCtx_t // TODO - review uses of getWriteableDevice(), can these be converted to getDevice() ihipDevice_t *getWriteableDevice() const { return _device; }; + std::string toString() const; + public: // Data // The NULL stream is used if no other stream is specified. // Default stream has special synchronization properties with other streams. 
@@ -663,5 +669,12 @@ inline std::ostream& operator<<(std::ostream& os, const hipEvent_t& e) return os; } +inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) +{ + os << "ctx:" << static_cast (c) + << " dev:" << c->getDevice()->_deviceId; + return os; +} + #endif diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 8f15b25a60..1058d96412 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -381,6 +381,18 @@ void ihipCtxCriticalBase_t::resetPeers(ihipCtx_t *thisDevice) } +template<> +void ihipCtxCriticalBase_t::printPeers(FILE *f) const +{ + for (auto iter = _peers.begin(); iter!=_peers.end(); iter++) { + fprintf (f, "%s ", (*iter)->toString().c_str()); + }; +} + + + + + template<> void ihipCtxCriticalBase_t::addStream(ihipStream_t *stream) { @@ -784,6 +796,13 @@ void ihipCtx_t::locked_reset() }; +//--- +std::string ihipCtx_t::toString() const +{ + std::ostringstream ss; + ss << this; + return ss.str(); +}; //---- @@ -1328,8 +1347,36 @@ void ihipSetTs(hipEvent_t e) } +// Returns true if thisCtx can see the memory allocated on dstCtx and srcCtx. +// The peer-list for a context controls which contexts have access to the memory allocated on that context. +// So we check dstCtx's and srcCtx's peerList to see if they both include thisCtx. +bool ihipStream_t::canSeePeerMemory(const ihipCtx_t *thisCtx, ihipCtx_t *dstCtx, ihipCtx_t *srcCtx) +{ + tprintf (DB_COPY1, "Checking if direct copy can be used. thisCtx:%s; dstCtx:%s ; srcCtx:%s\n", + thisCtx->toString().c_str(), dstCtx->toString().c_str(), srcCtx->toString().c_str()); + + // Use blocks to control scope of critical sections. + { + LockedAccessor_CtxCrit_t ctxCrit(dstCtx->criticalData()); + if (!ctxCrit->isPeer(thisCtx)) { + return false; + }; + } + + { + LockedAccessor_CtxCrit_t ctxCrit(srcCtx->criticalData()); + if (!ctxCrit->isPeer(thisCtx)) { + return false; + }; + } + + return true; +}; + + // Resolve hipMemcpyDefault to a known type. 
+// TODO - review why is this so complicated, does this need srcTracked and dstTracked? unsigned ihipStream_t::resolveMemcpyDirection(bool srcTracked, bool dstTracked, bool srcInDeviceMem, bool dstInDeviceMem) { hipMemcpyKind kind = hipMemcpyDefault; @@ -1358,6 +1405,7 @@ unsigned ihipStream_t::resolveMemcpyDirection(bool srcTracked, bool dstTracked, } +// TODO - remove kind parm from here or use it below? void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn) { ihipCtx_t *ctx = this->getCtx(); @@ -1367,7 +1415,38 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const throw ihipException(hipErrorInvalidDevice); } - crit->_av.copy(src, dst, sizeBytes); + hc::accelerator acc; + hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); + hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); + bool dstTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) == AM_SUCCESS); + bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS); + + if (kind == hipMemcpyDefault) { + kind = resolveMemcpyDirection(srcTracked, dstTracked, srcPtrInfo._isInDeviceMem, dstPtrInfo._isInDeviceMem); + } + hc::hcCommandKind hcCopyDir; + switch (kind) { + case hipMemcpyHostToHost: hcCopyDir = hc::hcMemcpyHostToHost; break; + case hipMemcpyHostToDevice: hcCopyDir = hc::hcMemcpyHostToDevice; break; + case hipMemcpyDeviceToHost: hcCopyDir = hc::hcMemcpyDeviceToHost; break; + case hipMemcpyDeviceToDevice: hcCopyDir = hc::hcMemcpyDeviceToDevice; break; + }; + + + // If this is P2P access, we need to check to see if the copy agent (specified by the stream where the copy is enqueued) + // has peer access enabled to both the source and dest. If this is true, then the copy agent can see both pointers + // and we can perform the access with the copy engine from the current stream. If not true, then we will copy through the host. (forceHostCopyEngine=true). 
+ bool forceHostCopyEngine = false; + if (hcCopyDir == hc::hcMemcpyDeviceToDevice) { + if (!canSeePeerMemory(ctx, ihipGetPrimaryCtx(dstPtrInfo._appId), ihipGetPrimaryCtx(srcPtrInfo._appId))) { + forceHostCopyEngine = true; + tprintf (DB_COPY1, "Forcing use of host copy engine.\n"); + } else { + tprintf (DB_COPY1, "Will use SDMA engine on streamDevice=%s.\n", ctx->toString().c_str()); + } + }; + + crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceHostCopyEngine); } @@ -1410,16 +1489,23 @@ void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsig bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS); + bool copyEngineCanSeeSrcAndDest = true; + if (kind == hipMemcpyDeviceToDevice) { + copyEngineCanSeeSrcAndDest = canSeePeerMemory(ctx, ihipGetPrimaryCtx(dstPtrInfo._appId), ihipGetPrimaryCtx(srcPtrInfo._appId)); + } + + + + // "tracked" really indicates if the pointer's virtual address is available in the GPU address space. // If both pointers are not tracked, we need to fall back to a sync copy. 
- if (!dstTracked || !srcTracked) { + if (!dstTracked || !srcTracked || !copyEngineCanSeeSrcAndDest) { trueAsync = false; } - if (trueAsync == true) { - // Perform a syncrhonous copy: + // Perform an asynchronous copy: try { crit->_av.copy_async(src, dst, sizeBytes); } catch (Kalmar::runtime_exception) { @@ -1432,10 +1518,8 @@ void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsig this->wait(crit); } } else { - // Perform a syncrhonous copy: + // Perform a synchronous copy: if (kind == hipMemcpyDefault) { - bool srcInDeviceMem = (srcTracked && srcPtrInfo._isInDeviceMem); - bool dstInDeviceMem = (dstTracked && dstPtrInfo._isInDeviceMem); kind = resolveMemcpyDirection(srcTracked, dstTracked, srcPtrInfo._isInDeviceMem, dstPtrInfo._isInDeviceMem); } copySync(crit, dst, src, sizeBytes, kind); @@ -1481,3 +1565,4 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a } //// TODO - add identifier numbers for streams and devices to help with debugging. +//TODO - add a context sequence number for debug. 
Print operator<< ctx:0.1 (device.ctx) diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 668852bbfe..b973ba977b 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -104,7 +104,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) if (sizeBytes == 0) { *ptr = NULL; - return ihipLogStatus(hip_status); + return ihipLogStatus(hipSuccess); } auto ctx = ihipGetTlsDefaultCtx(); diff --git a/tests/src/hipPeerToPeer_simple.cpp b/tests/src/hipPeerToPeer_simple.cpp index 71d073b1b2..de89b36f46 100644 --- a/tests/src/hipPeerToPeer_simple.cpp +++ b/tests/src/hipPeerToPeer_simple.cpp @@ -33,6 +33,15 @@ int p_peerDevice = -1; // explicly specify which peer to use, else use p_gpuDe int g_currentDevice; int g_peerDevice; +void help(char *argv[]) +{ + printf ("usage: %s [OPTIONS]\n", argv[0]); + printf (" --memcpyWithPeer : Perform memcpy with peer.\n"); + printf (" --mirrorPeersi : Mirror memory onto both default device and peerdevice. If 0, memory is mapped only on the default device.\n"); + printf (" --peerDevice N : Set peer device.\n"); +}; + + void parseMyArguments(int argc, char *argv[]) { int more_argc = HipTest::parseStandardArguments(argc, argv, false); @@ -40,7 +49,10 @@ void parseMyArguments(int argc, char *argv[]) for (int i = 1; i < more_argc; i++) { const char *arg = argv[i]; - if (!strcmp(arg, "--memcpyWithPeer")) { + if (!strcmp(arg, "--help")) { + help(argv); + exit(-1); + } else if (!strcmp(arg, "--memcpyWithPeer")) { p_memcpyWithPeer = true; } else if (!strcmp(arg, "--mirrorPeers")) { p_mirrorPeers = true; @@ -90,10 +102,12 @@ void enablePeerFirst() setupPeerTests(); + // Always enable g_currentDevice to see the allocations on peerDevice. HIPCHECK(hipSetDevice(g_currentDevice)); HIPCHECK(hipDeviceEnablePeerAccess(g_peerDevice, 0)); if (p_mirrorPeers) { + // Mirror peers allows the peer device to see the allocations on currentDevice. 
int canAccessPeer; HIPCHECK(hipDeviceCanAccessPeer(&canAccessPeer, g_peerDevice, g_currentDevice)); assert(canAccessPeer); @@ -122,6 +136,8 @@ void enablePeerFirst() // Device0 push to device1, using P2P: + // NOTE : if p_mirrorPeers=0 and p_memcpyWithPeer=1, then peer device does not have mapping for A_d1 and we need to use a + // a host staging copy for the P2P access. HIPCHECK (hipSetDevice(p_memcpyWithPeer ? g_peerDevice : g_currentDevice)); HIPCHECK (hipMemcpy(A_d1, A_d0, Nbytes, hipMemcpyDefault)); // This is P2P copy. From ede43b3e6e66b287606b8f5eba2c74632393f827 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 23 Sep 2016 10:43:09 +0530 Subject: [PATCH 33/66] hipcc: Don't try to linking unpinned_copy_engine Change-Id: Iaa4d8b4f6dbb7940bba553f82c157a2ee9468c9e --- bin/hipcc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipcc b/bin/hipcc index 5a75d2c7c7..8a8a1715da 100755 --- a/bin/hipcc +++ b/bin/hipcc @@ -297,7 +297,7 @@ if ($needHipHcc) { $HIP_LIB_TYPE = $hipConfig{'HIP_LIB_TYPE'} // 0; if ($HIP_LIB_TYPE eq 0) { - $HIPLDFLAGS .= " $HIP_PATH/lib/device_util.cpp.o $HIP_PATH/lib/hip_device.cpp.o $HIP_PATH/lib/hip_error.cpp.o $HIP_PATH/lib/hip_event.cpp.o $HIP_PATH/lib/hip_hcc.cpp.o $HIP_PATH/lib/hip_memory.cpp.o $HIP_PATH/lib/hip_peer.cpp.o $HIP_PATH/lib/hip_stream.cpp.o $HIP_PATH/lib/unpinned_copy_engine.cpp.o $HIP_PATH/lib/hip_ldg.cpp.o $HIP_PATH/lib/hip_fp16.cpp.o $HIP_PATH/lib/hip_context.cpp.o $HIP_PATH/lib/hip_module.cpp.o"; + $HIPLDFLAGS .= " $HIP_PATH/lib/device_util.cpp.o $HIP_PATH/lib/hip_device.cpp.o $HIP_PATH/lib/hip_error.cpp.o $HIP_PATH/lib/hip_event.cpp.o $HIP_PATH/lib/hip_hcc.cpp.o $HIP_PATH/lib/hip_memory.cpp.o $HIP_PATH/lib/hip_peer.cpp.o $HIP_PATH/lib/hip_stream.cpp.o $HIP_PATH/lib/hip_ldg.cpp.o $HIP_PATH/lib/hip_fp16.cpp.o $HIP_PATH/lib/hip_context.cpp.o $HIP_PATH/lib/hip_module.cpp.o"; } elsif ($HIP_LIB_TYPE eq 1) { $HIPLDFLAGS .= " -L$HIP_PATH/lib -lhip_hcc" ; } else { From 
deb7493ca18fc4b015e62516ed9d07d64489e018 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 23 Sep 2016 10:44:39 +0530 Subject: [PATCH 34/66] CMakeLists.txt: No need to define HIP_HCC Change-Id: I0aa149bf3ffd4ee665bf3b822e178b3d74efe2af --- CMakeLists.txt | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ee852611d..99cd546853 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -153,22 +153,23 @@ endif() # Build hip_hcc if platform is hcc if(HIP_PLATFORM STREQUAL "hcc") include_directories(${PROJECT_SOURCE_DIR}/include) + set(HIP_HCC_BUILD_FLAGS) if(COMPILE_HIP_ATP_MARKER) include_directories(/opt/rocm/profiler/CXLActivityLogger/include) - set(CMAKE_CXX_FLAGS " -DCOMPILE_HIP_ATP_MARKER=1 ${CMAKE_CXX_FLAGS}") - set(CMAKE_C_FLAGS " -DCOMPILE_HIP_ATP_MARKER=1 ${CMAKE_C_FLAGS}") + set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -DCOMPILE_HIP_ATP_MARKER=1") endif() - set(CMAKE_CXX_COMPILER "${HCC_HOME}/bin/hcc") - set(CMAKE_C_COMPILER "${HCC_HOME}/bin/hcc") - # Add HIP_VERSION to CMAKE__FLAGS - set(CMAKE_CXX_FLAGS " -DHIP_VERSION_MAJOR=${HIP_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_VERSION_MINOR} -DHIP_VERSION_PATCH=${HIP_VERSION_PATCH} ${CMAKE_CXX_FLAGS}") - set(CMAKE_C_FLAGS " -DHIP_VERSION_MAJOR=${HIP_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_VERSION_MINOR} -DHIP_VERSION_PATCH=${HIP_VERSION_PATCH} ${CMAKE_C_FLAGS}") + set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -DHIP_VERSION_MAJOR=${HIP_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_VERSION_MINOR} -DHIP_VERSION_PATCH=${HIP_VERSION_PATCH}") - # Set HIP_HCC so we know this is HIP compile, some files are shared with HCC (staging_buffer). 
- set(CMAKE_CXX_FLAGS " -fPIC -hc -I${HCC_HOME}/include -I${HSA_PATH}/include -I/opt/rocm/libhsakmt/include/libhsakmt -stdlib=libc++ -DHIP_HCC ${CMAKE_CXX_FLAGS}") - set(CMAKE_C_FLAGS " -fPIC -hc -I${HCC_HOME}/include -I${HSA_PATH}/include -I/opt/rocm/libhsakmt/include/libhsakmt -stdlib=libc++ -DHIP_HCC ${CMAKE_C_FLAGS}") + # Add remaining flags + set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -fPIC -hc -I${HCC_HOME}/include -I${HSA_PATH}/include -I/opt/rocm/libhsakmt/include/libhsakmt -stdlib=libc++") + + # Set compiler and compiler flags + set(CMAKE_CXX_COMPILER "${HCC_HOME}/bin/hcc") + set(CMAKE_C_COMPILER "${HCC_HOME}/bin/hcc") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_HCC_BUILD_FLAGS}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${HIP_HCC_BUILD_FLAGS}") set(SOURCE_FILES src/device_util.cpp src/hip_hcc.cpp From f97c91d74af82cc15305b0e22724a06117894074 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 23 Sep 2016 14:44:51 +0530 Subject: [PATCH 35/66] Initial implementation of HIT infrastructure Change-Id: Icaef40cca67715fe3ec4ce3479d0f80f391f3917 --- tests/hit/HIT.cmake | 153 ++++++++++++++++++++++++++++++++++++++++++++ tests/hit/parser | 78 ++++++++++++++++++++++ 2 files changed, 231 insertions(+) create mode 100644 tests/hit/HIT.cmake create mode 100755 tests/hit/parser diff --git a/tests/hit/HIT.cmake b/tests/hit/HIT.cmake new file mode 100644 index 0000000000..0472bb4d39 --- /dev/null +++ b/tests/hit/HIT.cmake @@ -0,0 +1,153 @@ +include(CTest) +find_package(HIP REQUIRED) + +#------------------------------------------------------------------------------- +# Helper macro to parse BUILD instructions +macro(PARSE_BUILD_COMMAND _target _sources _hipcc_options _hcc_options _nvcc_options _exclude_platforms _dir) + set(${_target}) + set(${_sources}) + set(${_hipcc_options}) + set(${_hcc_options}) + set(${_nvcc_options}) + set(${_exclude_platforms}) + set(_target_found FALSE) + set(_hipcc_options_found FALSE) + set(_hcc_options_found FALSE) + 
set(_nvcc_options_found FALSE) + set(_exclude_platforms_found FALSE) + foreach(arg ${ARGN}) + if(NOT _target_found) + set(_target_found TRUE) + set(${_target} ${arg}) + elseif("x${arg}" STREQUAL "xHIPCC_OPTIONS") + set(_hipcc_options_found TRUE) + set(_hcc_options_found FALSE) + set(_nvcc_options_found FALSE) + set(_exclude_platforms_found FALSE) + elseif("x${arg}" STREQUAL "xHCC_OPTIONS") + set(_hipcc_options_found FALSE) + set(_hcc_options_found TRUE) + set(_nvcc_options_found FALSE) + set(_exclude_platforms_found FALSE) + elseif("x${arg}" STREQUAL "xNVCC_OPTIONS") + set(_hipcc_options_found FALSE) + set(_hcc_options_found FALSE) + set(_nvcc_options_found TRUE) + set(_exclude_platforms_found FALSE) + elseif("x${arg}" STREQUAL "xEXCLUDE_HIP_PLATFORM") + set(_hipcc_options_found FALSE) + set(_hcc_options_found FALSE) + set(_nvcc_options_found FALSE) + set(_exclude_platforms_found TRUE) + else() + if(_hipcc_options_found) + list(APPEND ${_hipcc_options} ${arg}) + elseif(_hcc_options_found) + list(APPEND ${_hcc_options} ${arg}) + elseif(_nvcc_options_found) + list(APPEND ${_nvcc_options} ${arg}) + elseif(_exclude_platforms_found) + set(${_exclude_platforms} ${arg}) + else() + list(APPEND ${_sources} "${_dir}/${arg}") + endif() + endif() + endforeach() +endmacro() + +# Helper macro to parse RUN instructions +macro(PARSE_RUN_COMMAND _target _arguments _exclude_platforms) + set(${_target}) + set(${_arguments} " ") + set(${_exclude_platforms}) + set(_target_found FALSE) + set(_exclude_platforms_found FALSE) + foreach(arg ${ARGN}) + if(NOT _target_found) + set(_target_found TRUE) + set(${_target} ${arg}) + elseif("x${arg}" STREQUAL "xEXCLUDE_HIP_PLATFORM") + set(_exclude_platforms_found TRUE) + else() + if(_exclude_platforms_found) + set(${_exclude_platforms} ${arg}) + else() + list(APPEND ${_arguments} ${arg}) + endif() + endif() + endforeach() +endmacro() + +# Helper macro to insert key/value pair into "hashmap" +macro(INSERT_INTO_MAP _map _key _value) + 
set("${_map}_${_key}" "${_value}") +endmacro() + +# Helper macro to read key/value pair from "hashmap" +macro(READ_FROM_MAP _map _key _value) + set(${_value} "${${_map}_${_key}}") +endmacro() + +# Helper macro to create a test +macro(MAKE_TEST exe) + string(REPLACE " " "" smush_args ${ARGN}) + set(testname ${PROJECT_NAME}/${exe}${smush_args}.tst) + add_test(NAME ${testname} COMMAND ${PROJECT_BINARY_DIR}/${exe} ${ARGN}) + set_tests_properties(${testname} PROPERTIES PASS_REGULAR_EXPRESSION "PASSED") +endmacro() +#------------------------------------------------------------------------------- + +# Macro: HIT_ADD_FILES used to scan+add multiple files for testing. +macro(HIT_ADD_FILES _dir) + foreach (file ${ARGN}) + # Build tests + execute_process(COMMAND ${HIP_SRC_PATH}/tests/hit/parser --buildCMDs ${file} + OUTPUT_VARIABLE _contents + ERROR_QUIET + WORKING_DIRECTORY ${_dir} + OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REGEX REPLACE "\n" ";" _contents "${_contents}") + foreach(_cmd ${_contents}) + string(REGEX REPLACE " " ";" _cmd "${_cmd}") + parse_build_command(_target _sources _hipcc_options _hcc_options _nvcc_options _exclude_platforms ${_dir} ${_cmd}) + insert_into_map("_exclude" "${_target}" "${_exclude_platforms}") + if(_exclude_platforms STREQUAL "all" OR _exclude_platforms STREQUAL ${HIP_PLATFORM}) + else() + set_source_files_properties(${_sources} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + hip_add_executable(${_target} ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) + endif() + endforeach() + + # Add tests + execute_process(COMMAND ${HIP_SRC_PATH}/tests/hit/parser --runCMDs ${file} + OUTPUT_VARIABLE _contents + ERROR_QUIET + WORKING_DIRECTORY ${_dir} + OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REGEX REPLACE "\n" ";" _contents "${_contents}") + foreach(_cmd ${_contents}) + string(REGEX REPLACE " " ";" _cmd "${_cmd}") + parse_run_command(_target _arguments _exclude_platforms ${_cmd}) + 
read_from_map("_exclude" "${_target}" _exclude_platforms_from_build) + if(_exclude_platforms STREQUAL "all" OR _exclude_platforms STREQUAL ${HIP_PLATFORM} OR + _exclude_platforms_from_build STREQUAL "all" OR _exclude_platforms_from_build STREQUAL ${HIP_PLATFORM}) + else() + make_test(${_target} ${_arguments}) + endif() + endforeach() + endforeach() +endmacro() + +# Macro: HIT_ADD_DIRECTORY to scan+add all files in a directory for testing +macro(HIT_ADD_DIRECTORY _dir) + file(GLOB files "${_dir}/*.c*") + hit_add_files(${_dir} ${files}) +endmacro() + +# Macro: HIT_ADD_DIRECTORY_RECURSIVE to scan+add all files in a directory+subdirectories for testing +macro(HIT_ADD_DIRECTORY_RECURSIVE _dir) + file(GLOB_RECURSE files "${_dir}/*.c*") + hit_add_files(${_dir} ${files}) +endmacro() + +# vim: ts=4:sw=4:expandtab:smartindent diff --git a/tests/hit/parser b/tests/hit/parser new file mode 100755 index 0000000000..6f6f842587 --- /dev/null +++ b/tests/hit/parser @@ -0,0 +1,78 @@ +#!/usr/bin/perl -w + +use 5.006; use v5.10.1; +use File::Basename; +use File::Spec; + +# Scan input file for HIT information +sub parse_file { + my $file = shift; + (my $exe = $file) =~ s/\.[^.]+$//g; + my (@buildCMDs, @runCMDs); + if (open (SOURCE, '<:encoding(UTF-8)', "$file")) { + while () { + my $line=$_; + # Look for BUILD instructions + if ($line =~ /^ \* BUILD:/) { + $line =~ s/^ \* BUILD: //g; # Remove " * BUILD: " + $line =~ s/%s/$file/g; # Substitute %s -> filename + $line =~ s/%t/$exe/g; # Substitute %t -> targetname + $line =~ s/\R//g; # Remove line endings + push @buildCMDs, $line; + } + # Look for RUN instructions + if ($line =~ /^ \* RUN:/) { + $line =~ s/^ \* RUN: //g; # Remove " * RUN: " + $line =~ s/%t/$exe/g; # Substitute %t -> targetname + $line =~ s/\R//g; # Remove line endings + push @runCMDs, $line; + } + } + close(SOURCE); + } + return (\@buildCMDs, \@runCMDs); +} + +# Exit if no arguments specified +if(scalar @ARGV == 0){ + print "No Arguments passed, exiting ...\n"; + exit(-1); 
+} + +# Parse command +my @options = (); +my $retBuildCMDs = 0; +my $retRunCMDs = 0; +foreach $arg (@ARGV) { + if ($retBuildCMDs or $retRunCMDs) { + push (@options, $arg); + } + if ($arg eq '--buildCMDs') { + $retBuildCMDs = 1; + } + if ($arg eq '--runCMDs') { + $retRunCMDs = 1; + } +} + +# Atleast one command needs to be specified +if ($retBuildCMDs eq 0 and $retRunCMDs eq 0) { + die "Usage: $0 <--buildCMDs|--runCMDs> FILENAMEs\n"; +} + +# Iterate over input files +foreach $file (@options) { + # Convert absolute path to path relative to working directory + my $relfile = File::Spec->abs2rel($file); + my ($buildCMDs, $runCMDs) = parse_file("$relfile"); + if ($retBuildCMDs) { + # print "BuildCMDs:\n"; + print "$_\n" for @$buildCMDs; + } + if ($retRunCMDs) { + # print "RunCMDs:\n"; + print "$_\n" for @$runCMDs; + } +} + +# vim: ts=4:sw=4:expandtab:smartindent From f7b0540c80907357884ea6d9e4f2703617e4e3fd Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 23 Sep 2016 14:46:48 +0530 Subject: [PATCH 36/66] Add target test that uses HIT to the top-level HIP cmake Change-Id: Idae1c0b39e6c4deb9f8fdb192bab17b0904941d7 --- CMakeLists.txt | 21 +++++++++++++++++++++ tests/hip_tests.txt | 12 ++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 tests/hip_tests.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 99cd546853..7a480cb32a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -304,3 +304,24 @@ if(POLICY CMP0037) cmake_policy(POP) endif() +############################# +# Testing steps +############################# +# Target: test +set(BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/hip_tests) +configure_file(tests/hip_tests.txt ${BUILD_DIR}/CMakeLists.txt @ONLY) +if(POLICY CMP0037) + cmake_policy(PUSH) + cmake_policy(SET CMP0037 OLD) +endif() +add_custom_target(install_for_test COMMAND "${CMAKE_COMMAND}" --build . --target install + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) +add_custom_target(test COMMAND ${CMAKE_COMMAND} . 
+ COMMAND make + COMMAND make test + WORKING_DIRECTORY ${BUILD_DIR} + DEPENDS install_for_test) +if(POLICY CMP0037) + cmake_policy(POP) +endif() + diff --git a/tests/hip_tests.txt b/tests/hip_tests.txt new file mode 100644 index 0000000000..35246def8a --- /dev/null +++ b/tests/hip_tests.txt @@ -0,0 +1,12 @@ +cmake_minimum_required(VERSION 2.8.3) +project(hip_tests) + +# Setup +set(HIP_PATH @CMAKE_INSTALL_PREFIX@) +set(ENV{HIP_PATH} ${HIP_PATH}) +set(HIP_SRC_PATH @hip_SOURCE_DIR@) +set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) +include(${HIP_SRC_PATH}/tests/hit/HIT.cmake) + +# Add tests +hit_add_directory(${HIP_SRC_PATH}/tests/src) From 40694485cabdd428f43cc4388fe55ba6f30853a6 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 23 Sep 2016 15:13:12 +0530 Subject: [PATCH 37/66] HIT: Fix logic in HIT_ADD_DIRECTORY_RECURSIVE Change-Id: I066787aaeec2a1562aa36527e60291594da31f83 --- tests/hit/HIT.cmake | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/hit/HIT.cmake b/tests/hit/HIT.cmake index 0472bb4d39..847f8fdadd 100644 --- a/tests/hit/HIT.cmake +++ b/tests/hit/HIT.cmake @@ -146,8 +146,15 @@ endmacro() # Macro: HIT_ADD_DIRECTORY_RECURSIVE to scan+add all files in a directory+subdirectories for testing macro(HIT_ADD_DIRECTORY_RECURSIVE _dir) - file(GLOB_RECURSE files "${_dir}/*.c*") - hit_add_files(${_dir} ${files}) + file(GLOB children RELATIVE ${_dir} ${_dir}/*) + set(dirlist "") + foreach(child ${children}) + if(IS_DIRECTORY ${_dir}/${child}) + hit_add_directory_recursive(${_dir}/${child}) + else() + hit_add_files(${_dir} ${child}) + endif() + endforeach() endmacro() # vim: ts=4:sw=4:expandtab:smartindent From 8e55fc7b4293f4857dcb88e051a3a98e6b9165f4 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 23 Sep 2016 16:15:31 +0530 Subject: [PATCH 38/66] directed tests: Enabled top-level uncategorized tests in HIT Change-Id: If37cbd00244ebfa0e5bff8362820aa68aaa163f1 --- tests/src/hipArray.cpp | 6 ++++++ 
tests/src/hipDynamicShared.cpp | 6 ++++++ tests/src/hipEnvVar.cpp | 4 ++++ tests/src/hipEnvVarDriver.cpp | 6 ++++++ tests/src/hipEventRecord.cpp | 5 +++++ tests/src/hipFuncDeviceSynchronize.cpp | 6 ++++++ tests/src/hipFuncGetDevice.cpp | 6 ++++++ tests/src/hipFuncSetDevice.cpp | 6 ++++++ tests/src/hipFuncSetDeviceFlags.cpp | 6 ++++++ tests/src/hipGetDeviceAttribute.cpp | 12 +++++++++--- tests/src/hipHcc.cpp | 6 ++++++ tests/src/hipHostAlloc.cpp | 6 ++++++ tests/src/hipHostGetFlags.cpp | 6 ++++++ tests/src/hipHostRegister.cpp | 6 ++++++ tests/src/hipLaunchParm.cpp | 6 ++++++ tests/src/hipPeerToPeer_simple.cpp | 8 ++++++++ tests/src/hipRandomMemcpyAsync.cpp | 6 ++++++ tests/src/hipTestMemcpyPin.cpp | 6 ++++++ 18 files changed, 110 insertions(+), 3 deletions(-) diff --git a/tests/src/hipArray.cpp b/tests/src/hipArray.cpp index 49add786d8..e32e2c0e5c 100644 --- a/tests/src/hipArray.cpp +++ b/tests/src/hipArray.cpp @@ -1,3 +1,9 @@ +/* HIT_START + * BUILD: %t %s test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * RUN: %t EXCLUDE_HIP_PLATFORM + * HIT_END + */ + #include "hip_runtime.h" #include "test_common.h" diff --git a/tests/src/hipDynamicShared.cpp b/tests/src/hipDynamicShared.cpp index 5686f9ee88..2ed793608b 100644 --- a/tests/src/hipDynamicShared.cpp +++ b/tests/src/hipDynamicShared.cpp @@ -20,6 +20,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* HIT_START + * BUILD: %t %s test_common.cpp + * RUN: %t EXCLUDE_HIP_PLATFORM nvcc + * HIT_END + */ + #include #include "test_common.h" diff --git a/tests/src/hipEnvVar.cpp b/tests/src/hipEnvVar.cpp index 229fa390c2..babfc73832 100644 --- a/tests/src/hipEnvVar.cpp +++ b/tests/src/hipEnvVar.cpp @@ -20,6 +20,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +/* HIT_START + * BUILD: %t %s test_common.cpp + * HIT_END + */ #include #include diff --git a/tests/src/hipEnvVarDriver.cpp b/tests/src/hipEnvVarDriver.cpp index b5cab268b7..255cc72806 100644 --- a/tests/src/hipEnvVarDriver.cpp +++ b/tests/src/hipEnvVarDriver.cpp @@ -15,6 +15,12 @@ EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* HIT_START + * BUILD: %t %s test_common.cpp NVCC_OPTIONS -std=c++11 + * RUN: %t + * HIT_END + */ + #include #include #include diff --git a/tests/src/hipEventRecord.cpp b/tests/src/hipEventRecord.cpp index 90e705f5c3..07b2068fac 100644 --- a/tests/src/hipEventRecord.cpp +++ b/tests/src/hipEventRecord.cpp @@ -23,6 +23,11 @@ THE SOFTWARE. // Through manual inspection of the reported timestamps, can determine if recording a NULL event forces synchronization : // set +/* HIT_START + * BUILD: %t %s test_common.cpp + * RUN: %t --iterations 10 + * HIT_END + */ #include "hip_runtime.h" #include "test_common.h" diff --git a/tests/src/hipFuncDeviceSynchronize.cpp b/tests/src/hipFuncDeviceSynchronize.cpp index bb274f47bc..6d05253536 100644 --- a/tests/src/hipFuncDeviceSynchronize.cpp +++ b/tests/src/hipFuncDeviceSynchronize.cpp @@ -22,6 +22,12 @@ THE SOFTWARE. * hipError_t hipDeviceSynchronize(); */ +/* HIT_START + * BUILD: %t %s test_common.cpp + * RUN: %t + * HIT_END + */ + #include"test_common.h" #define _SIZE sizeof(int)*1024*1024 diff --git a/tests/src/hipFuncGetDevice.cpp b/tests/src/hipFuncGetDevice.cpp index c4785ece00..f903149f91 100644 --- a/tests/src/hipFuncGetDevice.cpp +++ b/tests/src/hipFuncGetDevice.cpp @@ -22,6 +22,12 @@ THE SOFTWARE. 
* hipError_t hipGetDevice(int *device); */ +/* HIT_START + * BUILD: %t %s test_common.cpp + * RUN: %t + * HIT_END + */ + #include "test_common.h" int main() diff --git a/tests/src/hipFuncSetDevice.cpp b/tests/src/hipFuncSetDevice.cpp index fd13d21325..030ceb7b0c 100644 --- a/tests/src/hipFuncSetDevice.cpp +++ b/tests/src/hipFuncSetDevice.cpp @@ -17,6 +17,12 @@ OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* HIT_START + * BUILD: %t %s test_common.cpp + * RUN: %t EXCLUDE_HIP_PLATFORM + * HIT_END + */ + #include "test_common.h" int main(){ diff --git a/tests/src/hipFuncSetDeviceFlags.cpp b/tests/src/hipFuncSetDeviceFlags.cpp index a3d81e8b5a..ccc81b11ac 100644 --- a/tests/src/hipFuncSetDeviceFlags.cpp +++ b/tests/src/hipFuncSetDeviceFlags.cpp @@ -17,6 +17,12 @@ OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* HIT_START + * BUILD: %t %s test_common.cpp + * RUN: %t + * HIT_END + */ + #include "test_common.h" int main() diff --git a/tests/src/hipGetDeviceAttribute.cpp b/tests/src/hipGetDeviceAttribute.cpp index 51bf29f9e6..f94910459f 100644 --- a/tests/src/hipGetDeviceAttribute.cpp +++ b/tests/src/hipGetDeviceAttribute.cpp @@ -21,6 +21,12 @@ THE SOFTWARE. 
*/ // Test the device info API extensions for HIP: +/* HIT_START + * BUILD: %t %s test_common.cpp + * RUN: %t EXCLUDE_HIP_PLATFORM nvcc + * HIT_END + */ + #include #include #include @@ -70,15 +76,15 @@ int main(int argc, char *argv[]) CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMemoryClockRate, props.memoryClockRate)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMemoryBusWidth, props.memoryBusWidth)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMultiprocessorCount, props.multiProcessorCount)); - CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeIsMultiGpuBoard, props.isMultiGpuBoard)); + CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeIsMultiGpuBoard, props.isMultiGpuBoard));// CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeComputeMode, props.computeMode)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeL2CacheSize, props.l2CacheSize)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxThreadsPerMultiProcessor, props.maxThreadsPerMultiProcessor)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeComputeCapabilityMajor, props.major)); - CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeComputeCapabilityMinor, props.minor)); + CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeComputeCapabilityMinor, props.minor));// CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeConcurrentKernels, props.concurrentKernels)); CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributePciBusId, props.pciBusID)); - CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributePciDeviceId, props.pciDeviceID)); + CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributePciDeviceId, props.pciDeviceID));// CHECK(test_hipDeviceGetAttribute(deviceId, hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, props.maxSharedMemoryPerMultiProcessor)); passed(); diff --git a/tests/src/hipHcc.cpp b/tests/src/hipHcc.cpp index 
e3740fa27b..f9bd584faa 100644 --- a/tests/src/hipHcc.cpp +++ b/tests/src/hipHcc.cpp @@ -21,6 +21,12 @@ THE SOFTWARE. */ // Test the HCC-specific API extensions for HIP: +/* HIT_START + * BUILD: %t %s HCC_OPTIONS --stdlib=libc++ + * RUN: %t + * HIT_END + */ + #include #include #include diff --git a/tests/src/hipHostAlloc.cpp b/tests/src/hipHostAlloc.cpp index a6c4cb20e0..d1950f825f 100644 --- a/tests/src/hipHostAlloc.cpp +++ b/tests/src/hipHostAlloc.cpp @@ -20,6 +20,12 @@ THE SOFTWARE. */ +/* HIT_START + * BUILD: %t %s test_common.cpp + * RUN: %t + * HIT_END + */ + #include"test_common.h" #define LEN 1024*1024 diff --git a/tests/src/hipHostGetFlags.cpp b/tests/src/hipHostGetFlags.cpp index c0250055ae..8664095422 100644 --- a/tests/src/hipHostGetFlags.cpp +++ b/tests/src/hipHostGetFlags.cpp @@ -20,6 +20,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* HIT_START + * BUILD: %t %s test_common.cpp + * RUN: %t + * HIT_END + */ + #include"test_common.h" #include diff --git a/tests/src/hipHostRegister.cpp b/tests/src/hipHostRegister.cpp index 4c85358e06..e9044d5a86 100644 --- a/tests/src/hipHostRegister.cpp +++ b/tests/src/hipHostRegister.cpp @@ -17,6 +17,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* HIT_START + * BUILD: %t %s test_common.cpp + * RUN: %t + * HIT_END + */ + #include"test_common.h" #include diff --git a/tests/src/hipLaunchParm.cpp b/tests/src/hipLaunchParm.cpp index 26ad94f182..0e8248aebc 100644 --- a/tests/src/hipLaunchParm.cpp +++ b/tests/src/hipLaunchParm.cpp @@ -17,6 +17,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +/* HIT_START + * BUILD: %t %s test_common.cpp + * RUN: %t + * HIT_END + */ + #include"hip_runtime.h" #include"test_common.h" #include"hip_runtime_api.h" diff --git a/tests/src/hipPeerToPeer_simple.cpp b/tests/src/hipPeerToPeer_simple.cpp index de89b36f46..0e99e9bfb8 100644 --- a/tests/src/hipPeerToPeer_simple.cpp +++ b/tests/src/hipPeerToPeer_simple.cpp @@ -22,6 +22,14 @@ THE SOFTWARE. // Simple test for memset. // Also serves as a template for other tests. +/* HIT_START + * BUILD: %t %s test_common.cpp + * RUN: %t EXCLUDE_HIP_PLATFORM all + * RUN: %t --memcpyWithPeer EXCLUDE_HIP_PLATFORM all + * RUN: %t --mirrorPeers EXCLUDE_HIP_PLATFORM all + * HIT_END + */ + #include "hip_runtime.h" #include "test_common.h" diff --git a/tests/src/hipRandomMemcpyAsync.cpp b/tests/src/hipRandomMemcpyAsync.cpp index 8f06c5fbeb..b00987b6c3 100644 --- a/tests/src/hipRandomMemcpyAsync.cpp +++ b/tests/src/hipRandomMemcpyAsync.cpp @@ -17,6 +17,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* HIT_START + * BUILD: %t %s test_common.cpp + * RUN: %t + * HIT_END + */ + #include #include #include diff --git a/tests/src/hipTestMemcpyPin.cpp b/tests/src/hipTestMemcpyPin.cpp index c36170003c..7240a6cf6e 100644 --- a/tests/src/hipTestMemcpyPin.cpp +++ b/tests/src/hipTestMemcpyPin.cpp @@ -17,6 +17,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +/* HIT_START + * BUILD: %t %s test_common.cpp + * RUN: %t + * HIT_END + */ + #include #include"test_common.h" From 29565c2ad36d921f4c5f26aa9cb4aea42633bc34 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 23 Sep 2016 22:42:52 +0530 Subject: [PATCH 39/66] hip_tests: Scan for tests recursively in tests/src Change-Id: I0297e6c94de3ea446ce99cce0aa641e74dad0d13 --- tests/hip_tests.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/hip_tests.txt b/tests/hip_tests.txt index 35246def8a..5c7e543ce5 100644 --- a/tests/hip_tests.txt +++ b/tests/hip_tests.txt @@ -9,4 +9,5 @@ set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) include(${HIP_SRC_PATH}/tests/hit/HIT.cmake) # Add tests -hit_add_directory(${HIP_SRC_PATH}/tests/src) +include_directories(${HIP_SRC_PATH}/tests/src) +hit_add_directory_recursive(${HIP_SRC_PATH}/tests/src) From 7b87a972cb370c8accec28a063fd7dfba0b6ba4f Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Mon, 26 Sep 2016 05:35:14 +0530 Subject: [PATCH 40/66] Removed deprecated hipFreeHost Change-Id: I9747ea3993090e0da6a2e1f5e9ce318762bc03e1 --- include/hcc_detail/hip_runtime_api.h | 26 +++++--------------------- include/nvcc_detail/hip_runtime_api.h | 5 ----- src/hip_memory.cpp | 6 ------ 3 files changed, 5 insertions(+), 32 deletions(-) diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h index 26c4f8ba32..85c111e8bf 100644 --- a/include/hcc_detail/hip_runtime_api.h +++ b/include/hcc_detail/hip_runtime_api.h @@ -777,7 +777,7 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) * * @return #hipSuccess * - * @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray, hipMallocHost, hipFreeHost, hipHostAlloc + * @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray, hipMallocHost, hipHostFree, hipHostAlloc */ hipError_t hipMalloc(void** ptr, size_t size) ; @@ -790,7 +790,7 @@ 
hipError_t hipMalloc(void** ptr, size_t size) ; * * @return #hipSuccess, #hipErrorMemoryAllocation * - * @see hipMalloc, hipMallocPitch, hipMallocArray, hipMalloc3D, hipMalloc3DArray, hipHostAlloc, hipFree, hipFreeArray, hipMallocHost, hipFreeHost, hipHostAlloc + * @see hipMalloc, hipMallocPitch, hipMallocArray, hipMalloc3D, hipMalloc3DArray, hipHostAlloc, hipFree, hipFreeArray, hipMallocHost, hipHostFree, hipHostAlloc */ hipError_t hipMallocHost(void** ptr, size_t size) __attribute__((deprecated("use hipHostMalloc instead"))) ; @@ -803,7 +803,7 @@ hipError_t hipMallocHost(void** ptr, size_t size) __attribute__((deprecated("use * * @return #hipSuccess, #hipErrorMemoryAllocation * - * @see hipSetDeviceFlags, hipMallocHost, hipFreeHost + * @see hipSetDeviceFlags, hipMallocHost, hipHostFree */ hipError_t hipHostMalloc(void** ptr, size_t size, unsigned int flags) ; hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags) __attribute__((deprecated("use hipHostMalloc instead"))) ;; @@ -888,7 +888,7 @@ hipError_t hipHostUnregister(void* hostPtr) ; * @param[in] height Requested pitched allocation height * @return Error code * - * @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipMallocHost, hipFreeHost, hipMalloc3D, hipMalloc3DArray, hipHostAlloc + * @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipMallocHost, hipHostFree, hipMalloc3D, hipMalloc3DArray, hipHostAlloc */ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height); @@ -902,24 +902,10 @@ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height * @return #hipSuccess * @return #hipErrorInvalidDevicePointer (if pointer is invalid, including host pointers allocated with hipHostMalloc) * - * @see hipMalloc, hipMallocPitch, hipMallocArray, hipFreeArray, hipMallocHost, hipFreeHost, hipMalloc3D, hipMalloc3DArray, hipHostAlloc + * @see hipMalloc, hipMallocPitch, hipMallocArray, hipFreeArray, hipMallocHost, hipHostFree, hipMalloc3D, 
hipMalloc3DArray, hipHostAlloc */ hipError_t hipFree(void* ptr); - - -/** - * @brief Free memory allocated by the hcc hip host memory allocation API. [Deprecated.] - * - * @param[in] ptr Pointer to memory to be freed - * @return #hipSuccess, - * #hipErrorInvalidValue (if pointer is invalid, including device pointers allocated with hipMalloc) - - * @see hipHostFree - */ -hipError_t hipFreeHost(void* ptr) __attribute__((deprecated("use hipHostFree instead"))) ; - - /** * @brief Free memory allocated by the hcc hip host memory allocation API * This API performs an implicit hipDeviceSynchronize() call. @@ -933,8 +919,6 @@ hipError_t hipFreeHost(void* ptr) __attribute__((deprecated("use hipHostFree ins */ hipError_t hipHostFree(void* ptr); - - /** * @brief Copy data from src to dst. * diff --git a/include/nvcc_detail/hip_runtime_api.h b/include/nvcc_detail/hip_runtime_api.h index eff2aafe17..9d5c8d8865 100644 --- a/include/nvcc_detail/hip_runtime_api.h +++ b/include/nvcc_detail/hip_runtime_api.h @@ -205,11 +205,6 @@ inline static hipError_t hipHostUnregister(void* ptr){ return hipCUDAErrorTohipError(cudaHostUnregister(ptr)); } -inline static hipError_t hipFreeHost(void* ptr) __attribute__((deprecated("use hipHostFree instead"))); -inline static hipError_t hipFreeHost(void* ptr) { - return hipCUDAErrorTohipError(cudaFreeHost(ptr)); -} - inline static hipError_t hipHostFree(void* ptr) { return hipCUDAErrorTohipError(cudaFreeHost(ptr)); } diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index b973ba977b..6381cd5332 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -941,12 +941,6 @@ hipError_t hipHostFree(void* ptr) return ihipLogStatus(hipStatus); }; -// TODO - deprecated function. 
-hipError_t hipFreeHost(void* ptr) -{ - return hipHostFree(ptr); -} - hipError_t hipFreeArray(hipArray* array) { HIP_INIT_API(array); From 6790ff6b118193f0d02113a522ddc2c1dd14ab0d Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Mon, 26 Sep 2016 05:52:21 +0530 Subject: [PATCH 41/66] Removed deprecated hipMallocHost Change-Id: I3141fe05a34b6a306297a30721509965f49ccb97 --- include/hcc_detail/hip_runtime_api.h | 23 +++++------------------ include/nvcc_detail/hip_runtime_api.h | 6 ------ src/hip_memory.cpp | 7 ------- 3 files changed, 5 insertions(+), 31 deletions(-) diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h index 85c111e8bf..b266927a33 100644 --- a/include/hcc_detail/hip_runtime_api.h +++ b/include/hcc_detail/hip_runtime_api.h @@ -777,23 +777,10 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) * * @return #hipSuccess * - * @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray, hipMallocHost, hipHostFree, hipHostAlloc + * @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray, hipHostFree, hipHostMalloc */ hipError_t hipMalloc(void** ptr, size_t size) ; - -/** - * @brief Allocate pinned host memory - * - * @param[out] ptr Pointer to the allocated host pinned memory - * @param[in] size Requested memory size - * - * @return #hipSuccess, #hipErrorMemoryAllocation - * - * @see hipMalloc, hipMallocPitch, hipMallocArray, hipMalloc3D, hipMalloc3DArray, hipHostAlloc, hipFree, hipFreeArray, hipMallocHost, hipHostFree, hipHostAlloc - */ -hipError_t hipMallocHost(void** ptr, size_t size) __attribute__((deprecated("use hipHostMalloc instead"))) ; - /** * @brief Allocate device accessible page locked host memory * @@ -803,7 +790,7 @@ hipError_t hipMallocHost(void** ptr, size_t size) __attribute__((deprecated("use * * @return #hipSuccess, #hipErrorMemoryAllocation * - * @see hipSetDeviceFlags, hipMallocHost, hipHostFree + * @see 
hipSetDeviceFlags, hipHostFree */ hipError_t hipHostMalloc(void** ptr, size_t size, unsigned int flags) ; hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags) __attribute__((deprecated("use hipHostMalloc instead"))) ;; @@ -888,7 +875,7 @@ hipError_t hipHostUnregister(void* hostPtr) ; * @param[in] height Requested pitched allocation height * @return Error code * - * @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipMallocHost, hipHostFree, hipMalloc3D, hipMalloc3DArray, hipHostAlloc + * @see hipMalloc, hipFree, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D, hipMalloc3DArray, hipHostMalloc */ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height); @@ -902,7 +889,7 @@ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height * @return #hipSuccess * @return #hipErrorInvalidDevicePointer (if pointer is invalid, including host pointers allocated with hipHostMalloc) * - * @see hipMalloc, hipMallocPitch, hipMallocArray, hipFreeArray, hipMallocHost, hipHostFree, hipMalloc3D, hipMalloc3DArray, hipHostAlloc + * @see hipMalloc, hipMallocPitch, hipMallocArray, hipFreeArray, hipHostFree, hipMalloc3D, hipMalloc3DArray, hipHostMalloc */ hipError_t hipFree(void* ptr); @@ -915,7 +902,7 @@ hipError_t hipFree(void* ptr); * @return #hipSuccess, * #hipErrorInvalidValue (if pointer is invalid, including device pointers allocated with hipMalloc) * - * @see hipMalloc, hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMallocHost, hipMalloc3D, hipMalloc3DArray, hipHostAlloc + * @see hipMalloc, hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray, hipHostMalloc */ hipError_t hipHostFree(void* ptr); diff --git a/include/nvcc_detail/hip_runtime_api.h b/include/nvcc_detail/hip_runtime_api.h index 9d5c8d8865..0814de8ec9 100644 --- a/include/nvcc_detail/hip_runtime_api.h +++ b/include/nvcc_detail/hip_runtime_api.h @@ -179,12 +179,6 @@ inline static hipError_t hipFree(void* 
ptr) { return hipCUDAErrorTohipError(cudaFree(ptr)); } -inline static hipError_t hipMallocHost(void** ptr, size_t size) __attribute__((deprecated("use hipHostMalloc instead"))); - -inline static hipError_t hipMallocHost(void** ptr, size_t size) { - return hipCUDAErrorTohipError(cudaMallocHost(ptr, size)); -} - inline static hipError_t hipHostMalloc(void** ptr, size_t size, unsigned int flags){ return hipCUDAErrorTohipError(cudaHostAlloc(ptr, size, flags)); } diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 6381cd5332..75097cd33b 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -179,13 +179,6 @@ hipError_t hipHostAlloc(void** ptr, size_t sizeBytes, unsigned int flags) return hipHostMalloc(ptr, sizeBytes, flags); }; -//--- -// TODO - remove me, this is deprecated. -hipError_t hipMallocHost(void** ptr, size_t sizeBytes) -{ - return hipHostMalloc(ptr, sizeBytes, 0); -} - // width in bytes hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height) { From daef1d29061ed9ba22ab12a1096c6c7328f3e573 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Mon, 26 Sep 2016 10:58:46 +0530 Subject: [PATCH 42/66] Removed deprecated hipHostAlloc Change-Id: Ia592a3545c5d72f37e049ce29f910e404323c01c --- bin/hipify | 2 +- include/hcc_detail/hip_runtime_api.h | 9 ++++----- src/hip_memory.cpp | 7 ------- tests/src/CMakeLists.txt | 4 ++-- tests/src/{hipHostAlloc.cpp => hipHostMalloc.cpp} | 0 5 files changed, 7 insertions(+), 15 deletions(-) rename tests/src/{hipHostAlloc.cpp => hipHostMalloc.cpp} (100%) diff --git a/bin/hipify b/bin/hipify index 7ecfccbd88..5c56465040 100755 --- a/bin/hipify +++ b/bin/hipify @@ -288,7 +288,7 @@ while (@ARGV) { $ft{'mem'} += s/\bcudaMallocHost\b/hipHostMalloc/g; # note conversion to standard hipHost* naming convention $ft{'mem'} += s/\bcudaFree\b/hipFree/g; $ft{'mem'} += s/\bcudaFreeHost\b/hipHostFree/g; # note conversion to standard hipHost* naming convention - $ft{'mem'} += s/\bcudaHostAlloc\b/hipHostAlloc/g; + 
$ft{'mem'} += s/\bcudaHostAlloc\b/hipHostMalloc/g; $ft{'mem'} += s/\bcudaHostGetDevicePointer\b/hipHostGetDevicePointer/g; $ft{'mem'} += s/\bcudaHostAllocDefault\b/hipHostMallocDefault/g; $ft{'mem'} += s/\bcudaHostAllocPortable\b/hipHostMallocPortable/g; diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h index b266927a33..4de2797033 100644 --- a/include/hcc_detail/hip_runtime_api.h +++ b/include/hcc_detail/hip_runtime_api.h @@ -793,18 +793,17 @@ hipError_t hipMalloc(void** ptr, size_t size) ; * @see hipSetDeviceFlags, hipHostFree */ hipError_t hipHostMalloc(void** ptr, size_t size, unsigned int flags) ; -hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags) __attribute__((deprecated("use hipHostMalloc instead"))) ;; /** - * @brief Get Device pointer from Host Pointer allocated through hipHostAlloc + * @brief Get Device pointer from Host Pointer allocated through hipHostMalloc * * @param[out] dstPtr Device Pointer mapped to passed host pointer - * @param[in] hstPtr Host Pointer allocated through hipHostAlloc + * @param[in] hstPtr Host Pointer allocated through hipHostMalloc * @param[in] flags Flags to be passed for extension * * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryAllocation * - * @see hipSetDeviceFlags, hipHostAlloc + * @see hipSetDeviceFlags, hipHostMalloc */ hipError_t hipHostGetDevicePointer(void** devPtr, void* hstPtr, unsigned int flags) ; @@ -815,7 +814,7 @@ hipError_t hipHostGetDevicePointer(void** devPtr, void* hstPtr, unsigned int fla * @param[in] hostPtr Host Pointer allocated through hipHostMalloc * @return #hipSuccess, #hipErrorInvalidValue * - * @see hipHostAlloc + * @see hipHostMalloc */ hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr) ; diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 75097cd33b..d2bb084fc5 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -172,13 +172,6 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned 
int flags) return ihipLogStatus(hip_status); } -//--- -// TODO - remove me, this is deprecated. -hipError_t hipHostAlloc(void** ptr, size_t sizeBytes, unsigned int flags) -{ - return hipHostMalloc(ptr, sizeBytes, flags); -}; - // width in bytes hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height) { diff --git a/tests/src/CMakeLists.txt b/tests/src/CMakeLists.txt index b58b930f7f..2f8c8f0a69 100644 --- a/tests/src/CMakeLists.txt +++ b/tests/src/CMakeLists.txt @@ -184,7 +184,7 @@ build_hip_executable_libcpp(hipHcc hipHcc.cpp) # __workweek fix. #build_hip_executable_libcpp(hipPointerAttrib hipPointerAttrib.cpp) -build_hip_executable(hipHostAlloc hipHostAlloc.cpp) +build_hip_executable(hipHostMalloc hipHostMalloc.cpp) build_hip_executable(hipHostGetFlags hipHostGetFlags.cpp) build_hip_executable(hipHostRegister hipHostRegister.cpp) build_hip_executable(hipRandomMemcpyAsync hipRandomMemcpyAsync.cpp) @@ -208,7 +208,7 @@ make_test(hipLaunchParm " ") #make_test(hipPointerAttrib " ") -make_test(hipHostAlloc " ") +make_test(hipHostMalloc " ") # BS- comment out since test appears broken - asks for device pointer but pointer was never allocated. 
#make_test(hipHostGetFlags " ") make_test(hipHcc " ") diff --git a/tests/src/hipHostAlloc.cpp b/tests/src/hipHostMalloc.cpp similarity index 100% rename from tests/src/hipHostAlloc.cpp rename to tests/src/hipHostMalloc.cpp From f4f12cf50cc95de8eec1d7efdc4ec5a42f2cd3ff Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Mon, 26 Sep 2016 16:18:53 +0530 Subject: [PATCH 43/66] Doxygen related document updates for texture/ array related functions Change-Id: Iedc5b6512b5ddecb98c0e51a353957f08b955fde --- include/hcc_detail/hip_texture.h | 82 ++++++++++++++++++++++++++++---- 1 file changed, 72 insertions(+), 10 deletions(-) diff --git a/include/hcc_detail/hip_texture.h b/include/hcc_detail/hip_texture.h index 5712e5c333..c83917b8d6 100644 --- a/include/hcc_detail/hip_texture.h +++ b/include/hcc_detail/hip_texture.h @@ -97,18 +97,78 @@ typedef struct hipArray { #define tex2D(_tex, _dx, _dy) \ _tex._dataPtr[(unsigned int)_dx + (unsigned int)_dy*(_tex.width)] +/** + * @brief Allocate an array on the device. + * + * @param[out] array Pointer to allocated array in device memory + * @param[in] desc Requested channel format + * @param[in] width Requested array allocation width + * @param[in] height Requested array allocation height + * @param[in] flags Requested properties of allocated array + * @return #hipSuccess, #hipErrorMemoryAllocation + * + * @see hipMalloc, hipMallocPitch, hipFree, hipFreeArray, hipHostMalloc, hipHostFree + */ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc, size_t width, size_t height = 0, unsigned int flags = 0); +/** + * @brief Frees an array on the device. + * + * @param[in] array Pointer to array to free + * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInitializationError + * + * @see hipMalloc, hipMallocPitch, hipFree, hipMallocArray, hipHostMalloc, hipHostFree + */ hipError_t hipFreeArray(hipArray* array); - // -// dpitch, spitch, and width in bytes + +/** + * @brief Copies data between host and device. 
+ * + * @param[in] dst Destination memory address + * @param[in] dpitch Pitch of destination memory + * @param[in] src Source memory address + * @param[in] spitch Pitch of source memory + * @param[in] width Width of matrix transfer (columns in bytes) + * @param[in] height Height of matrix transfer (rows) + * @param[in] kind Type of transfer + * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection + * + * @see hipMemcpy, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync + */ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind); -// wOffset, width, and spitch in bytes +/** + * @brief Copies data between host and device. + * + * @param[in] dst Destination memory address + * @param[in] dpitch Pitch of destination memory + * @param[in] src Source memory address + * @param[in] spitch Pitch of source memory + * @param[in] width Width of matrix transfer (columns in bytes) + * @param[in] height Height of matrix transfer (rows) + * @param[in] kind Type of transfer + * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection + * + * @see hipMemcpy, hipMemcpyToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync + */ hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind); +/** + * @brief Copies data between host and device. 
+ * + * @param[in] dst Destination memory address + * @param[in] dpitch Pitch of destination memory + * @param[in] src Source memory address + * @param[in] spitch Pitch of source memory + * @param[in] width Width of matrix transfer (columns in bytes) + * @param[in] height Height of matrix transfer (rows) + * @param[in] kind Type of transfer + * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection + * + * @see hipMemcpy, hipMemcpy2DToArray, hipMemcpy2D, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync + */ hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t count, hipMemcpyKind kind); @@ -152,15 +212,17 @@ hipChannelFormatDesc hipBindTexture(size_t *offset, struct textureReference *te } #endif -/* - * @brief Returns a channel descriptor with format f and number of bits of each ocmponent x,y,z and w. - * - * @par Parameters - * None. - * @return Channel descriptor +/** + * @brief Returns a channel descriptor using the specified format. 
* + * @param[in] x X component + * @param[in] y Y component + * @param[in] z Z component + * @param[in] w W component + * @param[in] f Channel format + * @return Channel descriptor with format f * - **/ + */ hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f); // descriptors From 280249918fc5a4fe600d89efd76c6815879a5b72 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Mon, 26 Sep 2016 23:11:07 +0530 Subject: [PATCH 44/66] Doxygen documentation changes for hipCtxXXX APIs Change-Id: I1e5e3a621b732bd8bdeef1c607c004d6adfdab32 --- include/hcc_detail/hip_runtime_api.h | 85 ++++++++++++++++++++++------ 1 file changed, 68 insertions(+), 17 deletions(-) diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h index 4de2797033..6992be8475 100644 --- a/include/hcc_detail/hip_runtime_api.h +++ b/include/hcc_detail/hip_runtime_api.h @@ -1237,10 +1237,21 @@ hipError_t hipInit(unsigned int flags) ; * @param [in] flags * @param [in] associated device handle * - * @returns #hipSuccess, #hipErrorInvalidContext + * @return #hipSuccess + * + * @see hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice */ hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device); +/** + * @brief Destroy a HIP context. 
+ * + * @param [in] ctx Context to destroy + * + * @returns #hipSuccess, #hipErrorInvalidValue + * + * @see hipCtxCreate, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent,hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize , hipCtxGetDevice + */ hipError_t hipCtxDestroy(hipCtx_t ctx); /** @@ -1248,9 +1259,10 @@ hipError_t hipCtxDestroy(hipCtx_t ctx); * * @param [out] ctx * - * @returns #hipSuccess + * @returns #hipSuccess, #hipErrorInvalidContext + * + * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxSetCurrent, hipCtxGetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice */ - hipError_t hipCtxPopCurrent(hipCtx_t* ctx); /** @@ -1259,8 +1271,9 @@ hipError_t hipCtxPopCurrent(hipCtx_t* ctx); * @param [in] ctx * * @returns #hipSuccess, #hipErrorInvalidContext + * + * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize , hipCtxGetDevice */ - hipError_t hipCtxPushCurrent(hipCtx_t ctx); /** @@ -1268,9 +1281,10 @@ hipError_t hipCtxPushCurrent(hipCtx_t ctx); * * @param [in] ctx * - * @returns #hipSuccess + * @returns #hipSuccess, #hipErrorInvalidContext + * + * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize , hipCtxGetDevice */ - hipError_t hipCtxSetCurrent(hipCtx_t ctx); /** @@ -1278,9 +1292,10 @@ hipError_t hipCtxSetCurrent(hipCtx_t ctx); * * @param [out] ctx * - * @returns #hipSuccess + * @returns #hipSuccess, #hipErrorInvalidContext + * + * @see hipCtxCreate, hipCtxDestroy, hipCtxGetDevice, hipCtxGetFlags, hipCtxPopCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice */ - hipError_t hipCtxGetCurrent(hipCtx_t* ctx); /** @@ -1289,6 +1304,8 @@ hipError_t hipCtxGetCurrent(hipCtx_t* ctx); * @param [out] device * * @returns #hipSuccess, #hipErrorInvalidContext + * + * @see hipCtxCreate, 
hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize */ hipError_t hipCtxGetDevice(hipDevice_t *device); @@ -1296,53 +1313,81 @@ hipError_t hipCtxGetDevice(hipDevice_t *device); /** * @brief Returns the approximate HIP api version. * + * @param [in] ctx Context to check + * @param [out] apiVersion + * + * @return #hipSuccess + * * @warning The HIP feature set does not correspond to an exact CUDA SDK api revision. * This function always set *apiVersion to 4 as an approximation though HIP supports * some features which were introduced in later CUDA SDK revisions. * HIP apps code should not rely on the api revision number here and should * use arch feature flags to test device capabilities or conditional compilation. * + * @see hipCtxCreate, hipCtxDestroy, hipCtxGetDevice, hipCtxGetFlags, hipCtxPopCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice */ hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion); /** * @brief Set Cache configuration for a specific function * - * Note: AMD devices and recent Nvidia GPUS do not support reconfigurable cache. This hint is ignored on those architectures. + * @param [out] cacheConfiguration + * + * @return #hipSuccess * + * @warning AMD devices and recent Nvidia GPUS do not support reconfigurable cache. This hint is ignored on those architectures. + * + * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice */ hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig ); /** * @brief Set L1/Shared cache partition. + * + * @param [in] cacheConfiguration * - * Note: AMD devices and recent Nvidia GPUS do not support reconfigurable cache. This hint is ignored on those architectures. + * @return #hipSuccess + * + * @warning AMD devices and recent Nvidia GPUS do not support reconfigurable cache. 
This hint is ignored on those architectures. * + * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice */ hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig ); /** * @brief Set Shared memory bank configuration. * - * Note: AMD devices and recent Nvidia GPUS do not support shared cache banking, and the hint is ignored on those architectures. + * @param [in] config + * + * @return #hipSuccess + * + * @warning AMD devices and recent Nvidia GPUS do not support shared cache banking, and the hint is ignored on those architectures. * + * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice */ hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config ); /** * @brief Get Shared memory bank configuration. * - * Note: AMD devices and recent Nvidia GPUS do not support shared cache banking, and the hint is ignored on those architectures. + * @param [out] pConfig + * + * @return #hipSuccess + * + * @warning AMD devices and recent Nvidia GPUS do not support shared cache banking, and the hint is ignored on those architectures. * + * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice */ hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig ); /** * @brief Blocks until the default context has completed all preceding requested tasks. * - * This function waits for all streams on the default context to complete execution, and then returns. + * @return #hipSuccess + * + * @warning This function waits for all streams on the default context to complete execution, and then returns. * - * @returns #hipSuccess. 
+ * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxGetDevice */ hipError_t hipCtxSynchronize ( void ); @@ -1351,7 +1396,9 @@ hipError_t hipCtxSynchronize ( void ); * * @param [out] flags * - * @returns #hipSuccess. + * @returns #hipSuccess + * + * @see hipCtxCreate, hipCtxDestroy, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxGetCurrent, hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice */ hipError_t hipCtxGetFlags ( unsigned int* flags ); @@ -1366,8 +1413,9 @@ hipError_t hipCtxGetFlags ( unsigned int* flags ); * @param [in] peerCtx * @param [in] flags * - * Returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue, - * @returns #hipErrorPeerAccessAlreadyEnabled if peer access is already enabled for this device. + * @returns #hipSuccess, #hipErrorInvalidDevice, #hipErrorInvalidValue, #hipErrorPeerAccessAlreadyEnabled + * + * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice * @warning PeerToPeer support is experimental. */ hipError_t hipCtxEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags); @@ -1380,9 +1428,12 @@ hipError_t hipCtxEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags); * @param [in] peerCtx * * @returns #hipSuccess, #hipErrorPeerAccessNotEnabled + * + * @see hipCtxCreate, hipCtxDestroy, hipCtxGetFlags, hipCtxPopCurrent, hipCtxGetCurrent, hipCtxSetCurrent, hipCtxPushCurrent, hipCtxSetCacheConfig, hipCtxSynchronize, hipCtxGetDevice * @warning PeerToPeer support is experimental. 
*/ hipError_t hipCtxDisablePeerAccess (hipCtx_t peerCtx); + // doxygen end Context Management /** * @} From b7fcdc6121b4a9c6b9ce5fae57959b8103429d57 Mon Sep 17 00:00:00 2001 From: pensun Date: Mon, 26 Sep 2016 14:39:20 -0500 Subject: [PATCH 45/66] fix of HIPCC on libc++ option on HCC path Change-Id: Ie0d3213a165fa13f033b777b490eb60b980d02da --- bin/hipcc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/hipcc b/bin/hipcc index 8a8a1715da..4c4a0dd714 100755 --- a/bin/hipcc +++ b/bin/hipcc @@ -225,12 +225,12 @@ foreach $arg (@ARGV) $needLDFLAGS = 1; } - if($arg eq '-stdlib=libc++' and $setStdLib eq 0) + if(($arg eq '-stdlib=libc++') and ($setStdLib eq 0)) { $HIPCXXFLAGS .= " -stdlib=libc++"; $setStdLib = 1; } - if($arg eq '-stdlib=libstdc++' and $setStdLib eq 0) + if(($arg eq '-stdlib=libstdc++') and ($setStdLib eq 0)) { $HIPCXXFLAGS .= " -stdlib=libstdc++"; $setStdLib = 1; From 7bc1af27761afcc0bf152dc7fec7eb54cbc75360 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 22 Sep 2016 17:51:52 -0500 Subject: [PATCH 46/66] Small tool, doc, sample enhancements. - Expand message when HIP version mismatch detected. - Doc touchup. - change sorting of hipBusBandwidth so byte results shown at top. - Change-Id: Ifb4e44a5fdfb65d59c4994b11e5f13385705f7e0 --- bin/hipcc | 2 +- .../CUDA_Runtime_API_functions_supported_by_HIP.md | 2 +- docs/markdown/hip_faq.md | 6 +++--- samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp | 14 +++++++++++--- .../1_Utils/hipBusBandwidth/hipBusBandwidth.cpp | 4 ++-- tests/README.md | 4 +++- 6 files changed, 21 insertions(+), 11 deletions(-) diff --git a/bin/hipcc b/bin/hipcc index 4c4a0dd714..b6d358532c 100755 --- a/bin/hipcc +++ b/bin/hipcc @@ -333,7 +333,7 @@ if ($printHipVersion) { } if ($runCmd) { if ($HIP_PLATFORM eq "hcc" and exists($hipConfig{'HCC_VERSION'}) and $HCC_VERSION ne $hipConfig{'HCC_VERSION'}) { - print ("HIP was built using $hipConfig{'HCC_VERSION'}, but you are using $HCC_VERSION. 
Please rebuild HIP.\n") && die (); + print ("HIP ($HIP_PATH) was built using hcc $hipConfig{'HCC_VERSION'}, but you are using hcc $HCC_VERSION. Please rebuild HIP including cmake.\n") && die (); } system ("$CMD") and die (); } diff --git a/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md b/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md index 2d589ec415..4c92abbba6 100644 --- a/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md +++ b/docs/markdown/CUDA_Runtime_API_functions_supported_by_HIP.md @@ -49,7 +49,7 @@ | `cudaStreamDestroy` | `hipStreamDestroy` | Destroys and cleans up an asynchronous stream. | | `cudaStreamGetFlags` | `hipStreamGetFlags` | Query the flags of a stream. | | `cudaStreamGetPriority` | | Query the priority of a stream. | -| `cudaStreamQuery` | | Queries an asynchronous stream for completion status. | +| `cudaStreamQuery` | `hipStreamQuery` | Queries an asynchronous stream for completion status. | | `cudaStreamSynchronize` | `hipStreamSynchronize` | Waits for stream tasks to complete. | | `cudaStreamWaitEvent` | `hipStreamWaitEvent` | Make a compute stream wait on an event. | diff --git a/docs/markdown/hip_faq.md b/docs/markdown/hip_faq.md index 31a032469b..b09771ab71 100644 --- a/docs/markdown/hip_faq.md +++ b/docs/markdown/hip_faq.md @@ -146,11 +146,11 @@ The tools also struggle with more complex CUDA applications, in particular those - For Nvidia platforms, HIP requires Unified Memory and should run on a device which runs the CUDA SDK 6.0 or newer. We have tested the Nvidia Titan and K40. ### Does Hipify automatically convert all source code? -Typically, Hipify can automatically convert almost all run-time code, and the coordinate indexing device code. +Typically, Hipify can automatically convert almost all run-time code, and the coordinate indexing device code (i.e. threadIdx.x -> hipThreadIdx_x). 
Most device code needs no additional conversion, since HIP and CUDA have similar names for math and built-in functions. -HIP currently requires manual addition of one more arguments to the kernel so that the host can communicate the execution configuration to the device. +The clang-hipify tool will automatically modify the kernel signature as needed (automating a step that used to be done manually) Additional porting may be required to deal with architecture feature queries or with CUDA capabilities that HIP doesn't support. -Developers should always expect to perform some platform-specific tuning and optimization. +In general, developers should always expect to perform some platform-specific tuning and optimization. ### What is NVCC? NVCC is Nvidia's compiler driver for compiling "CUDA C++" code into PTX or device code for Nvidia GPUs. It's a closed-source binary product that comes with CUDA SDKs. diff --git a/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp b/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp index 2ec686f260..4be2ea258d 100644 --- a/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp +++ b/samples/1_Utils/hipBusBandwidth/ResultDatabase.cpp @@ -7,16 +7,22 @@ using namespace std; +#define SORT_RETAIN_ATTS_ORDER 1 + + bool ResultDatabase::Result::operator<(const Result &rhs) const { if (test < rhs.test) return true; if (test > rhs.test) return false; +#if (SORT_RETAIN_ATTS_ORDER == 0) + // For ties, sort by the value of the attribute: if (atts < rhs.atts) return true; if (atts > rhs.atts) return false; +#endif return false; // less-operator returns false on equal } @@ -189,7 +195,8 @@ void ResultDatabase::AddResult(const string &test_orig, void ResultDatabase::DumpDetailed(ostream &out) { vector sorted(results); - sort(sorted.begin(), sorted.end()); + + stable_sort(sorted.begin(), sorted.end()); const int testNameW = 24 ; const int attW = 12; @@ -281,7 +288,8 @@ void ResultDatabase::DumpDetailed(ostream &out) void 
ResultDatabase::DumpSummary(ostream &out) { vector sorted(results); - sort(sorted.begin(), sorted.end()); + + stable_sort(sorted.begin(), sorted.end()); const int testNameW = 24 ; const int attW = 12; @@ -377,7 +385,7 @@ void ResultDatabase::DumpCsv(string fileName) bool emptyFile; vector sorted(results); - sort(sorted.begin(), sorted.end()); + stable_sort(sorted.begin(), sorted.end()); //Check to see if the file is empty - if so, add the headers emptyFile = this->IsFileEmpty(fileName); diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index faff9ba6e9..a42a561ac7 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -49,8 +49,8 @@ std::string sizeToString(int size) using namespace std; stringstream ss; if (size < 0) { - // char (09, horiz tab) lexically sorts before " " so will cause Byte values to be displayed before kB. - ss << char(0x09)/*tab*/ << setfill('0') << setw(3) << -size << "B"; + // char (-) lexically sorts before " " so will cause Byte values to be displayed before kB. + ss << "+" << setfill('0') << setw(3) << -size << "By"; } else { ss << size << "kB"; } diff --git a/tests/README.md b/tests/README.md index de73652e43..56bb4e7edd 100644 --- a/tests/README.md +++ b/tests/README.md @@ -51,9 +51,11 @@ ctest -R Memcpy ``` -### If a test fails: +### If a test fails - how to debug a test Extract the commandline from the testing log: + +(From the test build directory, perhaps hip/tests/build) $ grep -A3 -m2 hipMemcpy-size Testing/Temporary/LastTest.log 36/47 Testing: hipMemcpy-size 36/47 Test: hipMemcpy-size From 225e37fdc9eaec5ae4d33c5b10863786c1ff51f6 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 26 Sep 2016 16:32:35 -0500 Subject: [PATCH 47/66] Fix signal resource issue. Remove memory leak with new hc::completion_future. Implement HIP_LAUNCH_BLOCKING with queue-level wait. 
Change-Id: I45975f81c4d239fdeed7776970988d28449865dc --- include/hcc_detail/hip_hcc.h | 2 +- src/hip_hcc.cpp | 15 ++++++++------- src/hip_memory.cpp | 5 +++-- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h index 6c061b01a9..9d4b70c73f 100644 --- a/include/hcc_detail/hip_hcc.h +++ b/include/hcc_detail/hip_hcc.h @@ -373,7 +373,7 @@ typedef uint64_t SeqNum_t ; //--- // Member functions that begin with locked_ are thread-safe accessors - these acquire / release the critical mutex. LockedAccessor_StreamCrit_t lockopen_preKernelCommand(); - void lockclose_postKernelCommand(hc::completion_future &kernel_future); + void lockclose_postKernelCommand(hc::accelerator_view *av); void locked_wait(bool assertQueueEmpty=false); diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 1058d96412..fcbc553f83 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -253,11 +253,12 @@ LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand() //--- // Must be called after kernel finishes, this releases the lock on the stream so other commands can submit. -void ihipStream_t::lockclose_postKernelCommand(hc::completion_future &kernelFuture) +void ihipStream_t::lockclose_postKernelCommand(hc::accelerator_view *av) { if (HIP_LAUNCH_BLOCKING) { - kernelFuture.wait(); + // TODO - fix this so it goes through proper stream::wait() call. + av->wait(); // direct wait OK since we know the stream is locked. 
tprintf(DB_SYNC, " %s LAUNCH_BLOCKING for kernel completion\n", ToString(this).c_str()); } @@ -1163,7 +1164,7 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, dim3 block, grid_ auto crit = stream->lockopen_preKernelCommand(); lp->av = &(crit->_av); - lp->cf = new hc::completion_future; + lp->cf = nullptr; ihipPrintKernelLaunch(kernelNameStr, lp, stream); return (stream); @@ -1185,7 +1186,7 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, dim3 block, gri auto crit = stream->lockopen_preKernelCommand(); lp->av = &(crit->_av); - lp->cf = new hc::completion_future; + lp->cf = nullptr; ihipPrintKernelLaunch(kernelNameStr, lp, stream); return (stream); } @@ -1206,7 +1207,7 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, dim3 grid, size_t block, gri auto crit = stream->lockopen_preKernelCommand(); lp->av = &(crit->_av); - lp->cf = new hc::completion_future; + lp->cf = nullptr; ihipPrintKernelLaunch(kernelNameStr, lp, stream); return (stream); } @@ -1227,7 +1228,7 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, g auto crit = stream->lockopen_preKernelCommand(); lp->av = &(crit->_av); - lp->cf = new hc::completion_future; // TODO, is this necessary? + lp->cf = nullptr; ihipPrintKernelLaunch(kernelNameStr, lp, stream); @@ -1240,7 +1241,7 @@ hipStream_t ihipPreLaunchKernel(hipStream_t stream, size_t grid, size_t block, g //This releases the lock on the stream. 
void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp) { - stream->lockclose_postKernelCommand(*(lp.cf)); + stream->lockclose_postKernelCommand(lp.av); } diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index d2bb084fc5..08c4b4392f 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -772,7 +772,7 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s } } - stream->lockclose_postKernelCommand(cf); + stream->lockclose_postKernelCommand(&crit->_av); if (HIP_LAUNCH_BLOCKING) { @@ -822,9 +822,10 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) e = hipErrorInvalidValue; } } + // TODO - is hipMemset supposed to be async? cf.wait(); - stream->lockclose_postKernelCommand(cf); + stream->lockclose_postKernelCommand(&crit->_av); if (HIP_LAUNCH_BLOCKING) { From 0d850c32dfb75adc389193ff19dd5fc2a98d0c18 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 27 Sep 2016 17:21:18 +0530 Subject: [PATCH 48/66] make test: Build tests in parallel Change-Id: If7b9d0519554226d09d8e6264eb248cd2dd53a2e --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7a480cb32a..ddff6b6769 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -316,8 +316,9 @@ if(POLICY CMP0037) endif() add_custom_target(install_for_test COMMAND "${CMAKE_COMMAND}" --build . --target install WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) +execute_process(COMMAND getconf _NPROCESSORS_ONLN OUTPUT_VARIABLE DASH_JAY OUTPUT_STRIP_TRAILING_WHITESPACE) add_custom_target(test COMMAND ${CMAKE_COMMAND} . 
- COMMAND make + COMMAND make -j ${DASH_JAY} COMMAND make test WORKING_DIRECTORY ${BUILD_DIR} DEPENDS install_for_test) From b4426e4c266acda456883472f408e8ab0928cf54 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 27 Sep 2016 17:22:23 +0530 Subject: [PATCH 49/66] HIT: Support make_named_test Change-Id: I7472c14595f10c4f2e2cf8d0228cc0373458a0b7 --- tests/hit/HIT.cmake | 51 +++++++++++++++++++++++++++++++++++++++++++++ tests/hit/parser | 43 ++++++++++++++++++++++++++------------ 2 files changed, 81 insertions(+), 13 deletions(-) diff --git a/tests/hit/HIT.cmake b/tests/hit/HIT.cmake index 847f8fdadd..82f0c5eabf 100644 --- a/tests/hit/HIT.cmake +++ b/tests/hit/HIT.cmake @@ -78,6 +78,33 @@ macro(PARSE_RUN_COMMAND _target _arguments _exclude_platforms) endforeach() endmacro() +# Helper macro to parse RUN_NAMED instructions +macro(PARSE_RUN_NAMED_COMMAND _target _testname _arguments _exclude_platforms) + set(${_target}) + set(${_arguments} " ") + set(${_exclude_platforms}) + set(_target_found FALSE) + set(_testname_found FALSE) + set(_exclude_platforms_found FALSE) + foreach(arg ${ARGN}) + if(NOT _target_found) + set(_target_found TRUE) + set(${_target} ${arg}) + elseif(NOT _testname_found) + set(_testname_found TRUE) + set(${_testname} ${arg}) + elseif("x${arg}" STREQUAL "xEXCLUDE_HIP_PLATFORM") + set(_exclude_platforms_found TRUE) + else() + if(_exclude_platforms_found) + set(${_exclude_platforms} ${arg}) + else() + list(APPEND ${_arguments} ${arg}) + endif() + endif() + endforeach() +endmacro() + # Helper macro to insert key/value pair into "hashmap" macro(INSERT_INTO_MAP _map _key _value) set("${_map}_${_key}" "${_value}") @@ -95,6 +122,12 @@ macro(MAKE_TEST exe) add_test(NAME ${testname} COMMAND ${PROJECT_BINARY_DIR}/${exe} ${ARGN}) set_tests_properties(${testname} PROPERTIES PASS_REGULAR_EXPRESSION "PASSED") endmacro() + +macro(MAKE_NAMED_TEST exe _testname) + set(testname ${PROJECT_NAME}/${_testname}.tst) + add_test(NAME ${testname} COMMAND 
${PROJECT_BINARY_DIR}/${exe} ${ARGN}) + set_tests_properties(${testname} PROPERTIES PASS_REGULAR_EXPRESSION "PASSED") +endmacro() #------------------------------------------------------------------------------- # Macro: HIT_ADD_FILES used to scan+add multiple files for testing. @@ -135,6 +168,24 @@ macro(HIT_ADD_FILES _dir) make_test(${_target} ${_arguments}) endif() endforeach() + + # Add named tests + execute_process(COMMAND ${HIP_SRC_PATH}/tests/hit/parser --runNamedCMDs ${file} + OUTPUT_VARIABLE _contents + ERROR_QUIET + WORKING_DIRECTORY ${_dir} + OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REGEX REPLACE "\n" ";" _contents "${_contents}") + foreach(_cmd ${_contents}) + string(REGEX REPLACE " " ";" _cmd "${_cmd}") + parse_run_named_command(_target _testname _arguments _exclude_platforms ${_cmd}) + read_from_map("_exclude" "${_target}" _exclude_platforms_from_build) + if(_exclude_platforms STREQUAL "all" OR _exclude_platforms STREQUAL ${HIP_PLATFORM} OR + _exclude_platforms_from_build STREQUAL "all" OR _exclude_platforms_from_build STREQUAL ${HIP_PLATFORM}) + else() + make_named_test(${_target} ${_testname} ${_arguments}) + endif() + endforeach() endforeach() endmacro() diff --git a/tests/hit/parser b/tests/hit/parser index 6f6f842587..3d851752e4 100755 --- a/tests/hit/parser +++ b/tests/hit/parser @@ -8,29 +8,38 @@ use File::Spec; sub parse_file { my $file = shift; (my $exe = $file) =~ s/\.[^.]+$//g; - my (@buildCMDs, @runCMDs); + my (@buildCMDs, @runCMDs, @runNamedCMDs); if (open (SOURCE, '<:encoding(UTF-8)', "$file")) { while () { my $line=$_; # Look for BUILD instructions if ($line =~ /^ \* BUILD:/) { - $line =~ s/^ \* BUILD: //g; # Remove " * BUILD: " - $line =~ s/%s/$file/g; # Substitute %s -> filename - $line =~ s/%t/$exe/g; # Substitute %t -> targetname - $line =~ s/\R//g; # Remove line endings + $line =~ s/^ \* BUILD: //g; # Remove " * BUILD: " + $line =~ s/%s/$file/g; # Substitute %s -> filename + $line =~ s/%t/$exe/g; # Substitute %t -> targetname + 
$line =~ s/\R//g; # Remove line endings push @buildCMDs, $line; } # Look for RUN instructions if ($line =~ /^ \* RUN:/) { - $line =~ s/^ \* RUN: //g; # Remove " * RUN: " - $line =~ s/%t/$exe/g; # Subsitute %t -> targetname - $line =~ s/\R//g; # Remove line endings + $line =~ s/^ \* RUN: //g; # Remove " * RUN: " + $line =~ s/%s/$file/g; # Substitute %s -> filename + $line =~ s/%t/$exe/g; # Subsitute %t -> targetname + $line =~ s/\R//g; # Remove line endings push @runCMDs, $line; } + # Look for RUN_NAMED instructions + if ($line =~ /^ \* RUN_NAMED:/) { + $line =~ s/^ \* RUN_NAMED: //g; # Remove " * RUN_NAMED: " + $line =~ s/%s/$file/g; # Substitute %s -> filename + $line =~ s/%t/$exe/g; # Subsitute %t -> targetname + $line =~ s/\R//g; # Remove line endings + push @runNamedCMDs, $line; + } } close(SOURCE); } - return (\@buildCMDs, \@runCMDs); + return (\@buildCMDs, \@runCMDs, \@runNamedCMDs); } # Exit if no arguments specified @@ -43,8 +52,9 @@ if(scalar @ARGV == 0){ my @options = (); my $retBuildCMDs = 0; my $retRunCMDs = 0; +my $retRunNamedCMDs = 0; foreach $arg (@ARGV) { - if ($retBuildCMDs or $retRunCMDs) { + if ($retBuildCMDs or $retRunCMDs or $retRunNamedCMDs) { push (@options, $arg); } if ($arg eq '--buildCMDs') { @@ -53,18 +63,21 @@ foreach $arg (@ARGV) { if ($arg eq '--runCMDs') { $retRunCMDs = 1; } + if ($arg eq '--runNamedCMDs') { + $retRunNamedCMDs = 1; + } } # Atleast one command needs to be specified -if ($retBuildCMDs eq 0 and $retRunCMDs eq 0) { - die "Usage: $0 <--buildCMDs|--runCMDs> FILENAMEs\n"; +if (($retBuildCMDs eq 0) and ($retRunCMDs eq 0) and ($retRunNamedCMDs eq 0)) { + die "Usage: $0 <--buildCMDs|--runCMDs|--runNamedCMDs> FILENAMEs\n"; } # Iterate over input files foreach $file (@options) { # Convert absolute path to path relative to working directory my $relfile = File::Spec->abs2rel($file); - my ($buildCMDs, $runCMDs) = parse_file("$relfile"); + my ($buildCMDs, $runCMDs, $runNamedCMDs) = parse_file("$relfile"); if ($retBuildCMDs) { # print 
"BuildCMDs:\n"; print "$_\n" for @$buildCMDs; @@ -73,6 +86,10 @@ foreach $file (@options) { # print "RunCMDs:\n"; print "$_\n" for @$runCMDs; } + if ($retRunNamedCMDs) { + # print "RunNamedCMDs:\n"; + print "$_\n" for @$runNamedCMDs; + } } # vim: ts=4:sw=4:expandtab:smartindent From 480727998854ce0866ed66a66b83a34a528fb517 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 27 Sep 2016 17:24:33 +0530 Subject: [PATCH 50/66] directed tests: Enable remaining tests in HIT infrastructure Change-Id: I800ee00d6f7d339bb5d2ed898a16362a8541ed3a --- tests/src/context/hipCtx_simple.cpp | 6 ++++++ tests/src/deviceLib/hipMathFunctions.cpp | 11 +++++++++++ tests/src/deviceLib/hipSimpleAtomicsTest.cpp | 6 ++++++ tests/src/deviceLib/hipTestDevice.cpp | 7 +++++++ tests/src/deviceLib/hipTestDeviceDouble.cpp | 7 +++++++ tests/src/deviceLib/hip_anyall.cpp | 5 +++++ tests/src/deviceLib/hip_ballot.cpp | 6 ++++++ tests/src/deviceLib/hip_brev.cpp | 6 ++++++ tests/src/deviceLib/hip_clz.cpp | 6 ++++++ tests/src/deviceLib/hip_ffs.cpp | 6 ++++++ tests/src/deviceLib/hip_popc.cpp | 6 ++++++ tests/src/deviceLib/hip_test_ldg.cpp | 7 +++++++ tests/src/kernel/hipGridLaunch.cpp | 6 ++++++ tests/src/kernel/hipLanguageExtensions.cpp | 6 ++++++ tests/src/kernel/launch_bounds.cpp | 5 +++++ tests/src/runtimeApi/memory/hipMemcpy.cpp | 10 ++++++++++ tests/src/runtimeApi/memory/hipMemcpyAll.cpp | 4 ++++ tests/src/runtimeApi/memory/hipMemcpy_simple.cpp | 8 ++++++++ tests/src/runtimeApi/memory/hipMemoryAllocate.cpp | 7 +++++++ tests/src/runtimeApi/memory/hipMemset.cpp | 12 ++++++++++++ .../runtimeApi/multiThread/hipMultiThreadDevice.cpp | 8 ++++++++ .../multiThread/hipMultiThreadStreams1.cpp | 9 +++++++-- .../multiThread/hipMultiThreadStreams2.cpp | 6 ++++++ tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp | 5 +++++ tests/src/runtimeApi/stream/hipAPIStreamEnable.cpp | 5 +++++ tests/src/runtimeApi/stream/hipStreamL5.cpp | 6 ++++++ tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp | 6 ++++++ 27 files 
changed, 180 insertions(+), 2 deletions(-) diff --git a/tests/src/context/hipCtx_simple.cpp b/tests/src/context/hipCtx_simple.cpp index a5e08a4551..18174065d6 100644 --- a/tests/src/context/hipCtx_simple.cpp +++ b/tests/src/context/hipCtx_simple.cpp @@ -20,6 +20,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* HIT_START + * BUILD: %t %s ../test_common.cpp HCC_OPTIONS --stdlib=libc++ + * RUN: %t + * HIT_END + */ + #include "hip_runtime.h" #include "test_common.h" diff --git a/tests/src/deviceLib/hipMathFunctions.cpp b/tests/src/deviceLib/hipMathFunctions.cpp index 4cfc9f0069..e3fb8bdd35 100644 --- a/tests/src/deviceLib/hipMathFunctions.cpp +++ b/tests/src/deviceLib/hipMathFunctions.cpp @@ -19,6 +19,17 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +/* HIT_START + * BUILD: %tHost %s hipSinglePrecisionMathHost.cpp hipDoublePrecisionMathHost.cpp ../test_common.cpp + * BUILD: %tDevice %s hipSinglePrecisionMathDevice.cpp hipDoublePrecisionMathDevice.cpp ../test_common.cpp + * BUILD: hipIntrinsics %s hipSinglePrecisionIntrinsics.cpp hipDoublePrecisionIntrinsics.cpp hipIntegerIntrinsics.cpp ../test_common.cpp + * RUN: %tHost + * RUN: %tDevice + * RUN: hipIntrinsics + * HIT_END + */ + #include "hip_runtime.h" #include "test_common.h" diff --git a/tests/src/deviceLib/hipSimpleAtomicsTest.cpp b/tests/src/deviceLib/hipSimpleAtomicsTest.cpp index 2faee1b5de..f72e4e995a 100644 --- a/tests/src/deviceLib/hipSimpleAtomicsTest.cpp +++ b/tests/src/deviceLib/hipSimpleAtomicsTest.cpp @@ -17,6 +17,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +/* HIT_START + * BUILD: %t %s ../test_common.cpp + * RUN: %t + * HIT_END + */ + // includes, system #include #include diff --git a/tests/src/deviceLib/hipTestDevice.cpp b/tests/src/deviceLib/hipTestDevice.cpp index c0812efac6..186d7cec6a 100644 --- a/tests/src/deviceLib/hipTestDevice.cpp +++ b/tests/src/deviceLib/hipTestDevice.cpp @@ -16,6 +16,13 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +/* HIT_START + * BUILD: %t %s ../test_common.cpp + * RUN: %t + * HIT_END + */ + #include"test_common.h" #include"hip_runtime.h" #include"hip_runtime_api.h" diff --git a/tests/src/deviceLib/hipTestDeviceDouble.cpp b/tests/src/deviceLib/hipTestDeviceDouble.cpp index f800fc9aef..12be3e96ba 100644 --- a/tests/src/deviceLib/hipTestDeviceDouble.cpp +++ b/tests/src/deviceLib/hipTestDeviceDouble.cpp @@ -16,6 +16,13 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +/* HIT_START + * BUILD: %t %s ../test_common.cpp + * RUN: %t + * HIT_END + */ + #include"test_common.h" #include"hip_runtime.h" #include"hip_runtime_api.h" diff --git a/tests/src/deviceLib/hip_anyall.cpp b/tests/src/deviceLib/hip_anyall.cpp index 2804e6211b..b23360ac92 100644 --- a/tests/src/deviceLib/hip_anyall.cpp +++ b/tests/src/deviceLib/hip_anyall.cpp @@ -20,6 +20,11 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* HIT_START + * BUILD: %t %s ../test_common.cpp + * RUN: %t + * HIT_END + */ #include #include diff --git a/tests/src/deviceLib/hip_ballot.cpp b/tests/src/deviceLib/hip_ballot.cpp index e1adb3095d..6bbe5d8adb 100644 --- a/tests/src/deviceLib/hip_ballot.cpp +++ b/tests/src/deviceLib/hip_ballot.cpp @@ -17,6 +17,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +/* HIT_START + * BUILD: %t %s ../test_common.cpp + * RUN: %t + * HIT_END + */ + #include #include diff --git a/tests/src/deviceLib/hip_brev.cpp b/tests/src/deviceLib/hip_brev.cpp index 0d97cba970..5f745035d0 100644 --- a/tests/src/deviceLib/hip_brev.cpp +++ b/tests/src/deviceLib/hip_brev.cpp @@ -20,6 +20,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* HIT_START + * BUILD: %t %s ../test_common.cpp + * RUN: %t + * HIT_END + */ + #include #include #include diff --git a/tests/src/deviceLib/hip_clz.cpp b/tests/src/deviceLib/hip_clz.cpp index 8e7685fde1..c8ac5fc3c6 100644 --- a/tests/src/deviceLib/hip_clz.cpp +++ b/tests/src/deviceLib/hip_clz.cpp @@ -20,6 +20,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* HIT_START + * BUILD: %t %s ../test_common.cpp + * RUN: %t + * HIT_END + */ + #include #include #include diff --git a/tests/src/deviceLib/hip_ffs.cpp b/tests/src/deviceLib/hip_ffs.cpp index 77d31a6776..cc60c0cca2 100644 --- a/tests/src/deviceLib/hip_ffs.cpp +++ b/tests/src/deviceLib/hip_ffs.cpp @@ -20,6 +20,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* HIT_START + * BUILD: %t %s ../test_common.cpp + * RUN: %t + * HIT_END + */ + #include #include #include diff --git a/tests/src/deviceLib/hip_popc.cpp b/tests/src/deviceLib/hip_popc.cpp index 3ab43ef194..cfa13621a2 100644 --- a/tests/src/deviceLib/hip_popc.cpp +++ b/tests/src/deviceLib/hip_popc.cpp @@ -20,6 +20,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +/* HIT_START + * BUILD: %t %s ../test_common.cpp + * RUN: %t + * HIT_END + */ + #include #include #include diff --git a/tests/src/deviceLib/hip_test_ldg.cpp b/tests/src/deviceLib/hip_test_ldg.cpp index fa4d402d5c..af6423f464 100644 --- a/tests/src/deviceLib/hip_test_ldg.cpp +++ b/tests/src/deviceLib/hip_test_ldg.cpp @@ -19,6 +19,13 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +/* HIT_START + * BUILD: %t %s ../test_common.cpp NVCC_OPTIONS --gpu-architecture=sm_35 + * RUN: %t + * HIT_END + */ + #include #include #include diff --git a/tests/src/kernel/hipGridLaunch.cpp b/tests/src/kernel/hipGridLaunch.cpp index b195a0171d..7b8f4f5c3e 100644 --- a/tests/src/kernel/hipGridLaunch.cpp +++ b/tests/src/kernel/hipGridLaunch.cpp @@ -21,6 +21,12 @@ THE SOFTWARE. */ // Test the Grid_Launch syntax. +/* HIT_START + * BUILD: %t %s ../test_common.cpp + * RUN: %t + * HIT_END + */ + #include "hip_runtime.h" #include "test_common.h" diff --git a/tests/src/kernel/hipLanguageExtensions.cpp b/tests/src/kernel/hipLanguageExtensions.cpp index e3d9519e11..767c74f366 100644 --- a/tests/src/kernel/hipLanguageExtensions.cpp +++ b/tests/src/kernel/hipLanguageExtensions.cpp @@ -21,6 +21,12 @@ THE SOFTWARE. */ // Collection of code to make sure that various features in the hip kernel language compile. +/* HIT_START + * BUILD: %t %s ../test_common.cpp HCC_OPTIONS --stdlib=libc++ + * RUN: %t + * HIT_END + */ + #include #include diff --git a/tests/src/kernel/launch_bounds.cpp b/tests/src/kernel/launch_bounds.cpp index f70327ad96..e41f782190 100644 --- a/tests/src/kernel/launch_bounds.cpp +++ b/tests/src/kernel/launch_bounds.cpp @@ -20,6 +20,11 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* HIT_START + * BUILD: %t %s ../test_common.cpp + * RUN: %t + * HIT_END + */ // Test launch bounds and initialization conditions. 
diff --git a/tests/src/runtimeApi/memory/hipMemcpy.cpp b/tests/src/runtimeApi/memory/hipMemcpy.cpp index faab1cef5f..659d16c23b 100644 --- a/tests/src/runtimeApi/memory/hipMemcpy.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpy.cpp @@ -19,6 +19,16 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN_NAMED: %t hipMemcpy-modes --tests 0x1 + * RUN_NAMED: %t hipMemcpy-size --tests 0x6 + * RUN_NAMED: %t hipMemcpy-multithreaded --tests 0x8 + * HIT_END + */ + +#include "hip_runtime.h" #include "hip_runtime.h" #include "test_common.h" diff --git a/tests/src/runtimeApi/memory/hipMemcpyAll.cpp b/tests/src/runtimeApi/memory/hipMemcpyAll.cpp index 49e7c94bdf..f8236b568d 100644 --- a/tests/src/runtimeApi/memory/hipMemcpyAll.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpyAll.cpp @@ -17,6 +17,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * HIT_END + */ #include #include diff --git a/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp b/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp index f348e5f0a6..8164896dbe 100644 --- a/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp @@ -19,6 +19,14 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * RUN_NAMED: %t hipMemcpyAsync-simple --async + * HIT_END + */ + #include "hip_runtime.h" #include "test_common.h" diff --git a/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp b/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp index dccc00b0e0..1f7599491a 100644 --- a/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp +++ b/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp @@ -16,6 +16,13 @@ LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * HIT_END + */ + #include"test_common.h" #define SIZE 1024*1024*256 diff --git a/tests/src/runtimeApi/memory/hipMemset.cpp b/tests/src/runtimeApi/memory/hipMemset.cpp index 9769acaf25..a319fb3984 100644 --- a/tests/src/runtimeApi/memory/hipMemset.cpp +++ b/tests/src/runtimeApi/memory/hipMemset.cpp @@ -22,6 +22,18 @@ THE SOFTWARE. // Simple test for memset. // Also serves as a template for other tests. 
+/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * //Small copy + * RUN: %t -N 10 --memsetval 0x42 + * // Oddball size + * RUN: %t -N 10013 --memsetval 0x5a + * // Big copy + * RUN: %t -N 256M --memsetval 0xa6 + * HIT_END + */ + #include "hip_runtime.h" #include "test_common.h" diff --git a/tests/src/runtimeApi/multiThread/hipMultiThreadDevice.cpp b/tests/src/runtimeApi/multiThread/hipMultiThreadDevice.cpp index d9afda59d0..73024a79b5 100644 --- a/tests/src/runtimeApi/multiThread/hipMultiThreadDevice.cpp +++ b/tests/src/runtimeApi/multiThread/hipMultiThreadDevice.cpp @@ -1,3 +1,11 @@ +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN_NAMED: %t hipMultiThreadDevice-serial --tests 0x1 + * RUN_NAMED: %t hipMultiThreadDevice-pyramid --tests 0x4 + * RUN_NAMED: %t hipMultiThreadDevice-nearzero --tests 0x10 + * HIT_END + */ + #include #include "test_common.h" diff --git a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp index 2dc8cb27e7..65ccc624cc 100644 --- a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp +++ b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp @@ -19,10 +19,15 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include "hip_runtime.h" -#include "test_common.h" +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * HIT_END + */ +#include "hip_runtime.h" +#include "test_common.h" void printSep() { diff --git a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp index ddeb8686f4..d6a43f752c 100644 --- a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp +++ b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp @@ -20,6 +20,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * HIT_END + */ + #include #include"test_common.h" #include diff --git a/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp b/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp index a419478458..a7cace0ebe 100644 --- a/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp +++ b/tests/src/runtimeApi/stream/hipAPIStreamDisable.cpp @@ -17,6 +17,11 @@ OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * HIT_END + */ + #include #include"test_common.h" diff --git a/tests/src/runtimeApi/stream/hipAPIStreamEnable.cpp b/tests/src/runtimeApi/stream/hipAPIStreamEnable.cpp index 1de6e4b880..7706ed2f0d 100644 --- a/tests/src/runtimeApi/stream/hipAPIStreamEnable.cpp +++ b/tests/src/runtimeApi/stream/hipAPIStreamEnable.cpp @@ -17,6 +17,11 @@ OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * HIT_END + */ + //#define HIP_API_PER_THREAD_DEFAULT_STREAM #include diff --git a/tests/src/runtimeApi/stream/hipStreamL5.cpp b/tests/src/runtimeApi/stream/hipStreamL5.cpp index b9d3a03c94..20d5459e06 100644 --- a/tests/src/runtimeApi/stream/hipStreamL5.cpp +++ b/tests/src/runtimeApi/stream/hipStreamL5.cpp @@ -20,6 +20,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * HIT_END + */ + #include "test_common.h" #include "hipStream.h" diff --git a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index 4ad093a16e..b41a1af4d9 100644 --- a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -17,6 +17,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * HIT_END + */ + // Test under-development. Calls async mem-copy API, experiment with functionality. #include "hip_runtime.h" From 4d56ac99e4622225bfb98a245e649455ce847fc3 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 27 Sep 2016 11:38:31 -0500 Subject: [PATCH 51/66] added negative testing for device APIs Change-Id: I7bfcdd777f240d36b29eca987f7bc9da7614d704 --- tests/src/Functional/Negative/Device/hipDeviceUtil.h | 10 ++++++++++ tests/src/Functional/Negative/Device/hipGetDevice.cpp | 10 ++++++++++ tests/src/Functional/Negative/Device/hipSetDevice.cpp | 10 ++++++++++ 3 files changed, 30 insertions(+) create mode 100644 tests/src/Functional/Negative/Device/hipDeviceUtil.h create mode 100644 tests/src/Functional/Negative/Device/hipGetDevice.cpp create mode 100644 tests/src/Functional/Negative/Device/hipSetDevice.cpp diff --git a/tests/src/Functional/Negative/Device/hipDeviceUtil.h b/tests/src/Functional/Negative/Device/hipDeviceUtil.h new file mode 100644 index 0000000000..392aa8c277 --- /dev/null +++ b/tests/src/Functional/Negative/Device/hipDeviceUtil.h @@ -0,0 +1,10 @@ +#ifndef HIPDEVICEUTIL_H +#define HIPDEVICEUTIL_H + +#include +#include + +#define HIP_CHECK(status, func) \ + std::cout<<#func<<" returned "< +#include +#include"hipDeviceUtil.h" + +int main() +{ + int device; + HIP_CHECK(hipGetDevice(NULL), hipGetDevice); + HIP_CHECK(hipGetDevice(&device), hipGetDevice); +} diff --git a/tests/src/Functional/Negative/Device/hipSetDevice.cpp b/tests/src/Functional/Negative/Device/hipSetDevice.cpp new file mode 100644 index 0000000000..4269627ad5 --- /dev/null +++ b/tests/src/Functional/Negative/Device/hipSetDevice.cpp @@ -0,0 +1,10 @@ +#include +#include +#include"hipDeviceUtil.h" + +int main() +{ + HIP_CHECK(hipSetDevice(0), hipSetDevice); + HIP_CHECK(hipSetDevice(1026), hipSetDevice); + HIP_CHECK(hipSetDevice(-1), hipSetDevice); +} From 3e64222184caeec708fbd8e3f998169be7242334 Mon Sep 
17 00:00:00 2001 From: Aditya Atluri Date: Tue, 27 Sep 2016 11:47:58 -0500 Subject: [PATCH 52/66] fallthrough if deviceId pointer is null Change-Id: I924996d60d0286a7be1d18881ee733459de2981c --- src/hip_device.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/hip_device.cpp b/src/hip_device.cpp index 6ea4658a4f..760e00c5f1 100644 --- a/src/hip_device.cpp +++ b/src/hip_device.cpp @@ -33,11 +33,15 @@ hipError_t hipGetDevice(int *deviceId) auto ctx = ihipGetTlsDefaultCtx(); - if (ctx == nullptr) { - e = hipErrorInvalidDevice; // TODO, check error code. - *deviceId = -1; - } else { - *deviceId = ctx->getDevice()->_deviceId; + if(deviceId != nullptr){ + if (ctx == nullptr) { + e = hipErrorInvalidDevice; // TODO, check error code. + *deviceId = -1; + } else { + *deviceId = ctx->getDevice()->_deviceId; + } + }else{ + e = hipErrorInvalidDevice; } return ihipLogStatus(e); From 1b14393a85eb0644b87969e11542365e37a7837b Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 27 Sep 2016 13:04:35 -0500 Subject: [PATCH 53/66] added more device negative testing 1. Added fallback for nullptr to hipGetDeviceProperties and hipGetDeviceCount 2. 
Added negative tests for hipGetDeviceProperties and hipGetDeviceCount Change-Id: Iac93fd53d7d4794fb10546ddadf6ca802b047c87 --- src/hip_device.cpp | 29 +++++++++++++------ .../Negative/Device/hipGetDeviceCount.cpp | 10 +++++++ .../Device/hipGetDeviceProperties.cpp | 14 +++++++++ 3 files changed, 44 insertions(+), 9 deletions(-) create mode 100644 tests/src/Functional/Negative/Device/hipGetDeviceCount.cpp create mode 100644 tests/src/Functional/Negative/Device/hipGetDeviceProperties.cpp diff --git a/src/hip_device.cpp b/src/hip_device.cpp index 760e00c5f1..407b8c2ae9 100644 --- a/src/hip_device.cpp +++ b/src/hip_device.cpp @@ -52,13 +52,20 @@ hipError_t hipGetDeviceCount(int *count) { HIP_INIT_API(count); - *count = g_deviceCnt; + hipError_t e = hipSuccess; - if (*count > 0) { - return ihipLogStatus(hipSuccess); + if(count != nullptr) { + *count = g_deviceCnt; + + if (*count > 0) { + e = ihipLogStatus(hipSuccess); + } else { + e = ihipLogStatus(hipErrorNoDevice); + } } else { - return ihipLogStatus(hipErrorNoDevice); + e = ihipLogStatus(hipErrorNoDevice); } + return e; } hipError_t hipDeviceSetCacheConfig ( hipFuncCache cacheConfig ) @@ -217,12 +224,16 @@ hipError_t hipGetDeviceProperties(hipDeviceProp_t* props, int device) hipError_t e; - auto * hipDevice = ihipGetDevice(device); - if (hipDevice) { + if(props != nullptr){ + auto * hipDevice = ihipGetDevice(device); + if (hipDevice) { // copy saved props - *props = hipDevice->_props; - e = hipSuccess; - } else { + *props = hipDevice->_props; + e = hipSuccess; + } else { + e = hipErrorInvalidDevice; + } + }else{ e = hipErrorInvalidDevice; } diff --git a/tests/src/Functional/Negative/Device/hipGetDeviceCount.cpp b/tests/src/Functional/Negative/Device/hipGetDeviceCount.cpp new file mode 100644 index 0000000000..c9f8ed3864 --- /dev/null +++ b/tests/src/Functional/Negative/Device/hipGetDeviceCount.cpp @@ -0,0 +1,10 @@ +#include +#include +#include"hipDeviceUtil.h" + +int main() +{ + int deviceCnt; + 
HIP_CHECK(hipGetDeviceCount(&deviceCnt), hipGetDeviceCount); + HIP_CHECK(hipGetDeviceCount(0), hipGetDeviceCount); +} diff --git a/tests/src/Functional/Negative/Device/hipGetDeviceProperties.cpp b/tests/src/Functional/Negative/Device/hipGetDeviceProperties.cpp new file mode 100644 index 0000000000..964df95d20 --- /dev/null +++ b/tests/src/Functional/Negative/Device/hipGetDeviceProperties.cpp @@ -0,0 +1,14 @@ +#include +#include +#include"hipDeviceUtil.h" + +int main() +{ + hipDeviceProp_t props; + HIP_CHECK(hipGetDeviceProperties(&props, 0), hipGetDeviceProperties); + HIP_CHECK(hipGetDeviceProperties(NULL, 0), hipGetDeviceProperties); + HIP_CHECK(hipGetDeviceProperties(NULL, -1), hipGetDeviceProperties); + HIP_CHECK(hipGetDeviceProperties(&props, -1), hipGetDeviceProperties); + HIP_CHECK(hipGetDeviceProperties(NULL, 1024), hipGetDeviceProperties); + HIP_CHECK(hipGetDeviceProperties(&props, 1024), hipGetDeviceProperties); +} From 5b45c97a30d621f6a3d02b48179658dae80d9be8 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 27 Sep 2016 13:33:42 -0500 Subject: [PATCH 54/66] Fixed hipDeviceGetAttribute 1. Added negative test for hipDeviceGetAttribute 2. 
Fixed hipDeviceGetAttribute if int ptr input is null Change-Id: I0e31f50fa407701fddf96e4eb64a87a371ff5d95 --- src/hip_device.cpp | 5 +++++ .../Negative/Device/hipDeviceGetAttribute.cpp | 21 +++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 tests/src/Functional/Negative/Device/hipDeviceGetAttribute.cpp diff --git a/src/hip_device.cpp b/src/hip_device.cpp index 407b8c2ae9..ef498acfcd 100644 --- a/src/hip_device.cpp +++ b/src/hip_device.cpp @@ -155,6 +155,8 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) hipError_t e = hipSuccess; + if(pi != nullptr) { + auto * hipDevice = ihipGetDevice(device); hipDeviceProp_t *prop = &hipDevice->_props; if (hipDevice) { @@ -215,6 +217,9 @@ hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) } else { e = hipErrorInvalidDevice; } + }else{ + e = hipErrorInvalidDevice; + } return ihipLogStatus(e); } diff --git a/tests/src/Functional/Negative/Device/hipDeviceGetAttribute.cpp b/tests/src/Functional/Negative/Device/hipDeviceGetAttribute.cpp new file mode 100644 index 0000000000..53aa812e06 --- /dev/null +++ b/tests/src/Functional/Negative/Device/hipDeviceGetAttribute.cpp @@ -0,0 +1,21 @@ +#include +#include +#include"hipDeviceUtil.h" + +int main() +{ + int pi; + int attr = 0; +// hipDeviceAttribute_t attr = hipDeviceAttributeMaxThreadsPerBlock; + HIP_CHECK(hipDeviceGetAttribute(nullptr, hipDeviceAttribute_t(attr), 0), hipDeviceGetAttribute); + HIP_CHECK(hipDeviceGetAttribute(&pi, hipDeviceAttribute_t(attr), 0), hipDeviceGetAttribute); + attr = -1; + HIP_CHECK(hipDeviceGetAttribute(nullptr, hipDeviceAttribute_t(attr), 0), hipDeviceGetAttribute); + HIP_CHECK(hipDeviceGetAttribute(&pi, hipDeviceAttribute_t(attr), 0), hipDeviceGetAttribute); + attr = 0; + HIP_CHECK(hipDeviceGetAttribute(nullptr, hipDeviceAttribute_t(attr), -1), hipDeviceGetAttribute); + HIP_CHECK(hipDeviceGetAttribute(&pi, hipDeviceAttribute_t(attr), -1), hipDeviceGetAttribute); + attr = 
-1; + HIP_CHECK(hipDeviceGetAttribute(nullptr, hipDeviceAttribute_t(attr), -1), hipDeviceGetAttribute); + HIP_CHECK(hipDeviceGetAttribute(&pi, hipDeviceAttribute_t(attr), -1), hipDeviceGetAttribute); +} From c36ba0ea5825c012cfab3a2a8047deecc66b8086 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 27 Sep 2016 14:28:25 -0500 Subject: [PATCH 55/66] Add iterations option to MT test. Change-Id: I945706a75601006ee55a408f965483dd263190ef --- .../runtimeApi/multiThread/hipMultiThreadStreams2.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp index d6a43f752c..bc28ebfd16 100644 --- a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp +++ b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp @@ -31,6 +31,8 @@ THE SOFTWARE. #include #define N 1000 + + template __global__ void Inc(hipLaunchParm lp, T *Array){ int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; @@ -90,7 +92,10 @@ void run(size_t size, hipStream_t stream1, hipStream_t stream2){ HIPASSERT(Ehh[10] = Ahh[10] + 1.0f); } -int main(int argc, char **argv){ +int main(int argc, char **argv) +{ + iterations = 100; + HipTest::parseStandardArguments(argc, argv, true); @@ -100,6 +105,8 @@ int main(int argc, char **argv){ } const size_t size = N * sizeof(float); + + for (int i=0; i< iterations; i++) { std::thread t1(run1, size, stream[0]); std::thread t2(run1, size, stream[0]); @@ -109,6 +116,7 @@ int main(int argc, char **argv){ t2.join(); // std::cout<<"T2"< Date: Tue, 27 Sep 2016 14:53:13 -0500 Subject: [PATCH 56/66] Add debug option to print ThreadID with each message. Also print messages with single fprintf to prevents threads from interleaving. 
Change-Id: Ib3999fe6b1e67b4a16cd7dcde82f3dfc99dd48ff --- include/hcc_detail/hip_hcc.h | 28 ++++++++++++++++++++++++---- src/hip_hcc.cpp | 2 ++ 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h index 9d4b70c73f..34c743d07e 100644 --- a/include/hcc_detail/hip_hcc.h +++ b/include/hcc_detail/hip_hcc.h @@ -109,6 +109,19 @@ extern const char *API_COLOR_END; #endif +#define DB_SHOW_TID 1 + +#if DB_SHOW_TID +#define COMPUTE_TID_STR \ + std::stringstream tid_ss;\ + std::stringstream tid_ss_num;\ + tid_ss_num << std::this_thread::get_id();\ + tid_ss << " tid:" << std::hex << std::stoull(tid_ss_num.str()); +#else +#define COMPUTE_TID_STR std::stringstream tid_ss; +#endif + + // Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function boundary. // TODO - currently we print the trace message at the beginning. if we waited, we could also include return codes, and any values returned // through ptr-to-args (ie the pointers allocated by hipMalloc). @@ -127,7 +140,8 @@ extern const char *API_COLOR_END; if (HIP_ATP_MARKER || (COMPILE_HIP_DB && HIP_TRACE_API)) {\ std::string s = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')';\ if (COMPILE_HIP_DB && HIP_TRACE_API) {\ - fprintf (stderr, "%s<_mutex.lock(); }; ~LockedAccessor() { if (_autoUnlock) { + tprintf(DB_SYNC, "auto-unlock critical data %s.%p\n",typeid(T).name(), _criticalData); _criticalData->_mutex.unlock(); } } void unlock() { + tprintf(DB_SYNC, "unlock critical data %s.%p\n", typeid(T).name(), _criticalData); _criticalData->_mutex.unlock(); } diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index fcbc553f83..1055e0ce46 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -1359,6 +1359,7 @@ bool ihipStream_t::canSeePeerMemory(const ihipCtx_t *thisCtx, ihipCtx_t *dstCtx, // Use blocks to control scope of critical sections. 
{ LockedAccessor_CtxCrit_t ctxCrit(dstCtx->criticalData()); + tprintf(DB_SYNC, "dstCrit lock succeeded\n"); if (!ctxCrit->isPeer(thisCtx)) { return false; }; @@ -1366,6 +1367,7 @@ bool ihipStream_t::canSeePeerMemory(const ihipCtx_t *thisCtx, ihipCtx_t *dstCtx, { LockedAccessor_CtxCrit_t ctxCrit(srcCtx->criticalData()); + tprintf(DB_SYNC, "srcCrit lock succeeded\n"); if (!ctxCrit->isPeer(thisCtx)) { return false; }; From 4ff6dc8f387b326492fb5ce60a774f4416158ad9 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 27 Sep 2016 15:27:21 -0500 Subject: [PATCH 57/66] Refactor asyncCopy and syncCopy to fix deadlock case. - Minimize time that locks are held. - Eliminate copy code that locked stream and ctx at same time. - Stream was locked to ensure thread-safe enqueue to the queue. - Devices were locked to query peer-lists. Change-Id: Ibe8880bb7fb995a3da8f90ff911f212d81525018 --- include/hcc_detail/hip_hcc.h | 12 ++++++++--- src/hip_hcc.cpp | 40 ++++++++++++------------------------ src/hip_memory.cpp | 8 ++++---- 3 files changed, 26 insertions(+), 34 deletions(-) diff --git a/include/hcc_detail/hip_hcc.h b/include/hcc_detail/hip_hcc.h index 34c743d07e..09f38ec331 100644 --- a/include/hcc_detail/hip_hcc.h +++ b/include/hcc_detail/hip_hcc.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Link errors represented as this:Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights @@ -371,10 +371,17 @@ class ihipStreamCriticalBase_t : public LockedBase }; +// if HIP code needs to acquire locks for both ihipCtx_t and ihipStream_t, it should first acquire the lock +// for the ihipCtx_t and then for the individual streams. 
The locks should not be acquired in reverse order +// or deadlock may occur. In some cases, it may be possible to reduce the range where the locks must be held. +// HIP routines should avoid acquiring and releasing the same lock during the execution of a single HIP API. + + typedef ihipStreamCriticalBase_t ihipStreamCritical_t; typedef LockedAccessor LockedAccessor_StreamCrit_t; +//--- // Internal stream structure. class ihipStream_t { public: @@ -383,11 +390,10 @@ typedef uint64_t SeqNum_t ; ~ihipStream_t(); // kind is hipMemcpyKind - void copySync (LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn = true); void locked_copySync (void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn = true); - void copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind); + void locked_copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind); //--- diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 1055e0ce46..3c8dfb1f3e 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -1409,7 +1409,7 @@ unsigned ihipStream_t::resolveMemcpyDirection(bool srcTracked, bool dstTracked, // TODO - remove kind parm from here or use it below? -void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn) +void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn) { ihipCtx_t *ctx = this->getCtx(); const ihipDevice_t *device = ctx->getDevice(); @@ -1436,7 +1436,7 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const }; - // If this is P2P accessi, we need to check to see if the copy agent (specified by the stream where the copy is enqueued) + // If this is P2P access, we need to check to see if the copy agent (specified by the stream where the copy is enqueued) // has peer access enabled to both the source and dest. 
If this is true, then the copy agent can see both pointers // and we can perform the access with the copy engine from the current stream. If not true, then we will copy through the host. (forceHostCopyEngine=true). bool forceHostCopyEngine = false; @@ -1449,21 +1449,15 @@ void ihipStream_t::copySync(LockedAccessor_StreamCrit_t &crit, void* dst, const } }; - crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceHostCopyEngine); + { + LockedAccessor_StreamCrit_t crit (_criticalData); + crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceHostCopyEngine); + } } -// Sync copy that acquires lock: -void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn) +void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind) { - LockedAccessor_StreamCrit_t crit (_criticalData); - copySync(crit, dst, src, sizeBytes, kind, resolveOn); -} - - -void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind) -{ - LockedAccessor_StreamCrit_t crit(_criticalData); const ihipCtx_t *ctx = this->getCtx(); @@ -1478,12 +1472,12 @@ void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsig /* As this is a CPU op, we need to wait until all the commands in current stream are finished. */ + LockedAccessor_StreamCrit_t crit(_criticalData); this->wait(crit); memcpy(dst, src, sizeBytes); } else { - bool trueAsync = true; hc::accelerator acc; hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); @@ -1498,17 +1492,12 @@ void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsig } - - // "tracked" really indicates if the pointer's virtual address is available in the GPU address space. // If both pointers are not tracked, we need to fall back to a sync copy. 
- if (!dstTracked || !srcTracked || !copyEngineCanSeeSrcAndDest) { - trueAsync = false; - } + if (dstTracked && srcTracked && copyEngineCanSeeSrcAndDest) { + LockedAccessor_StreamCrit_t crit(_criticalData); - - if (trueAsync == true) { - // Perform a synchronous copy: + // Perform asynchronous copy: try { crit->_av.copy_async(src, dst, sizeBytes); } catch (Kalmar::runtime_exception) { @@ -1520,12 +1509,9 @@ void ihipStream_t::copyAsync(void* dst, const void* src, size_t sizeBytes, unsig tprintf(DB_SYNC, "LAUNCH_BLOCKING for completion of hipMemcpyAsync(%zu)\n", sizeBytes); this->wait(crit); } + } else { - // Perform a synchronous copy: - if (kind == hipMemcpyDefault) { - kind = resolveMemcpyDirection(srcTracked, dstTracked, srcPtrInfo._isInDeviceMem, dstPtrInfo._isInDeviceMem); - } - copySync(crit, dst, src, sizeBytes, kind); + locked_copySync(dst, src, sizeBytes, kind); } } } diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 08c4b4392f..68811be8ee 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -508,7 +508,7 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp e= hipErrorInvalidValue; } else if (stream) { try { - stream->copyAsync(dst, src, sizeBytes, kind); + stream->locked_copyAsync(dst, src, sizeBytes, kind); } catch (ihipException ex) { e = ex._code; @@ -534,7 +534,7 @@ hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dst, void* src, size_t sizeBytes, h e= hipErrorInvalidValue; } else if (stream) { try { - stream->copyAsync((void*)dst, src, sizeBytes, kind); + stream->locked_copyAsync((void*)dst, src, sizeBytes, kind); } catch (ihipException ex) { e = ex._code; @@ -561,7 +561,7 @@ hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t siz e= hipErrorInvalidValue; } else if (stream) { try { - stream->copyAsync((void*)dst, (void*)src, sizeBytes, kind); + stream->locked_copyAsync((void*)dst, (void*)src, sizeBytes, kind); } catch (ihipException ex) { e = ex._code; @@ -587,7 +587,7 @@ hipError_t 
hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t sizeBytes, h e= hipErrorInvalidValue; } else if (stream) { try { - stream->copyAsync(dst, (void*)src, sizeBytes, kind); + stream->locked_copyAsync(dst, (void*)src, sizeBytes, kind); } catch (ihipException ex) { e = ex._code; From eac2533a92957208e4472fa717d86ec59da637c5 Mon Sep 17 00:00:00 2001 From: pensun Date: Tue, 27 Sep 2016 23:00:11 -0500 Subject: [PATCH 58/66] change hipEnvVarDriver to reduce communications to hipEnvVar Change-Id: I65a5f67dd51d53594fa51f88505e32321643d189 --- tests/src/hipEnvVarDriver.cpp | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/tests/src/hipEnvVarDriver.cpp b/tests/src/hipEnvVarDriver.cpp index 255cc72806..7046849525 100644 --- a/tests/src/hipEnvVarDriver.cpp +++ b/tests/src/hipEnvVarDriver.cpp @@ -27,13 +27,9 @@ THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include #include #include - +#include using namespace std; -//./hipEnvVar -c -d 0 -h - //putenv("SomeVariable=SomeValue"); - //putenv("export HIP_VISIBLE_DEVICES=0,1,2,3"); - int getDeviceNumber(){ FILE *in; char buff[512]; @@ -46,7 +42,8 @@ int getDeviceNumber(){ return atoi(buff); } -int getDevicePCIBusNum(int deviceID){ +// Query the current device ID remotely to hipEnvVar +int getDevicePCIBusNumRemote(int deviceID){ FILE *in; char buff[512]; string str = "./hipEnvVar -d "; @@ -59,6 +56,19 @@ int getDevicePCIBusNum(int deviceID){ return atoi(buff); } +// Query the current device ID locally +int getDevicePCIBusNum(int deviceID){ + hipSetDevice(deviceID); + hipDeviceProp_t devProp; + + hipGetDeviceProperties(&devProp, deviceID); + if (devProp.major < 1) { + printf("%d does not support HIP\n", deviceID); + return -1; + } + return devProp.pciBusID; +} + int main() { unsetenv("HIP_VISIBLE_DEVICES"); unsetenv("CUDA_VISIBLE_DEVICES"); @@ -78,11 +88,9 @@ int main() { for (int i = 0; i < totalDeviceNum ; i++) { setenv("HIP_VISIBLE_DEVICES",(char*)std::to_string(i).c_str(),1); 
setenv("CUDA_VISIBLE_DEVICES",(char*)std::to_string(i).c_str(),1); - //cout<<"HIP_VISIBLE_DEVICES is "< Date: Thu, 29 Sep 2016 10:29:18 +0530 Subject: [PATCH 59/66] Rebuild hip_hcc if hcc version changes And also force rebuild of .buildInfo & .version everytime Change-Id: I97e0d3c24fd693366a293803088014d13ca640cc --- CMakeLists.txt | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index ddff6b6769..cd754db4d0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -145,6 +145,9 @@ add_to_config(_buildInfo COMPILE_HIP_ATP_MARKER) ############################# # Build steps ############################# +# Rebuild cmake cache updates .buildInfo and .version +add_custom_target(update_build_and_version_info ALL COMMAND make rebuild_cache) + # Build clang hipify if enabled if(BUILD_CLANG_HIPIFY) add_subdirectory(clang-hipify) @@ -192,6 +195,13 @@ if(HIP_PLATFORM STREQUAL "hcc") add_library(hip_hcc SHARED ${SOURCE_FILES}) endif() + # Generate hcc_version.txt + add_custom_target(query_hcc_version COMMAND ${HCC_HOME}/bin/hcc --version > ${PROJECT_BINARY_DIR}/hcc_version.tmp) + add_custom_target(check_hcc_version COMMAND ${CMAKE_COMMAND} -E copy_if_different ${PROJECT_BINARY_DIR}/hcc_version.tmp ${PROJECT_BINARY_DIR}/hcc_version.txt DEPENDS query_hcc_version) + set_source_files_properties(${PROJECT_BINARY_DIR}/hcc_version.txt PROPERTIES GENERATED TRUE) + set_source_files_properties(${SOURCE_FILES} PROPERTIES OBJECT_DEPENDS ${PROJECT_BINARY_DIR}/hcc_version.txt) + add_dependencies(hip_hcc check_hcc_version update_build_and_version_info) + # Generate .buildInfo file(WRITE "${PROJECT_BINARY_DIR}/.buildInfo" ${_buildInfo}) endif() From 4fa5e980efd31d026534a8a213f1c94e39592c35 Mon Sep 17 00:00:00 2001 From: pensun Date: Thu, 29 Sep 2016 01:22:19 -0500 Subject: [PATCH 60/66] add hipEvent* macros on NV path Change-Id: I28ff0fa24f69560e13366e7cd8d3a485665c67e1 --- include/nvcc_detail/hip_runtime_api.h | 16 +++++++++++----- 1 file changed, 11 
insertions(+), 5 deletions(-) diff --git a/include/nvcc_detail/hip_runtime_api.h b/include/nvcc_detail/hip_runtime_api.h index 0814de8ec9..a36b951400 100644 --- a/include/nvcc_detail/hip_runtime_api.h +++ b/include/nvcc_detail/hip_runtime_api.h @@ -50,6 +50,12 @@ hipMemcpyHostToHost } hipTextureFilterMode;*/ #define hipFilterModePoint cudaFilterModePoint +//! Flags that can be used with hipEventCreateWithFlags: +#define hipEventDefault cudaEventDefault +#define hipEventBlockingSync cudaEventBlockingSync +#define hipEventDisableTiming cudaEventDisableTiming +#define hipEventInterprocess cudaEventInterprocess + #define hipHostMallocDefault cudaHostAllocDefault #define hipHostMallocPortable cudaHostAllocPortable #define hipHostMallocMapped cudaHostAllocMapped @@ -230,13 +236,13 @@ inline static hipError_t hipChooseDevice( int* device, const hipDeviceProp_t* pr return hipCUDAErrorTohipError(cudaChooseDevice(device,&cdprop)); } -inline static hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, +inline static hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t size) { return hipCUResultTohipError(cuMemcpyHtoD(dst, src, size)); } -inline static hipError_t hipMemcpyDtoH(void* dst, +inline static hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t size) { return hipCUResultTohipError(cuMemcpyDtoH(dst, src, size)); @@ -248,13 +254,13 @@ inline static hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, return hipCUResultTohipError(cuMemcpyDtoD(dst, src, size)); } -inline static hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dst, +inline static hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dst, void* src, size_t size, hipStream_t stream) { return hipCUResultTohipError(cuMemcpyHtoDAsync(dst, src, size, stream)); } -inline static hipError_t hipMemcpyDtoHAsync(void* dst, +inline static hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t size, hipStream_t stream) { return hipCUResultTohipError(cuMemcpyDtoH(dst, src, size)); @@ -747,7 +753,7 @@ inline static 
hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int sharedMemBytes, hipStream_t stream, void **kernelParams, void **extra) { - return hipCUResultTohipError(cuLaunchKernel(f, + return hipCUResultTohipError(cuLaunchKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, stream, kernelParams, extra)); From 1e56dead78e725fed3c22d97e6b3d0bee3a8b322 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 29 Sep 2016 10:06:26 -0500 Subject: [PATCH 61/66] Fixed issue on git #39 Change-Id: I5b507fc2b544df0cd3a900a2763c7a3ad6295c13 --- include/hcc_detail/hip_runtime_api.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hcc_detail/hip_runtime_api.h b/include/hcc_detail/hip_runtime_api.h index 6992be8475..ee4ff2fd2b 100644 --- a/include/hcc_detail/hip_runtime_api.h +++ b/include/hcc_detail/hip_runtime_api.h @@ -1083,7 +1083,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes ); * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorMemoryFree */ #if __cplusplus -hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t = 0 ); +hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t stream = 0 ); #else hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t stream); #endif From e04b8d6aa7f9dd27dbcc647fda1f5aa8aa4a5004 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 29 Sep 2016 13:54:34 -0500 Subject: [PATCH 62/66] Added C guard for math headers Change-Id: I0e71819c6436b3e6c99b2deddb2ee2d2a16acedd --- include/hcc_detail/hip_runtime.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/include/hcc_detail/hip_runtime.h b/include/hcc_detail/hip_runtime.h index 92406b4a75..c89faaae11 100644 --- a/include/hcc_detail/hip_runtime.h +++ b/include/hcc_detail/hip_runtime.h @@ -33,10 +33,13 @@ THE SOFTWARE. 
//#include +#if __cplusplus #include +#else +#include #include #include - +#endif // Define NVCC_COMPAT for CUDA compatibility #define NVCC_COMPAT #define CUDA_SUCCESS hipSuccess @@ -493,6 +496,13 @@ __device__ float __dsqrt_rd(double x); __device__ float __dsqrt_rn(double x); __device__ float __dsqrt_ru(double x); __device__ float __dsqrt_rz(double x); + +/** + * CUDA 8 device function features + + */ + + /** * Kernel launching */ From 6a97ff68a46f5bd59fd5272b967d0f74e3868392 Mon Sep 17 00:00:00 2001 From: pensun Date: Thu, 29 Sep 2016 15:11:57 -0500 Subject: [PATCH 63/66] update hipcc to search for lib using CUDA_PATH on NV path Change-Id: I7e19d1f82237a8a13f3d8284b313ac049be3f920 --- bin/hipcc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipcc b/bin/hipcc index b6d358532c..9c174dc5a5 100755 --- a/bin/hipcc +++ b/bin/hipcc @@ -152,7 +152,7 @@ if ($HIP_PLATFORM eq "hcc") { $HIPCC="$CUDA_PATH/bin/nvcc"; $HIPCXXFLAGS .= " -I$CUDA_PATH/include"; - $HIPLDFLAGS = "-lcuda -lcudart"; + $HIPLDFLAGS = "-lcuda -lcudart -L$CUDA_PATH/lib64"; } else { printf ("error: unknown HIP_PLATFORM = '$HIP_PLATFORM'"); exit (-1); From 2835e86e39c05807a422d13ea314307e283210b4 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 29 Sep 2016 19:44:00 -0500 Subject: [PATCH 64/66] added vector types for C path Change-Id: I4ccd4082f1b2ce4f0f4fd9fb498506234783a803 --- include/hcc_detail/hip_vector_types.h | 149 +++++++++++++++++++++++++- 1 file changed, 147 insertions(+), 2 deletions(-) diff --git a/include/hcc_detail/hip_vector_types.h b/include/hcc_detail/hip_vector_types.h index 1537b0fb70..3b0cab031d 100644 --- a/include/hcc_detail/hip_vector_types.h +++ b/include/hcc_detail/hip_vector_types.h @@ -32,11 +32,11 @@ THE SOFTWARE. 
#error("This version of HIP requires a newer version of HCC."); #endif -#if __cplusplus +#if __HCC__ #include using namespace hc::short_vector; -#endif + //-- Signed // Define char vector types @@ -115,6 +115,151 @@ typedef hc::short_vector::double2 double2; typedef hc::short_vector::double3 double3; typedef hc::short_vector::double4 double4; +#else + +#define __hip_align(name, val, data) \ + __attribute__((aligned(val))) name \ + { data } + +struct __hip_align(char1, 1, signed char x;); +struct __hip_align(uchar1, 1, unsigned char x;); + +struct __hip_align(char2, 2, signed char x; signed char y;); +struct __hip_align(uchar2, 2, unsigned char x; unsigned char y;); + +struct char3 +{ + signed char x, y, z; +}; + +struct uchar3 +{ + unsigned char x, y, z; +}; + +struct __hip_align(char4, 4, signed char x; signed char y; signed char z; signed char w;); +struct __hip_align(uchar4, 4, unsigned char x; unsigned char y; unsigned char z; unsigned char w;); + +struct __hip_align(short1, 2, signed short x;); +struct __hip_align(ushort1, 2, unsigned short x;); + +struct __hip_align(short2, 4, signed short x; signed short y;); +struct __hip_align(ushort2, 4, unsigned short x; unsigned short y;); + +struct short3 +{ + signed short x, y, z; +}; + +struct ushort3 +{ + unsigned short x, y, z; +}; + +struct __hip_align(short4, 8, signed short x; signed short y; signed short z; signed short w;); +struct __hip_align(ushort4, 8, unsigned short x; unsigned short y; unsigned short z; unsigned short w;); + +struct __hip_align(int1, 4, signed int x;); +struct __hip_align(uint1, 4, unsigned int x;); + +struct __hip_align(int2, 8, signed int x; signed int y;); +struct __hip_align(uint2, 8, unsigned int x; unsigned int y;); + +struct int3{ + signed int x, y, z; +}; +struct uint3{ + unsigned int x, y, z; +}; + +struct __hip_align(int4, 16, signed int x; signed int y; signed int z; signed int w;); +struct __hip_align(uint4, 16, unsigned int x; unsigned int y; unsigned int z; unsigned int w;); 
+ +struct __hip_align(long1, 8, long int x;); +struct __hip_align(ulong1, 8, unsigned long x;); + +struct __hip_align(long2, 16, long int x; long int y;); +struct __hip_align(ulong2, 16, unsigned long x; unsigned long y;); + +struct long3{ + long int x, y, z; +}; +struct ulong3{ + unsigned long x, y, z; +}; + +struct __hip_align(long4, 32, long int x; long int y; long int z; long int w;); +struct __hip_align(ulong4, 32, unsigned long x; unsigned long y; unsigned long z; unsigned long w;); + +struct float1 +{ + float x; +}; + +struct __hip_align(float2, 8, float x; float y;); + +struct float3 +{ + float x, y, z; +}; + +struct __hip_align(float4, 16, float x; float y; float z; float w;); + +struct __hip_align(longlong1, 16, long long int x;); +struct __hip_align(ulonglong1, 16, unsigned long long int x;); + +struct __attribute__((aligned(32))) longlong2 +{ + long long int x, y; +}; + +struct __attribute__((aligned(32))) ulonglong2 +{ + unsigned long long int x, y; +}; + +struct longlong3 +{ + long long int x, y, z; +}; + +struct ulonglong3 +{ + unsigned long long int x, y, z; +}; + +struct __attribute__((aligned(64))) longlong4 +{ + long long int x, y, z, w; +}; + +struct __attribute__((aligned(64))) ulonglong4 +{ + unsigned long long int x, y, z, w; +}; + +struct double1 +{ + double x; +}; + +struct __attribute__((aligned(16))) double2 +{ + double x, y; +}; + +struct double3 +{ + double x, y, z; +}; + +struct __attribute__((aligned(32))) double4 +{ + double x, y, z, w; +}; + +#endif + #if __HCC__ #include"hip/hcc_detail/host_defines.h" #define __HIP_DEVICE__ __device__ __host__ From dcfe5ce2eaea6aa135b38582f504717e1df36c5d Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 30 Sep 2016 12:49:11 +0530 Subject: [PATCH 65/66] HIT: maintain source hierarchy for generated test executables Change-Id: I997650d10cf38f35edb6b88b130a62c3541a850c --- tests/hip_tests.txt | 2 +- tests/hit/HIT.cmake | 38 +++++++++++++++++++++++--------------- 2 files changed, 24 
insertions(+), 16 deletions(-) diff --git a/tests/hip_tests.txt b/tests/hip_tests.txt index 5c7e543ce5..f3ea49a0f9 100644 --- a/tests/hip_tests.txt +++ b/tests/hip_tests.txt @@ -10,4 +10,4 @@ include(${HIP_SRC_PATH}/tests/hit/HIT.cmake) # Add tests include_directories(${HIP_SRC_PATH}/tests/src) -hit_add_directory_recursive(${HIP_SRC_PATH}/tests/src) +hit_add_directory_recursive(${HIP_SRC_PATH}/tests/src "directed_tests") diff --git a/tests/hit/HIT.cmake b/tests/hit/HIT.cmake index 82f0c5eabf..206d63c77f 100644 --- a/tests/hit/HIT.cmake +++ b/tests/hit/HIT.cmake @@ -118,20 +118,19 @@ endmacro() # Helper macro to create a test macro(MAKE_TEST exe) string(REPLACE " " "" smush_args ${ARGN}) - set(testname ${PROJECT_NAME}/${exe}${smush_args}.tst) + set(testname ${exe}${smush_args}.tst) add_test(NAME ${testname} COMMAND ${PROJECT_BINARY_DIR}/${exe} ${ARGN}) set_tests_properties(${testname} PROPERTIES PASS_REGULAR_EXPRESSION "PASSED") endmacro() -macro(MAKE_NAMED_TEST exe _testname) - set(testname ${PROJECT_NAME}/${_testname}.tst) +macro(MAKE_NAMED_TEST exe testname) add_test(NAME ${testname} COMMAND ${PROJECT_BINARY_DIR}/${exe} ${ARGN}) set_tests_properties(${testname} PROPERTIES PASS_REGULAR_EXPRESSION "PASSED") endmacro() #------------------------------------------------------------------------------- # Macro: HIT_ADD_FILES used to scan+add multiple files for testing. -macro(HIT_ADD_FILES _dir) +macro(HIT_ADD_FILES _dir _label) foreach (file ${ARGN}) # Build tests execute_process(COMMAND ${HIP_SRC_PATH}/tests/hit/parser --buildCMDs ${file} @@ -143,11 +142,13 @@ macro(HIT_ADD_FILES _dir) foreach(_cmd ${_contents}) string(REGEX REPLACE " " ";" _cmd "${_cmd}") parse_build_command(_target _sources _hipcc_options _hcc_options _nvcc_options _exclude_platforms ${_dir} ${_cmd}) - insert_into_map("_exclude" "${_target}" "${_exclude_platforms}") + string(REGEX REPLACE "/" "." 
target ${_label}/${_target}) + insert_into_map("_exclude" "${target}" "${_exclude_platforms}") if(_exclude_platforms STREQUAL "all" OR _exclude_platforms STREQUAL ${HIP_PLATFORM}) else() set_source_files_properties(${_sources} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - hip_add_executable(${_target} ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) + hip_add_executable(${target} ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) + set_target_properties(${target} PROPERTIES OUTPUT_NAME ${_target} RUNTIME_OUTPUT_DIRECTORY ${_label}) endif() endforeach() @@ -161,11 +162,12 @@ macro(HIT_ADD_FILES _dir) foreach(_cmd ${_contents}) string(REGEX REPLACE " " ";" _cmd "${_cmd}") parse_run_command(_target _arguments _exclude_platforms ${_cmd}) - read_from_map("_exclude" "${_target}" _exclude_platforms_from_build) + string(REGEX REPLACE "/" "." target ${_label}/${_target}) + read_from_map("_exclude" "${target}" _exclude_platforms_from_build) if(_exclude_platforms STREQUAL "all" OR _exclude_platforms STREQUAL ${HIP_PLATFORM} OR _exclude_platforms_from_build STREQUAL "all" OR _exclude_platforms_from_build STREQUAL ${HIP_PLATFORM}) else() - make_test(${_target} ${_arguments}) + make_test(${_label}/${_target} ${_arguments}) endif() endforeach() @@ -179,33 +181,39 @@ macro(HIT_ADD_FILES _dir) foreach(_cmd ${_contents}) string(REGEX REPLACE " " ";" _cmd "${_cmd}") parse_run_named_command(_target _testname _arguments _exclude_platforms ${_cmd}) - read_from_map("_exclude" "${_target}" _exclude_platforms_from_build) + string(REGEX REPLACE "/" "." 
target ${_label}/${_target}) + read_from_map("_exclude" "${target}" _exclude_platforms_from_build) if(_exclude_platforms STREQUAL "all" OR _exclude_platforms STREQUAL ${HIP_PLATFORM} OR _exclude_platforms_from_build STREQUAL "all" OR _exclude_platforms_from_build STREQUAL ${HIP_PLATFORM}) else() - make_named_test(${_target} ${_testname} ${_arguments}) + make_named_test(${_label}/${_target} ${_label}/${_testname}.tst ${_arguments}) endif() endforeach() endforeach() endmacro() # Macro: HIT_ADD_DIRECTORY to scan+add all files in a directory for testing -macro(HIT_ADD_DIRECTORY _dir) +macro(HIT_ADD_DIRECTORY _dir _label) + execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory ${_label} WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) file(GLOB files "${_dir}/*.c*") - hit_add_files(${_dir} ${files}) + hit_add_files(${_dir} ${_label} ${files}) endmacro() # Macro: HIT_ADD_DIRECTORY_RECURSIVE to scan+add all files in a directory+subdirectories for testing -macro(HIT_ADD_DIRECTORY_RECURSIVE _dir) +macro(HIT_ADD_DIRECTORY_RECURSIVE _dir _label) + execute_process(COMMAND ${CMAKE_COMMAND} -E make_directory ${_label} WORKING_DIRECTORY ${PROJECT_BINARY_DIR}) file(GLOB children RELATIVE ${_dir} ${_dir}/*) set(dirlist "") foreach(child ${children}) if(IS_DIRECTORY ${_dir}/${child}) - hit_add_directory_recursive(${_dir}/${child}) + list(APPEND dirlist ${child}) else() - hit_add_files(${_dir} ${child}) + hit_add_files(${_dir} ${_label} ${child}) endif() endforeach() + foreach(child ${dirlist}) + hit_add_directory_recursive(${_dir}/${child} ${_label}/${child}) + endforeach() endmacro() # vim: ts=4:sw=4:expandtab:smartindent From 4f6112730b7796e7416b15079adaea343b2dd3c0 Mon Sep 17 00:00:00 2001 From: sandeep kumar Date: Wed, 7 Sep 2016 17:16:12 +0530 Subject: [PATCH 66/66] Add 2_Cookbook Change-Id: I10bbbd4bcb80a5900fe6af466c8f4c94ea5efe9a --- samples/2_Cookbook/0_MatrixTranspose/Makefile | 36 ++++ .../0_MatrixTranspose/MatrixTranspose.cpp | 137 ++++++++++++++ 
.../2_Cookbook/0_MatrixTranspose/Readme.md | 100 ++++++++++ samples/2_Cookbook/1_hipEvent/Makefile | 36 ++++ samples/2_Cookbook/1_hipEvent/Readme.md | 74 ++++++++ samples/2_Cookbook/1_hipEvent/hipEvent.cpp | 174 ++++++++++++++++++ samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile | 36 ++++ .../2_HIP_ATP_MARKER/MatrixTranspose.cpp | 174 ++++++++++++++++++ samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md | 51 +++++ samples/2_Cookbook/3_shared_memory/Makefile | 36 ++++ .../3_shared_memory/sharedMemory.cpp | 144 +++++++++++++++ samples/2_Cookbook/4_shfl/Makefile | 36 ++++ samples/2_Cookbook/4_shfl/shfl.cpp | 143 ++++++++++++++ samples/2_Cookbook/5_2dshfl/2dshfl.cpp | 139 ++++++++++++++ samples/2_Cookbook/5_2dshfl/Makefile | 36 ++++ 15 files changed, 1352 insertions(+) create mode 100644 samples/2_Cookbook/0_MatrixTranspose/Makefile create mode 100644 samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp create mode 100644 samples/2_Cookbook/0_MatrixTranspose/Readme.md create mode 100644 samples/2_Cookbook/1_hipEvent/Makefile create mode 100644 samples/2_Cookbook/1_hipEvent/Readme.md create mode 100644 samples/2_Cookbook/1_hipEvent/hipEvent.cpp create mode 100644 samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile create mode 100644 samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp create mode 100644 samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md create mode 100644 samples/2_Cookbook/3_shared_memory/Makefile create mode 100644 samples/2_Cookbook/3_shared_memory/sharedMemory.cpp create mode 100644 samples/2_Cookbook/4_shfl/Makefile create mode 100644 samples/2_Cookbook/4_shfl/shfl.cpp create mode 100644 samples/2_Cookbook/5_2dshfl/2dshfl.cpp create mode 100644 samples/2_Cookbook/5_2dshfl/Makefile diff --git a/samples/2_Cookbook/0_MatrixTranspose/Makefile b/samples/2_Cookbook/0_MatrixTranspose/Makefile new file mode 100644 index 0000000000..ffb442e443 --- /dev/null +++ b/samples/2_Cookbook/0_MatrixTranspose/Makefile @@ -0,0 +1,36 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq 
(,$(HIP_PATH)) + HIP_PATH=../../.. +endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = MatrixTranspose.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./exe + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o + diff --git a/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp b/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp new file mode 100644 index 0000000000..c43785f5c9 --- /dev/null +++ b/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp @@ -0,0 +1,137 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#include + +// hip header file +#include "hip_runtime.h" + + +#define WIDTH 1024 +#define HEIGHT 1024 + +#define NUM (WIDTH*HEIGHT) + +#define THREADS_PER_BLOCK_X 16 +#define THREADS_PER_BLOCK_Y 16 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width, + const int height) +{ + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + + out[y * width + x] = in[x * height + y]; +} + +// CPU implementation of matrix transpose +void matrixTransposeCPUReference( + float * output, + float * input, + const unsigned int width, + const unsigned int height) +{ + for(unsigned int j=0; j < height; j++) + { + for(unsigned int i=0; i < width; i++) + { + output[i*height + j] = input[j*width + i]; + } + } +} + +int main() { + + float* Matrix; + float* TransposeMatrix; + float* cpuTransposeMatrix; + + float* gpuMatrix; + float* gpuTransposeMatrix; + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + + std::cout << "Device name " << devProp.name << std::endl; + + int i; + int errors; + + Matrix = (float*)malloc(NUM * sizeof(float)); + TransposeMatrix = (float*)malloc(NUM * sizeof(float)); + cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + Matrix[i] = (float)i*10.0f; + } + + // allocate the memory on the device side + hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); + hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); + + // Memory transfer from host to device + hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Lauching kernel from host + hipLaunchKernel(matrixTranspose, + dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, 0, + gpuTransposeMatrix , 
gpuMatrix, WIDTH ,HEIGHT); + + // Memory transfer from device to host + hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // CPU MatrixTranspose computation + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT); + + // verify the results + errors = 0; + double eps = 1.0E-6; + for (i = 0; i < NUM; i++) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) { + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + return errors; +} diff --git a/samples/2_Cookbook/0_MatrixTranspose/Readme.md b/samples/2_Cookbook/0_MatrixTranspose/Readme.md new file mode 100644 index 0000000000..b1c0b261b9 --- /dev/null +++ b/samples/2_Cookbook/0_MatrixTranspose/Readme.md @@ -0,0 +1,100 @@ +## Writing your first HIP program ### + +This tutorial shows how to write a simple HIP application. We will write the simplest Matrix Transpose program. + +## HIP Introduction: + +HIP is a C++ runtime API and kernel language that allows developers to create portable applications that can run on AMD and other GPUs. Our goal was to rise above the lowest-common-denominator paths and deliver a solution that allows you, the developer, to use essential hardware features and maximize your application’s performance on GPU hardware. + +## Requirement: +For hardware requirements and software installation, see [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) + +## Prerequisite knowledge: + +Programmers familiar with CUDA or OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You have chosen a good place to start. We'll be explaining everything assuming you are completely new to GPGPU programming. 
+ +## Simple Matrix Transpose + +Here is a simple example showing how to write your first program in HIP. +In order to use the HIP framework, we need to add the "hip_runtime.h" header file. Since it is a C++ API, you can add any header file you have been using earlier while writing your C/C++ programs. For GPGPU programming, we have the host (microprocessor) and the device (GPU). + +## Device-side code +We will work on the device-side code first. Here is a simple example showing a snippet of HIP device-side code: + +`__global__ void matrixTranspose(hipLaunchParm lp, ` +` float *out, ` +` float *in, ` +` const int width, ` +` const int height) ` +`{ ` +` int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; ` +` int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; ` +` ` +` out[y * width + x] = in[x * height + y]; ` +`} ` + +The `__global__` keyword is a function-type qualifier; it is used with functions that are executed on the device and are called/launched from the host. +Other function-type qualifiers are: +`__device__` functions are executed on the device and called from the device only +`__host__` functions are executed on the host and called from the host + +`__host__` can combine with `__device__`, in which case the function compiles for both the host and device. These functions cannot use the HIP grid coordinate functions (for example, "hipThreadIdx_x"; we will talk about it later). A possible workaround is to pass the necessary coordinate info as an argument to the function. +`__host__` cannot combine with `__global__`. + +`__global__` functions are often referred to as *kernels*, and calling one is termed *launching the kernel*. + +The next keyword is `void`. HIP `__global__` functions must have a `void` return type, and the first parameter to a HIP `__global__` function must have the type `hipLaunchParm`, which is for execution configuration. Global functions require the caller to specify an "execution configuration" that includes the grid and block dimensions. 
The execution configuration can also include other information for the launch, such as the amount of additional shared memory to allocate and the stream where the kernel should execute. + +After `hipLaunchParm`, the kernel arguments follow (i.e., `float *out, float *in, const int width, const int height`). + +The kernel function begins with +` int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;` +` int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;` +Here, hipThreadIdx_x, hipThreadIdx_y and hipThreadIdx_z (not used here) are the built-ins that identify a thread within its block, while hipBlockIdx_x, hipBlockIdx_y and hipBlockIdx_z (not used here) identify the block within the grid. hipBlockDim_x, hipBlockDim_y and hipBlockDim_z (not used here) give the dimensions of the block. + +The rest of the device-side code should look familiar. + +## Host-side code + +Now, we'll see how to call the kernel from the host. Inside the main() function, we first define the pointers (for both the host side and the device side). The declaration of a device pointer is similar to that of a host pointer. Next, we have `hipDeviceProp_t`, which is the pre-defined struct for HIP device properties. This is followed by `hipGetDeviceProperties(&devProp, 0)`, which is used to extract the device information. The first parameter is the struct, and the second parameter is the number of the device to get properties for. The next line prints the name of the device. + +We allocate memory for the Matrix on the host side using malloc and initialize it. To allocate memory on the device side we use `hipMalloc`, which is quite similar to malloc. After this, we copy the data to the allocated memory on the device side using `hipMemcpy`. +` hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);` +Here the first parameter is the destination pointer, the second is the source pointer, the third is the size of the memory copy in bytes, and the last specifies the direction of the memory copy (in this case, from host to device). 
To transfer memory from device to host, use `hipMemcpyDeviceToHost`, and for a device-to-device memory copy use `hipMemcpyDeviceToDevice`. + +Now, we'll see how to launch the kernel. +` hipLaunchKernel(matrixTranspose, ` +` dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), ` +` dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), ` +` 0, 0, ` +` gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); ` + +HIP introduces a standard C++ calling convention to pass the execution configuration to the kernel (this convention replaces the `CUDA <<< >>>` syntax). In HIP, +- Kernels launch with the `"hipLaunchKernel"` function +- The first five parameters to hipLaunchKernel are the following: + - **symbol kernelName**: the name of the kernel to launch. To support template kernels whose names contain ",", use the HIP_KERNEL_NAME macro. In the current application it's "matrixTranspose". + - **dim3 gridDim**: 3D-grid dimensions specifying the number of blocks to launch. In the MatrixTranspose sample, it's "dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y)". + - **dim3 blockDim**: 3D-block dimensions specifying the number of threads in each block. In the MatrixTranspose sample, it's "dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y)". + - **size_t dynamicShared**: amount of additional shared memory to allocate when launching the kernel. In the MatrixTranspose sample, it's '0'. + - **hipStream_t**: stream where the kernel should execute. A value of 0 corresponds to the NULL stream. In the MatrixTranspose sample, it's '0'. +- Kernel arguments follow these first five parameters. Here, these are "gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT". + +Next, we'll copy the computed values/data back to the host using `hipMemcpy`. Here the last parameter will be `hipMemcpyDeviceToHost`. + +After copying the data from the device to host memory, we will verify it against the result computed by the CPU reference function. 
+ +Finally, we will free the memory allocated earlier, using free() for host memory and `hipFree` for device memory. + +## How to build and run: +Use the make command and execute it using ./exe +Use hipcc to build the application; it uses hcc on AMD and nvcc on NVIDIA platforms. + +## More Info: +- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenCL) +- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/1_hipEvent/Makefile b/samples/2_Cookbook/1_hipEvent/Makefile new file mode 100644 index 0000000000..dc0f7db2e6 --- /dev/null +++ b/samples/2_Cookbook/1_hipEvent/Makefile @@ -0,0 +1,36 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. 
+endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = hipEvent.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./exe + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o + diff --git a/samples/2_Cookbook/1_hipEvent/Readme.md b/samples/2_Cookbook/1_hipEvent/Readme.md new file mode 100644 index 0000000000..16285120fa --- /dev/null +++ b/samples/2_Cookbook/1_hipEvent/Readme.md @@ -0,0 +1,74 @@ +## Using hipEvents to measure performance ### + +This tutorial is a follow-up to the previous one, where we learned how to write our first HIP program, which computes a Matrix Transpose. In this tutorial, we'll explain how to use hipEvent to measure memory transfer time and kernel execution time. + +## Introduction: + +Memory transfer and kernel execution are the most important parameters in parallel computing (especially HPC and machine learning). Memory bottlenecks are the main reason we are not able to get the highest performance; therefore, obtaining the memory transfer timing and kernel execution timing plays a key role in application optimization. + +## Requirement: +For hardware requirements and software installation, see [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) + +## Prerequisite knowledge: + +Programmers familiar with CUDA or OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You have chosen a good place to start. We'll be explaining everything assuming you are completely new to GPGPU programming. + +## Simple Matrix Transpose + +We will be using the Simple Matrix Transpose application from the previous tutorial and modify it to learn how to measure memory transfer time and kernel execution time. 
+ +## hipEvent_t + +We'll learn how to use the event management functionality of the HIP runtime API. In the same source code we used for MatrixTranspose, we will declare the following events: + +` hipEvent_t start, stop;` + +We'll create the events with the help of the following code: + +` hipEventCreate(&start);` +` hipEventCreate(&stop);` + +We'll use the "eventMs" variable to store the elapsed time: +` float eventMs = 1.0f;` + +## Measuring elapsed time using hipEvents: + +We'll start the timer by calling: +` hipEventRecord(start, NULL);` +Here, the first parameter is the hipEvent_t, which will mark the start of the interval to be measured, while the second parameter has to be of the type hipStream_t. In the current situation, we have passed NULL (the default stream). We will learn about `hipStream_t` in more detail later. + +Next comes the operation whose execution time we want to measure. For the case of memory transfer, we'll place the `hipMemcpy`: +` hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice);` + +and for kernel execution time we'll use `hipLaunchKernel`: +` hipLaunchKernel(matrixTranspose, ` +` dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), ` +` dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), ` +` 0, 0, ` +` gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); ` + +Now, to mark the end of the measured interval, we will again use hipEventRecord, this time passing the stop event: +` hipEventRecord(stop, NULL);` + +We will wait for the stop event to complete with the help of: +` hipEventSynchronize(stop);` + +To calculate the elapsed time between the points marked by the start and stop events, we'll use: +` hipEventElapsedTime(&eventMs, start, stop);` +Here the first parameter will store the elapsed time in milliseconds, the second parameter is the event marking the start, and the third is the event marking the end. 
+ +We can print the elapsed time directly, since eventMs is a float variable. + +## How to build and run: +Use the make command and execute it using ./exe +Use hipcc to build the application; it uses hcc on AMD and nvcc on NVIDIA platforms. + +## More Info: +- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenCL) +- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/1_hipEvent/hipEvent.cpp b/samples/2_Cookbook/1_hipEvent/hipEvent.cpp new file mode 100644 index 0000000000..b6bc4d1db1 --- /dev/null +++ b/samples/2_Cookbook/1_hipEvent/hipEvent.cpp @@ -0,0 +1,174 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#include + +// hip header file +#include "hip_runtime.h" + +#define WIDTH 1024 +#define HEIGHT 1024 + +#define NUM (WIDTH*HEIGHT) + +#define THREADS_PER_BLOCK_X 16 +#define THREADS_PER_BLOCK_Y 16 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width, + const int height) +{ + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + + out[y * width + x] = in[x * height + y]; +} + +// CPU implementation of matrix transpose +void matrixTransposeCPUReference( + float * output, + float * input, + const unsigned int width, + const unsigned int height) +{ + for(unsigned int j=0; j < height; j++) + { + for(unsigned int i=0; i < width; i++) + { + output[i*height + j] = input[j*width + i]; + } + } +} + +int main() { + + float* Matrix; + float* TransposeMatrix; + float* cpuTransposeMatrix; + + float* gpuMatrix; + float* gpuTransposeMatrix; + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + + std::cout << "Device name " << devProp.name << std::endl; + + hipEvent_t start, stop; + hipEventCreate(&start); + hipEventCreate(&stop); + float eventMs = 1.0f; + + int i; + int errors; + + Matrix = (float*)malloc(NUM * sizeof(float)); + TransposeMatrix = (float*)malloc(NUM * sizeof(float)); + cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + Matrix[i] = (float)i*10.0f; + } + + // allocate the memory on the device side + hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); + hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); + + // Record the start event + hipEventRecord(start, NULL); + + // Memory transfer from host to device + hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Record the stop event + hipEventRecord(stop, 
NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("hipMemcpyHostToDevice time taken = %6.3fms\n", eventMs); + + // Record the start event + hipEventRecord(start, NULL); + + // Lauching kernel from host + hipLaunchKernel(matrixTranspose, + dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, 0, + gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("kernel Execution time = %6.3fms\n", eventMs); + + // Record the start event + hipEventRecord(start, NULL); + + // Memory transfer from device to host + hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("hipMemcpyDeviceToHost time taken = %6.3fms\n", eventMs); + + // CPU MatrixTranspose computation + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT); + + // verify the results + errors = 0; + double eps = 1.0E-6; + for (i = 0; i < NUM; i++) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) { + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + return errors; +} diff --git a/samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile b/samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile new file mode 100644 index 0000000000..ffb442e443 --- /dev/null +++ b/samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile @@ -0,0 +1,36 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. 
+endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = MatrixTranspose.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./exe + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o + diff --git a/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp b/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp new file mode 100644 index 0000000000..b6bc4d1db1 --- /dev/null +++ b/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp @@ -0,0 +1,174 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#include + +// hip header file +#include "hip_runtime.h" + +#define WIDTH 1024 +#define HEIGHT 1024 + +#define NUM (WIDTH*HEIGHT) + +#define THREADS_PER_BLOCK_X 16 +#define THREADS_PER_BLOCK_Y 16 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width, + const int height) +{ + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + + out[y * width + x] = in[x * height + y]; +} + +// CPU implementation of matrix transpose +void matrixTransposeCPUReference( + float * output, + float * input, + const unsigned int width, + const unsigned int height) +{ + for(unsigned int j=0; j < height; j++) + { + for(unsigned int i=0; i < width; i++) + { + output[i*height + j] = input[j*width + i]; + } + } +} + +int main() { + + float* Matrix; + float* TransposeMatrix; + float* cpuTransposeMatrix; + + float* gpuMatrix; + float* gpuTransposeMatrix; + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + + std::cout << "Device name " << devProp.name << std::endl; + + hipEvent_t start, stop; + hipEventCreate(&start); + hipEventCreate(&stop); + float eventMs = 1.0f; + + int i; + int errors; + + Matrix = (float*)malloc(NUM * sizeof(float)); + TransposeMatrix = (float*)malloc(NUM * sizeof(float)); + cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + Matrix[i] = (float)i*10.0f; + } + + // allocate the memory on the device side + hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); + hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); + + // Record the start event + hipEventRecord(start, NULL); + + // Memory transfer from host to device + hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Record the stop event + hipEventRecord(stop, 
NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("hipMemcpyHostToDevice time taken = %6.3fms\n", eventMs); + + // Record the start event + hipEventRecord(start, NULL); + + // Lauching kernel from host + hipLaunchKernel(matrixTranspose, + dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, 0, + gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("kernel Execution time = %6.3fms\n", eventMs); + + // Record the start event + hipEventRecord(start, NULL); + + // Memory transfer from device to host + hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("hipMemcpyDeviceToHost time taken = %6.3fms\n", eventMs); + + // CPU MatrixTranspose computation + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT); + + // verify the results + errors = 0; + double eps = 1.0E-6; + for (i = 0; i < NUM; i++) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) { + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + return errors; +} diff --git a/samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md b/samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md new file mode 100644 index 0000000000..2bba31d349 --- /dev/null +++ b/samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md @@ -0,0 +1,51 @@ +## Using hipEvents to measure performance ### + +This tutorial is follow-up of the previous 
two tutorials, where we learned how to write our first HIP program, in which we computed a matrix transpose, and then, in the second one, added a feature to measure the time taken for memory transfer and kernel execution. In this tutorial, we won't make any changes to the source code. We'll explain how to use CodeXL/rocm-profiler for HIP timeline tracing. + + +## Introduction: + +CodeXL and rocm-profiler are tools used for profiling the application, which is of prominent use in optimizing the application by means of finding memory bottlenecks and so on. + +## Requirement: +[CodeXL Installation](http://gpuopen.com/compute-product/codexl/) + +## Prerequisite knowledge: + +Programmers familiar with CUDA or OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You chose to start with the best one. We'll be explaining everything assuming you are completely new to GPGPU programming. + +## Simple Matrix Transpose + +We will be using the Simple Matrix Transpose source code from the previous tutorial as it is. + +## Using CodeXL markers for HIP Functions + +HIP can generate markers at function begin/end which are displayed on the CodeXL timeline view. To do this, you need to install ROCm-Profiler and enable HIP to generate the markers: + +1. Install ROCm-Profiler: installing HIP from the ROCm pre-built packages installs the ROCm-Profiler as well. Alternatively, you can build ROCm-Profiler using the instructions given below. + +2. Build HIP with ATP markers enabled: HIP pre-built packages are enabled with ATP marker support by default. To enable ATP marker support when building HIP from source, use the option -DCOMPILE_HIP_ATP_MARKER=1 during the cmake configure step. + +3. Set HIP_ATP_MARKER +`export HIP_ATP_MARKER=1` + +4. Recompile the target application + +5. Run with profiler enabled to generate ATP file. 
+`/opt/rocm/bin/rocm-profiler -o <outputATPFileName> -A <applicationName> <applicationArguments>` + +## Using HIP_TRACE_API + +You can also print the HIP function strings to stderr using the HIP_TRACE_API environment variable. This can also be combined with the more detailed debug information provided by the HIP_DB switch. For example: +`HIP_TRACE_API=1 HIP_DB=0x2 ./myHipApp` +Note this trace mode uses colors. "less -r" can handle raw control characters and will display the debug output in proper colors. + +## More Info: +- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenCL) +- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/3_shared_memory/Makefile b/samples/2_Cookbook/3_shared_memory/Makefile new file mode 100644 index 0000000000..5e9ce47211 --- /dev/null +++ b/samples/2_Cookbook/3_shared_memory/Makefile @@ -0,0 +1,36 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. 
+endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = sharedMemory.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./exe + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o + diff --git a/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp b/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp new file mode 100644 index 0000000000..1106d454f2 --- /dev/null +++ b/samples/2_Cookbook/3_shared_memory/sharedMemory.cpp @@ -0,0 +1,144 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#include <iostream> + +// hip header file +#include "hip_runtime.h" + + +#define WIDTH 1024 +#define HEIGHT 1024 + +#define NUM (WIDTH*HEIGHT) + +#define THREADS_PER_BLOCK_X 16 +#define THREADS_PER_BLOCK_Y 16 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void. Each thread stages one element of `in` in shared memory and then writes it to its transposed position in `out`. +// hipLaunchParm provides the execution configuration; the block must be launched as THREADS_PER_BLOCK_X x THREADS_PER_BLOCK_Y threads so the shared tile has one slot per thread. +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width, + const int height) +{ + __shared__ float sharedMem[THREADS_PER_BLOCK_X*THREADS_PER_BLOCK_Y]; // one slot per thread of the block + + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int tile = hipThreadIdx_y * hipBlockDim_x + hipThreadIdx_x; // block-local index: the global offset y * width + x would overrun the tile + sharedMem[tile] = in[x * height + y]; + + __syncthreads(); + + out[y * width + x] = sharedMem[tile]; +} + +// CPU implementation of matrix transpose +void matrixTransposeCPUReference( + float * output, + float * input, + const unsigned int width, + const unsigned int height) +{ + for(unsigned int j=0; j < height; j++) + { + for(unsigned int i=0; i < width; i++) + { + output[i*height + j] = input[j*width + i]; + } + } +} + +int main() { + + float* Matrix; + float* TransposeMatrix; + float* cpuTransposeMatrix; + + float* gpuMatrix; + float* gpuTransposeMatrix; + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + + std::cout << "Device name " << devProp.name << std::endl; + + int i; + int errors; + + Matrix = (float*)malloc(NUM * sizeof(float)); + TransposeMatrix = (float*)malloc(NUM * sizeof(float)); + cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + Matrix[i] = (float)i*10.0f; + } + + // allocate the memory on the device side + hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); + hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); + + // Memory transfer from host to device + hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Launching kernel from host + hipLaunchKernel(matrixTranspose, + dim3(WIDTH/THREADS_PER_BLOCK_X, 
HEIGHT/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, 0, + gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); + + // Memory transfer from device to host + hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // CPU MatrixTranspose computation + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT); + + // verify the results + errors = 0; + double eps = 1.0E-6; + for (i = 0; i < NUM; i++) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) { + printf("%d cpu: %f gpu %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]); + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + return errors; +} diff --git a/samples/2_Cookbook/4_shfl/Makefile b/samples/2_Cookbook/4_shfl/Makefile new file mode 100644 index 0000000000..1d30c78749 --- /dev/null +++ b/samples/2_Cookbook/4_shfl/Makefile @@ -0,0 +1,36 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = shfl.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./exe + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o + diff --git a/samples/2_Cookbook/4_shfl/shfl.cpp b/samples/2_Cookbook/4_shfl/shfl.cpp new file mode 100644 index 0000000000..f43809b017 --- /dev/null +++ b/samples/2_Cookbook/4_shfl/shfl.cpp @@ -0,0 +1,143 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +#include + +// hip header file +#include "hip_runtime.h" + + +#define WIDTH 4 +#define HEIGHT 4 + +#define NUM (WIDTH*HEIGHT) + +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width, + const int height) +{ + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + //int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + float val = in[x]; + + for(int i=0;i 0 ) { + printf("%d cpu: %f gpu %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]); + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + return errors; +} diff --git a/samples/2_Cookbook/5_2dshfl/2dshfl.cpp b/samples/2_Cookbook/5_2dshfl/2dshfl.cpp new file mode 100644 index 0000000000..85bc3be2ae --- /dev/null +++ b/samples/2_Cookbook/5_2dshfl/2dshfl.cpp @@ -0,0 +1,139 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +#include + +// hip header file +#include "hip_runtime.h" + + +#define WIDTH 4 +#define HEIGHT 4 + +#define NUM (WIDTH*HEIGHT) + +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width, + const int height) +{ + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + float val = in[y*width + x]; + + out[x*height + y] = __shfl(val,y*width + x); +} + +// CPU implementation of matrix transpose +void matrixTransposeCPUReference( + float * output, + float * input, + const unsigned int width, + const unsigned int height) +{ + for(unsigned int j=0; j < height; j++) + { + for(unsigned int i=0; i < width; i++) + { + output[i*height + j] = input[j*width + i]; + } + } +} + +int main() { + + float* Matrix; + float* TransposeMatrix; + float* cpuTransposeMatrix; + + float* gpuMatrix; + float* gpuTransposeMatrix; + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + + std::cout << "Device name " << devProp.name << std::endl; + + int i; + int errors; + + Matrix = (float*)malloc(NUM * sizeof(float)); + TransposeMatrix = (float*)malloc(NUM * sizeof(float)); + cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + Matrix[i] = (float)i*10.0f; + } + 
+ // allocate the memory on the device side + hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); + hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); + + // Memory transfer from host to device + hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Lauching kernel from host + hipLaunchKernel(matrixTranspose, + dim3(1), + dim3(THREADS_PER_BLOCK_X , THREADS_PER_BLOCK_Y), + 0, 0, + gpuTransposeMatrix , gpuMatrix, WIDTH ,HEIGHT); + + // Memory transfer from device to host + hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // CPU MatrixTranspose computation + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH, HEIGHT); + + // verify the results + errors = 0; + double eps = 1.0E-6; + for (i = 0; i < NUM; i++) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > 0 ) { + printf("%d cpu: %f gpu %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]); + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + return errors; +} diff --git a/samples/2_Cookbook/5_2dshfl/Makefile b/samples/2_Cookbook/5_2dshfl/Makefile new file mode 100644 index 0000000000..502d2948b0 --- /dev/null +++ b/samples/2_Cookbook/5_2dshfl/Makefile @@ -0,0 +1,36 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = 2dshfl.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./exe + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o +