From f1a4303b0437647b9970584fc6365c2739874fa8 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 25 Oct 2016 12:26:54 +0530 Subject: [PATCH 01/65] hipcc: Turn back linking hip_ir.ll by default Change-Id: I9cec5a1a5a4791eddf6ef3e3332143ffc3ceadb6 --- bin/hipcc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipcc b/bin/hipcc index 21453634d6..2f3cd46c62 100755 --- a/bin/hipcc +++ b/bin/hipcc @@ -211,7 +211,7 @@ if($HIP_PLATFORM eq "hcc"){ } } -if(($HIP_PLATFORM eq "hcc") and defined $ENV{HIP_EXPERIMENTAL}){ +if($HIP_PLATFORM eq "hcc"){ $EXPORT_LL=" "; $ENV{HCC_EXTRA_LIBRARIES}="$HIP_PATH/lib/hip_ir.ll\n"; } From 528b25700431880c116dd0f7b0af71a2b7855784 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 25 Oct 2016 12:26:54 +0530 Subject: [PATCH 02/65] hipcc: Turn back linking hip_ir.ll by default Change-Id: I9cec5a1a5a4791eddf6ef3e3332143ffc3ceadb6 --- bin/hipcc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipcc b/bin/hipcc index 21453634d6..2f3cd46c62 100755 --- a/bin/hipcc +++ b/bin/hipcc @@ -211,7 +211,7 @@ if($HIP_PLATFORM eq "hcc"){ } } -if(($HIP_PLATFORM eq "hcc") and defined $ENV{HIP_EXPERIMENTAL}){ +if($HIP_PLATFORM eq "hcc"){ $EXPORT_LL=" "; $ENV{HCC_EXTRA_LIBRARIES}="$HIP_PATH/lib/hip_ir.ll\n"; } From c2b6eee1522f3702730fd094bfea162a01101bb2 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 25 Oct 2016 15:42:24 +0530 Subject: [PATCH 03/65] hip_hcc package: install hip_ir.ll to lib folder Change-Id: Ieaa4ff83597c211f221b4c1b0b93e08ad7c92ea6 --- packaging/hip_hcc.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/packaging/hip_hcc.txt b/packaging/hip_hcc.txt index 5801554f7c..d7b0877b62 100644 --- a/packaging/hip_hcc.txt +++ b/packaging/hip_hcc.txt @@ -9,6 +9,7 @@ else() install(FILES @PROJECT_BINARY_DIR@/libhip_hcc.so DESTINATION lib) endif() install(FILES @PROJECT_BINARY_DIR@/.hipInfo DESTINATION lib) +install(FILES @hip_SOURCE_DIR@/src/hip_ir.ll DESTINATION lib) ############################# # Packaging steps From db7a46adf68738d92b3f124ceb4cd2c143cb3280 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 25 Oct 2016 15:42:24 +0530 Subject: [PATCH 04/65] hip_hcc package: install hip_ir.ll to lib folder Change-Id: Ieaa4ff83597c211f221b4c1b0b93e08ad7c92ea6 --- packaging/hip_hcc.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/packaging/hip_hcc.txt b/packaging/hip_hcc.txt index 5801554f7c..d7b0877b62 100644 --- a/packaging/hip_hcc.txt +++ b/packaging/hip_hcc.txt @@ -9,6 +9,7 @@ else() install(FILES @PROJECT_BINARY_DIR@/libhip_hcc.so DESTINATION lib) endif() install(FILES @PROJECT_BINARY_DIR@/.hipInfo DESTINATION lib) +install(FILES @hip_SOURCE_DIR@/src/hip_ir.ll DESTINATION lib) ############################# # Packaging steps From 820a914b985eecf495683a177a16d846812fffc9 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 25 Oct 2016 09:33:45 -0500 Subject: [PATCH 05/65] correct cachesize to output correct value Change-Id: I5db031591eb718b0c12e78a35e4b19349de9526d --- src/hip_hcc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 7fa25334bc..0ef9a45ca6 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -769,7 +769,7 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) uint32_t cache_size[4]; err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_CACHE_SIZE, cache_size); DeviceErrorCheck(err); - prop->l2CacheSize = cache_size[1]; + prop->l2CacheSize = cache_size[0]; /* Computemode for HSA Devices is always : cudaComputeModeDefault */ prop->computeMode = 0; From e1c1b4c009be6cd1af2f185fe83053937b162747 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 25 Oct 2016 11:03:35 -0500 Subject: [PATCH 06/65] reverted change for cache size query Change-Id: I44a1f43818cd287a2a3b6265f43d183f9bd5b71c --- src/hip_hcc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 0ef9a45ca6..7fa25334bc 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -769,7 +769,7 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) uint32_t cache_size[4]; err = hsa_agent_get_info(_hsaAgent, HSA_AGENT_INFO_CACHE_SIZE, cache_size); DeviceErrorCheck(err); - prop->l2CacheSize = cache_size[0]; + prop->l2CacheSize = cache_size[1]; /* Computemode for HSA Devices is always : cudaComputeModeDefault */ prop->computeMode = 0; From 8a7dcfce0b581f77be6c3e793d691192eb36dfe2 Mon Sep 17 00:00:00 2001 From: pensun Date: Tue, 25 Oct 2016 15:29:33 -0500 Subject: [PATCH 07/65] Remove extra semicolons and extra spaces in header on NV path Change-Id: Ib33aec2451a4e0b298d537dbb1b9df000405871b --- include/hip/nvcc_detail/hip_runtime_api.h | 58 +++++++++++------------ 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h index a632e57f97..769a66ea7b 100644 --- a/include/hip/nvcc_detail/hip_runtime_api.h +++ b/include/hip/nvcc_detail/hip_runtime_api.h @@ -106,7 +106,7 @@ switch(cuError) { case cudaErrorHostMemoryNotRegistered : return hipErrorHostMemoryNotRegistered ; case cudaErrorUnsupportedLimit : return hipErrorUnsupportedLimit ; default : return hipErrorUnknown; // Note - translated error. -}; +} } inline static hipError_t hipCUResultTohipError(CUresult cuError) { //TODO Populate further @@ -120,7 +120,7 @@ switch(cuError) { case CUDA_ERROR_INVALID_CONTEXT : return hipErrorInvalidContext ; case CUDA_ERROR_NOT_INITIALIZED : return hipErrorNotInitialized ; default : return hipErrorUnknown; // Note - translated error. -}; +} } // TODO match the error enum names of hip and cuda @@ -319,11 +319,11 @@ inline static hipError_t hipDeviceSynchronize() { } inline static const char* hipGetErrorString(hipError_t error){ - return cudaGetErrorString( hipErrorToCudaError(error) ); + return cudaGetErrorString(hipErrorToCudaError(error)); } inline static const char* hipGetErrorName(hipError_t error){ - return cudaGetErrorName( hipErrorToCudaError(error) ); + return cudaGetErrorName(hipErrorToCudaError(error)); } inline static hipError_t hipGetDeviceCount(int * count){ @@ -611,12 +611,12 @@ inline static hipError_t hipDeviceCanAccessPeer ( int* canAccessPeer, int devic inline static hipError_t hipDeviceDisablePeerAccess ( int peerDevice ) { - return hipCUDAErrorTohipError(cudaDeviceDisablePeerAccess ( peerDevice )); -}; + return hipCUDAErrorTohipError(cudaDeviceDisablePeerAccess(peerDevice)); +} inline static hipError_t hipDeviceEnablePeerAccess ( int peerDevice, unsigned int flags ) { - return hipCUDAErrorTohipError(cudaDeviceEnablePeerAccess ( peerDevice, flags )); + return hipCUDAErrorTohipError(cudaDeviceEnablePeerAccess(peerDevice, flags)); } inline static hipError_t hipCtxDisablePeerAccess ( hipCtx_t peerCtx ) @@ -626,22 +626,22 @@ inline static hipError_t hipCtxDisablePeerAccess ( hipCtx_t peerCtx ) inline static hipError_t hipCtxEnablePeerAccess ( hipCtx_t peerCtx, unsigned int flags ) { - return hipCUResultTohipError(cuCtxEnablePeerAccess ( peerCtx, flags )); + return hipCUResultTohipError(cuCtxEnablePeerAccess(peerCtx, flags)); } inline static hipError_t hipMemcpyPeer ( void* dst, int dstDevice, const void* src, int srcDevice, size_t count ) { - return hipCUDAErrorTohipError(cudaMemcpyPeer ( dst, dstDevice, src, srcDevice, count )); -}; + return hipCUDAErrorTohipError(cudaMemcpyPeer(dst, dstDevice, src, srcDevice, count)); +} inline static hipError_t hipMemcpyPeerAsync ( void* dst, int dstDevice, const void* src, int srcDevice, size_t count, hipStream_t stream=0 ) { - return hipCUDAErrorTohipError(cudaMemcpyPeerAsync ( dst, dstDevice, src, srcDevice, count, stream )); -}; + return hipCUDAErrorTohipError(cudaMemcpyPeerAsync(dst, dstDevice, src, srcDevice, count, stream)); +} inline static hipError_t hipSetDeviceFlags (unsigned int flags) { - return hipCUDAErrorTohipError(cudaSetDeviceFlags( flags )); + return hipCUDAErrorTohipError(cudaSetDeviceFlags(flags)); } inline static hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned int flags) @@ -656,62 +656,62 @@ inline static hipError_t hipEventQuery(hipEvent_t event) inline static hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device) { - return hipCUResultTohipError(cuCtxCreate ( ctx,flags,device )); + return hipCUResultTohipError(cuCtxCreate(ctx,flags,device)); } inline static hipError_t hipCtxDestroy(hipCtx_t ctx) { - return hipCUResultTohipError(cuCtxDestroy ( ctx )); + return hipCUResultTohipError(cuCtxDestroy(ctx)); } inline static hipError_t hipCtxPopCurrent(hipCtx_t* ctx) { - return hipCUResultTohipError(cuCtxPopCurrent ( ctx )); + return hipCUResultTohipError(cuCtxPopCurrent(ctx)); } inline static hipError_t hipCtxPushCurrent(hipCtx_t ctx) { - return hipCUResultTohipError(cuCtxPushCurrent ( ctx )); + return hipCUResultTohipError(cuCtxPushCurrent(ctx)); } inline static hipError_t hipCtxSetCurrent(hipCtx_t ctx) { - return hipCUResultTohipError(cuCtxSetCurrent ( ctx )); + return hipCUResultTohipError(cuCtxSetCurrent(ctx)); } inline static hipError_t hipCtxGetCurrent(hipCtx_t* ctx) { - return hipCUResultTohipError(cuCtxGetCurrent ( ctx )); + return hipCUResultTohipError(cuCtxGetCurrent(ctx)); } inline static hipError_t hipCtxGetDevice(hipDevice_t *device) { - return hipCUResultTohipError(cuCtxGetDevice ( device )); + return hipCUResultTohipError(cuCtxGetDevice(device)); } inline static hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion) { - return hipCUResultTohipError(cuCtxGetApiVersion ( ctx,(unsigned int*)apiVersion )); + return hipCUResultTohipError(cuCtxGetApiVersion(ctx,(unsigned int*)apiVersion)); } -inline static hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig ) +inline static hipError_t hipCtxGetCacheConfig (hipFuncCache *cacheConfig) { - return hipCUResultTohipError(cuCtxGetCacheConfig ( cacheConfig )); + return hipCUResultTohipError(cuCtxGetCacheConfig(cacheConfig)); } -inline static hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig ) +inline static hipError_t hipCtxSetCacheConfig (hipFuncCache cacheConfig) { - return hipCUResultTohipError(cuCtxSetCacheConfig ( cacheConfig )); + return hipCUResultTohipError(cuCtxSetCacheConfig(cacheConfig)); } -inline static hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config ) +inline static hipError_t hipCtxSetSharedMemConfig (hipSharedMemConfig config) { - return hipCUResultTohipError(cuCtxSetSharedMemConfig ( config )); + return hipCUResultTohipError(cuCtxSetSharedMemConfig(config)); } inline static hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig ) { - return hipCUResultTohipError(cuCtxGetSharedMemConfig ( pConfig )); + return hipCUResultTohipError(cuCtxGetSharedMemConfig(pConfig)); } inline static hipError_t hipCtxSynchronize ( void ) @@ -721,7 +721,7 @@ inline static hipError_t hipCtxSynchronize ( void ) inline static hipError_t hipCtxGetFlags ( unsigned int* flags ) { - return hipCUResultTohipError(cuCtxGetFlags ( flags )); + return hipCUResultTohipError(cuCtxGetFlags(flags)); } inline static hipError_t hipCtxDetach(hipCtx_t ctx) From 2abf300797c40e2d7fc32bfafa285244e56f3bb0 Mon Sep 17 00:00:00 2001 From: pensun Date: Tue, 25 Oct 2016 15:29:33 -0500 Subject: [PATCH 08/65] Remove extra semicolons and extra spaces in header on NV path Change-Id: Ib33aec2451a4e0b298d537dbb1b9df000405871b --- include/hip/nvcc_detail/hip_runtime_api.h | 58 +++++++++++------------ 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h index a632e57f97..769a66ea7b 100644 --- a/include/hip/nvcc_detail/hip_runtime_api.h +++ b/include/hip/nvcc_detail/hip_runtime_api.h @@ -106,7 +106,7 @@ switch(cuError) { case cudaErrorHostMemoryNotRegistered : return hipErrorHostMemoryNotRegistered ; case cudaErrorUnsupportedLimit : return hipErrorUnsupportedLimit ; default : return hipErrorUnknown; // Note - translated error. -}; +} } inline static hipError_t hipCUResultTohipError(CUresult cuError) { //TODO Populate further @@ -120,7 +120,7 @@ switch(cuError) { case CUDA_ERROR_INVALID_CONTEXT : return hipErrorInvalidContext ; case CUDA_ERROR_NOT_INITIALIZED : return hipErrorNotInitialized ; default : return hipErrorUnknown; // Note - translated error. -}; +} } // TODO match the error enum names of hip and cuda @@ -319,11 +319,11 @@ inline static hipError_t hipDeviceSynchronize() { } inline static const char* hipGetErrorString(hipError_t error){ - return cudaGetErrorString( hipErrorToCudaError(error) ); + return cudaGetErrorString(hipErrorToCudaError(error)); } inline static const char* hipGetErrorName(hipError_t error){ - return cudaGetErrorName( hipErrorToCudaError(error) ); + return cudaGetErrorName(hipErrorToCudaError(error)); } inline static hipError_t hipGetDeviceCount(int * count){ @@ -611,12 +611,12 @@ inline static hipError_t hipDeviceCanAccessPeer ( int* canAccessPeer, int devic inline static hipError_t hipDeviceDisablePeerAccess ( int peerDevice ) { - return hipCUDAErrorTohipError(cudaDeviceDisablePeerAccess ( peerDevice )); -}; + return hipCUDAErrorTohipError(cudaDeviceDisablePeerAccess(peerDevice)); +} inline static hipError_t hipDeviceEnablePeerAccess ( int peerDevice, unsigned int flags ) { - return hipCUDAErrorTohipError(cudaDeviceEnablePeerAccess ( peerDevice, flags )); + return hipCUDAErrorTohipError(cudaDeviceEnablePeerAccess(peerDevice, flags)); } inline static hipError_t hipCtxDisablePeerAccess ( hipCtx_t peerCtx ) @@ -626,22 +626,22 @@ inline static hipError_t hipCtxDisablePeerAccess ( hipCtx_t peerCtx ) inline static hipError_t hipCtxEnablePeerAccess ( hipCtx_t peerCtx, unsigned int flags ) { - return hipCUResultTohipError(cuCtxEnablePeerAccess ( peerCtx, flags )); + return hipCUResultTohipError(cuCtxEnablePeerAccess(peerCtx, flags)); } inline static hipError_t hipMemcpyPeer ( void* dst, int dstDevice, const void* src, int srcDevice, size_t count ) { - return hipCUDAErrorTohipError(cudaMemcpyPeer ( dst, dstDevice, src, srcDevice, count )); -}; + return hipCUDAErrorTohipError(cudaMemcpyPeer(dst, dstDevice, src, srcDevice, count)); +} inline static hipError_t hipMemcpyPeerAsync ( void* dst, int dstDevice, const void* src, int srcDevice, size_t count, hipStream_t stream=0 ) { - return hipCUDAErrorTohipError(cudaMemcpyPeerAsync ( dst, dstDevice, src, srcDevice, count, stream )); -}; + return hipCUDAErrorTohipError(cudaMemcpyPeerAsync(dst, dstDevice, src, srcDevice, count, stream)); +} inline static hipError_t hipSetDeviceFlags (unsigned int flags) { - return hipCUDAErrorTohipError(cudaSetDeviceFlags( flags )); + return hipCUDAErrorTohipError(cudaSetDeviceFlags(flags)); } inline static hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned int flags) @@ -656,62 +656,62 @@ inline static hipError_t hipEventQuery(hipEvent_t event) inline static hipError_t hipCtxCreate(hipCtx_t *ctx, unsigned int flags, hipDevice_t device) { - return hipCUResultTohipError(cuCtxCreate ( ctx,flags,device )); + return hipCUResultTohipError(cuCtxCreate(ctx,flags,device)); } inline static hipError_t hipCtxDestroy(hipCtx_t ctx) { - return hipCUResultTohipError(cuCtxDestroy ( ctx )); + return hipCUResultTohipError(cuCtxDestroy(ctx)); } inline static hipError_t hipCtxPopCurrent(hipCtx_t* ctx) { - return hipCUResultTohipError(cuCtxPopCurrent ( ctx )); + return hipCUResultTohipError(cuCtxPopCurrent(ctx)); } inline static hipError_t hipCtxPushCurrent(hipCtx_t ctx) { - return hipCUResultTohipError(cuCtxPushCurrent ( ctx )); + return hipCUResultTohipError(cuCtxPushCurrent(ctx)); } inline static hipError_t hipCtxSetCurrent(hipCtx_t ctx) { - return hipCUResultTohipError(cuCtxSetCurrent ( ctx )); + return hipCUResultTohipError(cuCtxSetCurrent(ctx)); } inline static hipError_t hipCtxGetCurrent(hipCtx_t* ctx) { - return hipCUResultTohipError(cuCtxGetCurrent ( ctx )); + return hipCUResultTohipError(cuCtxGetCurrent(ctx)); } inline static hipError_t hipCtxGetDevice(hipDevice_t *device) { - return hipCUResultTohipError(cuCtxGetDevice ( device )); + return hipCUResultTohipError(cuCtxGetDevice(device)); } inline static hipError_t hipCtxGetApiVersion (hipCtx_t ctx,int *apiVersion) { - return hipCUResultTohipError(cuCtxGetApiVersion ( ctx,(unsigned int*)apiVersion )); + return hipCUResultTohipError(cuCtxGetApiVersion(ctx,(unsigned int*)apiVersion)); } -inline static hipError_t hipCtxGetCacheConfig ( hipFuncCache *cacheConfig ) +inline static hipError_t hipCtxGetCacheConfig (hipFuncCache *cacheConfig) { - return hipCUResultTohipError(cuCtxGetCacheConfig ( cacheConfig )); + return hipCUResultTohipError(cuCtxGetCacheConfig(cacheConfig)); } -inline static hipError_t hipCtxSetCacheConfig ( hipFuncCache cacheConfig ) +inline static hipError_t hipCtxSetCacheConfig (hipFuncCache cacheConfig) { - return hipCUResultTohipError(cuCtxSetCacheConfig ( cacheConfig )); + return hipCUResultTohipError(cuCtxSetCacheConfig(cacheConfig)); } -inline static hipError_t hipCtxSetSharedMemConfig ( hipSharedMemConfig config ) +inline static hipError_t hipCtxSetSharedMemConfig (hipSharedMemConfig config) { - return hipCUResultTohipError(cuCtxSetSharedMemConfig ( config )); + return hipCUResultTohipError(cuCtxSetSharedMemConfig(config)); } inline static hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig ) { - return hipCUResultTohipError(cuCtxGetSharedMemConfig ( pConfig )); + return hipCUResultTohipError(cuCtxGetSharedMemConfig(pConfig)); } inline static hipError_t hipCtxSynchronize ( void ) @@ -721,7 +721,7 @@ inline static hipError_t hipCtxSynchronize ( void ) inline static hipError_t hipCtxGetFlags ( unsigned int* flags ) { - return hipCUResultTohipError(cuCtxGetFlags ( flags )); + return hipCUResultTohipError(cuCtxGetFlags(flags)); } inline static hipError_t hipCtxDetach(hipCtx_t ctx) From 334e9c6f8ea33588b068a736614b47b97e32ee02 Mon Sep 17 00:00:00 2001 From: pensun Date: Thu, 27 Oct 2016 13:34:14 -0500 Subject: [PATCH 09/65] Add missing hipStream typedef for NV path Change-Id: I915cd14a9ff32b55b0121062d7804a7fbbdc3341 --- include/hip/nvcc_detail/hip_runtime_api.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h index 769a66ea7b..1436008dd1 100644 --- a/include/hip/nvcc_detail/hip_runtime_api.h +++ b/include/hip/nvcc_detail/hip_runtime_api.h @@ -83,6 +83,10 @@ typedef CUmodule hipModule_t; typedef CUfunction hipFunction_t; typedef CUdeviceptr hipDeviceptr_t; +// Flags that can be used with hipStreamCreateWithFlags +#define hipStreamDefault 0x00 ///< Default stream creation flags +#define hipStreamNonBlocking 0x01 ///< Stream does not implicitly synchronize with null stream + //typedef cudaChannelFormatDesc hipChannelFormatDesc; #define hipChannelFormatDesc cudaChannelFormatDesc From bff88d0f636c646a46c0339af77b84459506b9b4 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 20 Oct 2016 10:06:17 -0500 Subject: [PATCH 10/65] Update docs and improve hipcc error message. Change-Id: I25636d06959d02cc46c8c476d3948e91ff83ea47 --- CONTRIBUTING.md | 5 +++++ bin/hipcc | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d535ccac39..f6ed47acef 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -124,6 +124,11 @@ Differences or limitations of HIP APIs as compared to CUDA APIs should be clearl - ihipLogStatus should only be called from top-level HIP APIs,and should be called to log and return the error code. The error code is used by the GetLastError and PeekLastError functions - if a HIP API simply returns, then the error will not be logged correctly. +- All HIP environment variables should begin with the keyword HIP_ + Environment variables should be long enough to describe their purpose but short enough so they can be remembered - perhaps 10-20 characters, with 3-4 parts separated by underscores. + To see the list of current environment variables, along with their values, set HIP_PRINT_ENV and run any hip applications on ROCM platform . + HIPCC or other tools may support additional environment variables which should follow the above convention. + #### Presubmit Testing: diff --git a/bin/hipcc b/bin/hipcc index 2f3cd46c62..d5e40b8961 100755 --- a/bin/hipcc +++ b/bin/hipcc @@ -353,7 +353,7 @@ if ($printHipVersion) { } if ($runCmd) { if ($HIP_PLATFORM eq "hcc" and exists($hipConfig{'HCC_VERSION'}) and $HCC_VERSION ne $hipConfig{'HCC_VERSION'}) { - print ("HIP ($HIP_PATH) was built using hcc $hipConfig{'HCC_VERSION'}, but you are using hcc $HCC_VERSION. Please rebuild HIP including cmake.\n") && die (); + print ("HIP ($HIP_PATH) was built using hcc $hipConfig{'HCC_VERSION'}, but you are using $HCC_HOME/hcc with version $HCC_VERSION from hipcc. Please rebuild HIP including cmake or update HCC_HOME variable.\n") && die (); } system ("$CMD") and die (); } From 346c519ace8c0b2d80b340d265d22415856158ac Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 24 Oct 2016 17:38:22 -0500 Subject: [PATCH 11/65] Improve HIP TID printing in debug mode. Map long thread-id to a short one that is printed with each message. Remove clunky stirng creation code for tid_tr. Print TID on every message. Change-Id: I780a91d8ce789cb4957789036b478bf5cde8c4e4 --- src/hip_hcc.cpp | 25 +++++++++++++++++++++++-- src/hip_hcc.h | 34 +++++++++++++++++----------------- 2 files changed, 40 insertions(+), 19 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 7fa25334bc..a93f49f5f7 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -84,6 +84,11 @@ std::vector g_hip_visible_devices; hsa_agent_t g_cpu_agent; unsigned g_numLogicalThreads; +std::atomic g_lastShortTid(1); + + +thread_local ShortTid tls_shortTid; + /* Implementation of malloc and free device functions. @@ -230,6 +235,22 @@ hipError_t ihipSynchronize(void) return (hipSuccess); } +//================================================================================================= +// ihipStream_t: +//================================================================================================= +ShortTid::ShortTid() +{ + _shortTid = g_lastShortTid.fetch_add(1); + + if (HIP_DB & (1<>%s\n", (localHipStatus == 0) ? API_COLOR:KRED, __func__, localHipStatus, ihipErrorString(localHipStatus), API_COLOR_END);\ + fprintf(stderr, " %ship-api:tid:%d %-30s ret=%2d (%s)>>%s\n", (localHipStatus == 0) ? API_COLOR:KRED, tls_shortTid.tid(), __func__, localHipStatus, ihipErrorString(localHipStatus), API_COLOR_END);\ }\ localHipStatus;\ }) @@ -216,8 +217,7 @@ static const DbName dbName [] = if (HIP_DB & (1<<(trace_level))) {\ char msgStr[1000];\ snprintf(msgStr, 2000, __VA_ARGS__);\ - COMPUTE_TID_STR\ - fprintf (stderr, " %ship-%s%s:%s%s", dbName[trace_level]._color, dbName[trace_level]._shortName, tid_ss.str().c_str(), msgStr, KNRM); \ + fprintf (stderr, " %ship-%s tid:%d:%s%s", dbName[trace_level]._color, dbName[trace_level]._shortName, tls_shortTid.tid(), msgStr, KNRM); \ }\ } #else From 354091f357220ca2e50dc3cb97553a651da9563c Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 24 Oct 2016 17:39:43 -0500 Subject: [PATCH 12/65] Don't call allow-access if allocating device's only peer is self. Change-Id: Iac58e6c3e460675833f10b1e8b2e393de223654d --- src/hip_memory.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 01f940a408..4f45370c5a 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -125,7 +125,8 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) hc::am_memtracker_update(*ptr, device->_deviceId, 0); { LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - if (crit->peerCnt()) { + // the peerCnt always stores self so make sure the trace actually + if (crit->peerCnt() > 1) { hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr); } } From 739bc37503d19f2006f351e9fb5a2a6b381f93ce Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 24 Oct 2016 20:24:12 -0500 Subject: [PATCH 13/65] Add per-thread API seqnum to debug Change-Id: Ib13733a3e84cd56bae13a32bae40f936c20b7543 --- src/hip_hcc.cpp | 13 +++++++------ src/hip_hcc.h | 13 +++++++++---- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index a93f49f5f7..29d83e43fb 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -87,7 +87,6 @@ unsigned g_numLogicalThreads; std::atomic g_lastShortTid(1); -thread_local ShortTid tls_shortTid; /* Implementation of malloc and free device functions. @@ -175,10 +174,12 @@ __device__ void* __hip_hc_free(void *ptr) // This is the implicit context used by all HIP commands. // It can be set by hipSetDevice or by the CTX manipulation commands: - thread_local hipError_t tls_lastHipError = hipSuccess; +thread_local ShortTid tls_shortTid; + + //================================================================================================= @@ -238,7 +239,8 @@ hipError_t ihipSynchronize(void) //================================================================================================= // ihipStream_t: //================================================================================================= -ShortTid::ShortTid() +ShortTid::ShortTid() : + _apiSeqNum(0) { _shortTid = g_lastShortTid.fetch_add(1); @@ -273,8 +275,6 @@ ihipStream_t::ihipStream_t(ihipCtx_t *ctx, hc::accelerator_view av, unsigned int }; - - tprintf(DB_SYNC, " streamCreate: stream=%p\n", this); }; @@ -1292,7 +1292,8 @@ void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, c { if (HIP_ATP_MARKER || (COMPILE_HIP_DB && HIP_TRACE_API)) { std::stringstream os; - os << "<grid_dim << " groupDim:" << lp->group_dim << " sharedMem:+" << lp->dynamic_group_mem_bytes diff --git a/src/hip_hcc.h b/src/hip_hcc.h index 19338642fe..668011a8c8 100644 --- a/src/hip_hcc.h +++ b/src/hip_hcc.h @@ -66,10 +66,15 @@ class ShortTid { ShortTid() ; - int tid() { return _shortTid; }; + int tid() const { return _shortTid; }; + uint64_t incApiSeqNum() { return ++_apiSeqNum; }; + uint64_t apiSeqNum() const { return _apiSeqNum; }; + private: + int _shortTid; - int _shortTid; + // monotonically increasing API sequence number for this threa. + uint64_t _apiSeqNum; }; //--- @@ -145,7 +150,7 @@ extern const char *API_COLOR_END; if (HIP_ATP_MARKER || (COMPILE_HIP_DB && HIP_TRACE_API)) {\ std::string s = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')';\ if (COMPILE_HIP_DB && HIP_TRACE_API) {\ - fprintf (stderr, "%s<>%s\n", (localHipStatus == 0) ? API_COLOR:KRED, tls_shortTid.tid(), __func__, localHipStatus, ihipErrorString(localHipStatus), API_COLOR_END);\ + fprintf(stderr, " %ship-api tid:%d.%lu %-30s ret=%2d (%s)>>%s\n", (localHipStatus == 0) ? API_COLOR:KRED, tls_shortTid.tid(),tls_shortTid.apiSeqNum(), __func__, localHipStatus, ihipErrorString(localHipStatus), API_COLOR_END);\ }\ localHipStatus;\ }) From 710be682ca04f309b055a8dd848917be9aee57b8 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 25 Oct 2016 16:20:37 -0500 Subject: [PATCH 14/65] Add HIP_PROFILE_START_API, HIP_PROFILE_STOP_API Refactor HIP_INIT_API to call recordApiTrace. Change-Id: Ieff4b5018236f59e49e1b9841474440a34f821df --- src/hip_hcc.cpp | 108 ++++++++++++++++++++++++++++++++++++++++++++++++ src/hip_hcc.h | 35 +++++++++++++--- 2 files changed, 138 insertions(+), 5 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 29d83e43fb..6ea07adb87 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -49,6 +49,7 @@ THE SOFTWARE. + //================================================================================================= //Global variables: //================================================================================================= @@ -63,11 +64,16 @@ int HIP_PRINT_ENV = 0; int HIP_TRACE_API= 0; std::string HIP_TRACE_API_COLOR("green"); int HIP_ATP_MARKER= 0; +std::string HIP_PROFILE_START_API; +std::string HIP_PROFILE_STOP_API; int HIP_DB= 0; int HIP_VISIBLE_DEVICES = 0; /* Contains a comma-separated sequence of GPU identifiers */ int HIP_NUM_KERNELS_INFLIGHT = 128; int HIP_WAIT_MODE = 0; + + + #define HIP_USE_PRODUCT_NAME 0 //#define DISABLE_COPY_EXT 1 @@ -86,6 +92,11 @@ unsigned g_numLogicalThreads; std::atomic g_lastShortTid(1); +// Indexed by short-tid: +// +std::vector g_profStartTriggers; +std::vector g_profStopTriggers; + /* @@ -185,6 +196,30 @@ thread_local ShortTid tls_shortTid; //================================================================================================= // Top-level "free" functions: //================================================================================================= +void recordApiTrace(const std::string &s) +{ + auto apiSeqNum = tls_shortTid.incApiSeqNum(); + auto tid = tls_shortTid.tid(); + + if ((tid < g_profStartTriggers.size()) && (apiSeqNum >= g_profStartTriggers[tid].nextTrigger())) { + printf ("info: resume profiling at %lu\n", apiSeqNum); + RESUME_PROFILING; + g_profStartTriggers.pop_back(); + }; + if ((tid < g_profStopTriggers.size()) && (apiSeqNum >= g_profStopTriggers[tid].nextTrigger())) { + printf ("info: stop profiling at %lu\n", apiSeqNum); + STOP_PROFILING; + g_profStopTriggers.pop_back(); + }; + + + if (COMPILE_HIP_DB && HIP_TRACE_API) { + fprintf (stderr, "%s< 0 @@ -1112,6 +1147,71 @@ void ihipReadEnv_S(std::string *var_ptr, const char *var_name1, const char *var_ #endif +static void tokenize(const std::string &s, char delim, std::vector *tokens) +{ + std::stringstream ss; + ss.str(s); + std::string item; + while (getline(ss, item, delim)) { + item.erase (std::remove (item.begin(), item.end(), ' '), item.end()); // remove whitespace. + tokens->push_back(item); + } +} + +static void trim(std::string *s) +{ + // trim whitespace from beginning and end: + const char *t = "\t\n\r\f\v"; + s->erase(0, s->find_first_not_of(t)); + s->erase(s->find_last_not_of(t)+1); +} + +static void ltrim(std::string *s) +{ + // trim whitespace from beginning + const char *t = "\t\n\r\f\v"; + s->erase(0, s->find_first_not_of(t)); +} + + +// TODO - change last arg to pointer. +void parseTrigger(std::string triggerString, std::vector &profTriggers ) +{ + std::vector tidApiTokens; + tokenize(std::string(triggerString), ',', &tidApiTokens); + for (auto t=tidApiTokens.begin(); t != tidApiTokens.end(); t++) { + std::vector oneToken; + //std::cout << "token=" << *t << "\n"; + tokenize(std::string(*t), '.', &oneToken); + int tid = 1; + uint64_t apiTrigger = 0; + if (oneToken.size() == 1) { + // the case with just apiNum + apiTrigger = std::strtoull(oneToken[0].c_str(), nullptr, 0); + } else if (oneToken.size() == 2) { + // the case with tid.apiNum + tid = std::strtoul(oneToken[0].c_str(), nullptr, 0); + apiTrigger = std::strtoull(oneToken[1].c_str(), nullptr, 0); + } else { + throw ihipException(hipErrorRuntimeOther); // TODO -> bad env var? + } + + if (tid > 10000) { + throw ihipException(hipErrorRuntimeOther); // TODO -> bad env var? + } else { + profTriggers.resize(tid+1); + //std::cout << "tid:" << tid << " add: " << apiTrigger << "\n"; + profTriggers[tid].add(apiTrigger); + } + } + + + for (int tid=1; tidlockclose_postKernelCommand(lp.av); } diff --git a/src/hip_hcc.h b/src/hip_hcc.h index 668011a8c8..9997552135 100644 --- a/src/hip_hcc.h +++ b/src/hip_hcc.h @@ -77,11 +77,34 @@ class ShortTid { uint64_t _apiSeqNum; }; +struct ProfTrigger { + + static const uint64_t MAX_TRIGGER = std::numeric_limits::max(); + + void print (int tid) { + std::cout << "Enabling tracing for "; + for (auto iter=_profTrigger.begin(); iter != _profTrigger.end(); iter++) { + std::cout << "tid:" << tid << "." << *iter << ","; + } + std::cout << "\n"; + }; + + uint64_t nextTrigger() { return _profTrigger.empty() ? MAX_TRIGGER : _profTrigger.back(); }; + void add(uint64_t trigger) { _profTrigger.push_back(trigger); }; + void sort() { std::sort (_profTrigger.begin(), _profTrigger.end(), std::greater()); }; +private: + std::vector _profTrigger; +}; + + + //--- //Extern tls extern thread_local hipError_t tls_lastHipError; extern thread_local ShortTid tls_shortTid; +extern std::vector g_profStartTriggers; +extern std::vector g_profStopTriggers; //--- //Forward defs: @@ -138,21 +161,23 @@ extern const char *API_COLOR_END; #if COMPILE_HIP_ATP_MARKER #include "CXLActivityLogger.h" #define SCOPED_MARKER(markerName,group,userString) amdtScopedMarker(markerName, group, userString) +#define RESUME_PROFILING amdtResumeProfiling(AMDT_ALL_PROFILING); +#define STOP_PROFILING amdtStopProfiling(AMDT_ALL_PROFILING); #else // Swallow scoped markers: #define SCOPED_MARKER(markerName,group,userString) +#define RESUME_PROFILING +#define STOP_PROFILING #endif +extern void recordApiTrace(const std::string &s); + #if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1) #define API_TRACE(...)\ {\ if (HIP_ATP_MARKER || (COMPILE_HIP_DB && HIP_TRACE_API)) {\ - std::string s = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')';\ - if (COMPILE_HIP_DB && HIP_TRACE_API) {\ - fprintf (stderr, "%s< Date: Wed, 26 Oct 2016 09:41:47 -0500 Subject: [PATCH 15/65] Allow HIP_DB to be number or string flags (ie HIP_DB=api+mem+sync) Add callbacks for processing env vars. Change-Id: I4ddf50e2da56b1dae43f50657bc693b07b23c03d --- src/hip_hcc.cpp | 96 +++++++++++++++++++++++++++++++++++++++++-------- src/hip_hcc.h | 2 +- 2 files changed, 82 insertions(+), 16 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 6ea07adb87..3a62f63d0a 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -1113,7 +1113,7 @@ void ihipReadEnv_S(std::string *var_ptr, const char *var_name1, const char *var_ } if (env) { - *var_ptr = env; + *static_cast(var_ptr) = env; } if (HIP_PRINT_ENV) { printf ("%-30s = %s : %s\n", var_name1, var_ptr->c_str(), description); @@ -1121,6 +1121,25 @@ void ihipReadEnv_S(std::string *var_ptr, const char *var_name1, const char *var_ } +void ihipReadEnv_Callback(void *var_ptr, const char *var_name1, const char *var_name2, const char *description, std::string (*setterCallback)(void * var_ptr, const char * env)) +{ + char * env = getenv(var_name1); + + // Check second name if first not defined, used to allow HIP_ or CUDA_ env vars. + if ((env == NULL) && strcmp(var_name2, "0")) { + env = getenv(var_name2); + } + + std::string var_string = "TBD"; + if (env) { + var_string = setterCallback(var_ptr, env); + } + if (HIP_PRINT_ENV) { + printf ("%-30s = %s : %s\n", var_name1, var_string.c_str(), description); + } +} + + #if defined (DEBUG) #define READ_ENV_I(_build, _ENV_VAR, _ENV_VAR2, _description) \ @@ -1131,6 +1150,10 @@ void ihipReadEnv_S(std::string *var_ptr, const char *var_name1, const char *var_ if ((_build == release) || (_build == debug) {\ ihipReadEnv_S(&_ENV_VAR, #_ENV_VAR, #_ENV_VAR2, _description);\ }; +#define READ_ENV_C(_build, _ENV_VAR, _ENV_VAR2, _description, _callback) \ + if ((_build == release) || (_build == debug) {\ + ihipReadEnv_Callback(&_ENV_VAR, #_ENV_VAR, #_ENV_VAR2, _description, _callback);\ + }; #else @@ -1143,6 +1166,10 @@ void ihipReadEnv_S(std::string *var_ptr, const char *var_name1, const char *var_ if (_build == release) {\ ihipReadEnv_S(&_ENV_VAR, #_ENV_VAR, #_ENV_VAR2, _description);\ }; +#define READ_ENV_C(_build, _ENV_VAR, _ENV_VAR2, _description, _callback) \ + if (_build == release) {\ + ihipReadEnv_Callback(&_ENV_VAR, #_ENV_VAR, #_ENV_VAR2, _description, _callback);\ + }; #endif @@ -1212,6 +1239,57 @@ void parseTrigger(std::string triggerString, std::vector &profTrigg } } +std::string HIP_DB_string(unsigned db) +{ + std::string dbStr; + bool first=true; + for (int i=0; i (var_ptr); + + std::string e(envVarString); + trim(&e); + if (!e.empty() && isdigit(e.c_str()[0])) { + long int v = strtol(envVarString, NULL, 0); + *var_ptr_int = (int) (v); + } else { + *var_ptr_int = 0; + std::vector tokens; + tokenize(e, '+', &tokens); + for (auto t=tokens.begin(); t!= tokens.end(); t++) { + for (int i=0; ic_str(), dbName[i]._shortName)) { + *var_ptr_int |= (1< Date: Wed, 26 Oct 2016 10:30:42 -0500 Subject: [PATCH 16/65] Rename HIP_ATP_MARKER and profiling vars HIP_PROFILE_API HIP_DB_START_API HIP_DB_STOP_API Change-Id: I6c4da67212ff8217e6356a2622d4c6278a188c34 --- docs/markdown/hip_faq.md | 38 +----- docs/markdown/hip_porting_guide.md | 2 +- .../Makefile | 0 .../MatrixTranspose.cpp | 0 .../Readme.md | 0 src/hip_hcc.cpp | 114 +++++++++--------- src/hip_hcc.h | 8 +- 7 files changed, 61 insertions(+), 101 deletions(-) rename samples/2_Cookbook/{2_HIP_ATP_MARKER => 2_CodeXL_ATP}/Makefile (100%) rename samples/2_Cookbook/{2_HIP_ATP_MARKER => 2_CodeXL_ATP}/MatrixTranspose.cpp (100%) rename samples/2_Cookbook/{2_HIP_ATP_MARKER => 2_CodeXL_ATP}/Readme.md (100%) diff --git a/docs/markdown/hip_faq.md b/docs/markdown/hip_faq.md index f2fa3346cc..70ad94ba43 100644 --- a/docs/markdown/hip_faq.md +++ b/docs/markdown/hip_faq.md @@ -229,43 +229,7 @@ If platform portability is important, use #ifdef __HIP_PLATFORM_HIPCC__ to guard ### How do I trace HIP application flow? -#### Using CodeXL markers for HIP Functions -HIP can generate markers at function being/end which are displayed on the CodeXL timeline view. -To do this, you need to install ROCm-Profiler and enable HIP to generate the markers: - -1. Install ROCm-Profiler -Installing HIP from the [rocm](http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/) pre-built packages, installs the ROCm-Profiler as well. -Alternatively, you can build ROCm-Profiler using the instructions [here](https://github.com/RadeonOpenCompute/ROCm-Profiler#building-the-rocm-profiler). - -2. Build HIP with ATP markers enabled -HIP pre-built packages are enabled with ATP marker support by default. -To enable ATP marker support when building HIP from source, use the option ```-DCOMPILE_HIP_ATP_MARKER=1``` during the cmake configure step. - -3. Set HIP_ATP_MARKER -```shell -export HIP_ATP_MARKER=1 -``` - -4. Recompile the target application - -5. Run with profiler enabled to generate ATP file. -```shell -# Use profile to generate timeline view: -/opt/rocm/bin/rocm-profiler -o -A - -Or -/opt/rocm/bin/rocm-profiler -e HIP_ATP_MARKER=1 -o -A -``` - -#### Using HIP_TRACE_API -You can also print the HIP function strings to stderr using HIP_TRACE_API environment variable. This can also be combined with the more detailed debug information provided -by the HIP_DB switch. For example: -```shell -# Trace to stderr showing being/end of each function (with arguments) + intermediate debug trace during the execution of each function. -HIP_TRACE_API=1 HIP_DB=0x2 ./myHipApp -``` - -Note this trace mode uses colors. "less -r" can handle raw control characters and will display the debug output in proper colors. +See the [HIP Profiling Guide](hip_porting_guide.md) for more information. ### What if HIP generates error of "symbol multiply defined!" only on AMD machine? Unlike CUDA, in HCC, for functions defined in the header files, the keyword of "__forceinline__" does not imply "static". diff --git a/docs/markdown/hip_porting_guide.md b/docs/markdown/hip_porting_guide.md index 621726ee5f..c530df5098 100644 --- a/docs/markdown/hip_porting_guide.md +++ b/docs/markdown/hip_porting_guide.md @@ -564,7 +564,7 @@ HIP_LAUNCH_BLOCKING = 0 : Make HIP APIs 'host-synchronous', so they HIP_DB = 0 : Print various debug info. Bitmask, see hip_hcc.cpp for more information. HIP_TRACE_API = 0 : Trace each HIP API call. Print function name and return code to stderr as program executes. HIP_TRACE_API_COLOR = green : Color to use for HIP_API. None/Red/Green/Yellow/Blue/Magenta/Cyan/White -HIP_ATP_MARKER = 0 : Add HIP function begin/end to ATP file generated with CodeXL +HIP_PROFILE_API = 0 : Add HIP function begin/end to ATP file generated with CodeXL HIP_VISIBLE_DEVICES = 0 : Only devices whose index is present in the secquence are visible to HIP applications and they are enumerated in the order of secquence HIP_NUM_KERNELS_INFLIGHT = 128 : Number of kernels per stream diff --git a/samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile b/samples/2_Cookbook/2_CodeXL_ATP/Makefile similarity index 100% rename from samples/2_Cookbook/2_HIP_ATP_MARKER/Makefile rename to samples/2_Cookbook/2_CodeXL_ATP/Makefile diff --git a/samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp b/samples/2_Cookbook/2_CodeXL_ATP/MatrixTranspose.cpp similarity index 100% rename from samples/2_Cookbook/2_HIP_ATP_MARKER/MatrixTranspose.cpp rename to samples/2_Cookbook/2_CodeXL_ATP/MatrixTranspose.cpp diff --git a/samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md b/samples/2_Cookbook/2_CodeXL_ATP/Readme.md similarity index 100% rename from samples/2_Cookbook/2_HIP_ATP_MARKER/Readme.md rename to samples/2_Cookbook/2_CodeXL_ATP/Readme.md diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 3a62f63d0a..5d96ceae4d 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -63,9 +63,9 @@ int HIP_LAUNCH_BLOCKING = 0; int HIP_PRINT_ENV = 0; int HIP_TRACE_API= 0; std::string HIP_TRACE_API_COLOR("green"); -int HIP_ATP_MARKER= 0; -std::string HIP_PROFILE_START_API; -std::string HIP_PROFILE_STOP_API; +int HIP_PROFILE_API= 0; +std::string HIP_DB_START_API; +std::string HIP_DB_STOP_API; int HIP_DB= 0; int HIP_VISIBLE_DEVICES = 0; /* Contains a comma-separated sequence of GPU identifiers */ int HIP_NUM_KERNELS_INFLIGHT = 128; @@ -94,8 +94,8 @@ std::atomic g_lastShortTid(1); // Indexed by short-tid: // -std::vector g_profStartTriggers; -std::vector g_profStopTriggers; +std::vector g_dbStartTriggers; +std::vector g_dbStopTriggers; @@ -201,15 +201,15 @@ void recordApiTrace(const std::string &s) auto apiSeqNum = tls_shortTid.incApiSeqNum(); auto tid = tls_shortTid.tid(); - if ((tid < g_profStartTriggers.size()) && (apiSeqNum >= g_profStartTriggers[tid].nextTrigger())) { + if ((tid < g_dbStartTriggers.size()) && (apiSeqNum >= g_dbStartTriggers[tid].nextTrigger())) { printf ("info: resume profiling at %lu\n", apiSeqNum); RESUME_PROFILING; - g_profStartTriggers.pop_back(); + g_dbStartTriggers.pop_back(); }; - if ((tid < g_profStopTriggers.size()) && (apiSeqNum >= g_profStopTriggers[tid].nextTrigger())) { + if ((tid < g_dbStopTriggers.size()) && (apiSeqNum >= g_dbStopTriggers[tid].nextTrigger())) { printf ("info: stop profiling at %lu\n", apiSeqNum); STOP_PROFILING; - g_profStopTriggers.pop_back(); + g_dbStopTriggers.pop_back(); }; @@ -1062,43 +1062,14 @@ void ihipReadEnv_I(int *var_ptr, const char *var_name1, const char *var_name2, c env = getenv(var_name2); } - // TODO: Refactor this code so it is a separate call rather than being part of ihipReadEnv_I, which should only read integers. - // Check if the environment variable is either HIP_VISIBLE_DEVICES or CUDA_LAUNCH_BLOCKING, which - // contains a sequence of comma-separated device IDs - if (!(strcmp(var_name1,"HIP_VISIBLE_DEVICES") && strcmp(var_name2, "CUDA_VISIBLE_DEVICES")) && env){ - // Parse the string stream of env and store the device ids to g_hip_visible_devices global variable - std::string str = env; - std::istringstream ss(str); - std::string device_id; - // Clean up the defult value - g_hip_visible_devices.clear(); - g_visible_device = true; - // Read the visible device numbers - while (std::getline(ss, device_id, ',')) { - if (atoi(device_id.c_str()) >= 0) { - g_hip_visible_devices.push_back(atoi(device_id.c_str())); - } else { // Any device number after invalid number will not present - break; - } - } - // Print out the number of ids - if (HIP_PRINT_ENV) { - printf ("%-30s = ", var_name1); - for(int i=0;i (var_ptr); @@ -1275,9 +1246,9 @@ std::string HIP_DB_callback(void *var_ptr, const char *envVarString) tokenize(e, '+', &tokens); for (auto t=tokens.begin(); t!= tokens.end(); t++) { for (int i=0; ic_str(), dbName[i]._shortName)) { + if (!strcmp(t->c_str(), dbName[i]._shortName)) { *var_ptr_int |= (1<= 0) { + g_hip_visible_devices.push_back(atoi(device_id.c_str())); + } else { // Any device number after invalid number will not present + break; + } + } + + std::string valueString; + // Print out the number of ids + for(int i=0;i g_profStartTriggers; -extern std::vector g_profStopTriggers; +extern std::vector g_dbStartTriggers; +extern std::vector g_dbStopTriggers; //--- //Forward defs: @@ -176,7 +176,7 @@ extern void recordApiTrace(const std::string &s); #if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1) #define API_TRACE(...)\ {\ - if (HIP_ATP_MARKER || (COMPILE_HIP_DB && HIP_TRACE_API)) {\ + if (HIP_PROFILE_API || (COMPILE_HIP_DB && HIP_TRACE_API)) {\ recordApiTrace(std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')');\ }\ } From ab1836544afd87bc56af059f5cb5f09cdc6c39a1 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 27 Oct 2016 20:31:50 -0500 Subject: [PATCH 17/65] Fix scoped marker so begin/end ATP timestamps correct Change-Id: Ic944d3fc00d7bc31b756c0e6c327b99eb489537e --- src/hip_hcc.cpp | 24 ++++++++++++++++-------- src/hip_hcc.h | 14 ++++++++++---- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 5d96ceae4d..1c52ceee58 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -196,7 +196,7 @@ thread_local ShortTid tls_shortTid; //================================================================================================= // Top-level "free" functions: //================================================================================================= -void recordApiTrace(const std::string &s) +void recordApiTrace(std::string *fullStr, const std::string &apiStr) { auto apiSeqNum = tls_shortTid.incApiSeqNum(); auto tid = tls_shortTid.tid(); @@ -212,11 +212,16 @@ void recordApiTrace(const std::string &s) g_dbStopTriggers.pop_back(); }; + fullStr->reserve(16 + apiStr.length()); + *fullStr = std::to_string(tid) + "."; + *fullStr += std::to_string(apiSeqNum); + *fullStr += " "; + *fullStr += apiStr; + if (COMPILE_HIP_DB && HIP_TRACE_API) { - fprintf (stderr, "%s<c_str(), API_COLOR_END); } - SCOPED_MARKER(s.c_str(), "HIP", NULL); } @@ -1332,15 +1337,15 @@ void ihipInit() // Some flags have both compile-time and runtime flags - generate a warning if user enables the runtime flag but the compile-time flag is disabled. if (HIP_DB && !COMPILE_HIP_DB) { - fprintf (stderr, "warning: env var HIP_DB=0x%x but COMPILE_HIP_DB=0. (perhaps enable COMPILE_HIP_DB in src code before compiling?)", HIP_DB); + fprintf (stderr, "warning: env var HIP_DB=0x%x but COMPILE_HIP_DB=0. (perhaps enable COMPILE_HIP_DB in src code before compiling?)\n", HIP_DB); } if (HIP_TRACE_API && !COMPILE_HIP_TRACE_API) { - fprintf (stderr, "warning: env var HIP_TRACE_API=0x%x but COMPILE_HIP_TRACE_API=0. (perhaps enable COMPILE_HIP_TRACE_API in src code before compiling?)", HIP_DB); + fprintf (stderr, "warning: env var HIP_TRACE_API=0x%x but COMPILE_HIP_TRACE_API=0. (perhaps enable COMPILE_HIP_TRACE_API in src code before compiling?)\n", HIP_DB); } if (HIP_PROFILE_API && !COMPILE_HIP_ATP_MARKER) { - fprintf (stderr, "warning: env var HIP_PROFILE_API=0x%x but COMPILE_HIP_ATP_MARKER=0. (perhaps enable COMPILE_HIP_ATP_MARKER in src code before compiling?)", HIP_PROFILE_API); + fprintf (stderr, "warning: env var HIP_PROFILE_API=0x%x but COMPILE_HIP_ATP_MARKER=0. (perhaps enable COMPILE_HIP_ATP_MARKER in src code before compiling?)\n", HIP_PROFILE_API); } if (HIP_DB) { @@ -1459,19 +1464,21 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream) void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, const hipStream_t stream) { if (HIP_PROFILE_API || (COMPILE_HIP_DB && HIP_TRACE_API)) { + std::stringstream os_pre; std::stringstream os; - os << "<grid_dim << " groupDim:" << lp->group_dim << " sharedMem:+" << lp->dynamic_group_mem_bytes << " " << *stream; + MARKER_BEGIN(os.str().c_str(), "HIP"); if (COMPILE_HIP_DB && HIP_TRACE_API) { std::cerr << API_COLOR << os.str() << API_COLOR_END << std::endl; } - SCOPED_MARKER(os.str().c_str(), "HIP", NULL); } } @@ -1572,6 +1579,7 @@ void ihipPostLaunchKernel(hipStream_t stream, grid_launch_parm &lp) tprintf(DB_SYNC, "ihipPostLaunchKernel, unlocking stream\n"); stream->lockclose_postKernelCommand(lp.av); + MARKER_END(); } diff --git a/src/hip_hcc.h b/src/hip_hcc.h index b18d5bb2f2..c415ad3e16 100644 --- a/src/hip_hcc.h +++ b/src/hip_hcc.h @@ -160,24 +160,29 @@ extern const char *API_COLOR_END; // through ptr-to-args (ie the pointers allocated by hipMalloc). #if COMPILE_HIP_ATP_MARKER #include "CXLActivityLogger.h" -#define SCOPED_MARKER(markerName,group,userString) amdtScopedMarker(markerName, group, userString) +#define MARKER_BEGIN(markerName,group) amdtBeginMarker(markerName, group, nullptr); +#define MARKER_END() amdtEndMarker(); #define RESUME_PROFILING amdtResumeProfiling(AMDT_ALL_PROFILING); #define STOP_PROFILING amdtStopProfiling(AMDT_ALL_PROFILING); #else // Swallow scoped markers: -#define SCOPED_MARKER(markerName,group,userString) +#define MARKER_BEGIN(markerName,group) +#define MARKER_END() #define RESUME_PROFILING #define STOP_PROFILING #endif -extern void recordApiTrace(const std::string &s); +extern void recordApiTrace(std::string *fullStr, const std::string &apiStr); #if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1) #define API_TRACE(...)\ {\ if (HIP_PROFILE_API || (COMPILE_HIP_DB && HIP_TRACE_API)) {\ - recordApiTrace(std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')');\ + std::string apiStr = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')';\ + std::string fullStr;\ + recordApiTrace(&fullStr, apiStr);\ + MARKER_BEGIN(fullStr.c_str(), "HIP");\ }\ } #else @@ -207,6 +212,7 @@ extern void recordApiTrace(const std::string &s); if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API) {\ fprintf(stderr, " %ship-api tid:%d.%lu %-30s ret=%2d (%s)>>%s\n", (localHipStatus == 0) ? API_COLOR:KRED, tls_shortTid.tid(),tls_shortTid.apiSeqNum(), __func__, localHipStatus, ihipErrorString(localHipStatus), API_COLOR_END);\ }\ + if (HIP_PROFILE_API) { MARKER_END(); }\ localHipStatus;\ }) From f95482c7c59a5f81788b10aaf39af1c246fc85c0 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 27 Oct 2016 20:37:46 -0500 Subject: [PATCH 18/65] Add new hipdemangleatp and snapshot sample update for new functionality Change-Id: Ie19c683b2b0bdfeb0c3fcf89444c2e21b7f606e7 --- bin/hipdemangleatp | 18 ++ samples/0_Intro/square/Makefile | 6 +- .../2_CodeXL_ATP/MatrixTranspose.cpp | 172 --------------- .../{2_CodeXL_ATP => 2_Profiler}/Makefile | 22 +- .../2_Cookbook/2_Profiler/MatrixTranspose.cpp | 195 ++++++++++++++++++ .../{2_CodeXL_ATP => 2_Profiler}/Readme.md | 14 +- 6 files changed, 236 insertions(+), 191 deletions(-) create mode 100755 bin/hipdemangleatp delete mode 100644 samples/2_Cookbook/2_CodeXL_ATP/MatrixTranspose.cpp rename samples/2_Cookbook/{2_CodeXL_ATP => 2_Profiler}/Makefile (52%) create mode 100644 samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp rename samples/2_Cookbook/{2_CodeXL_ATP => 2_Profiler}/Readme.md (83%) diff --git a/bin/hipdemangleatp b/bin/hipdemangleatp new file mode 100755 index 0000000000..b6734a9e24 --- /dev/null +++ b/bin/hipdemangleatp @@ -0,0 +1,18 @@ +#!/bin/bash + +# usage: hipdemangleatp.sh ATP_FILE + +# HIP kernels +kernels=$(grep grid_launch_parm $1 | cut -d" " -f1 | sort | uniq) +for mangled_sym in $kernels; do + real_sym=$(c++filt -p $(c++filt _$mangled_sym | cut -d: -f3 | sed 's/_functor//g')) + echo "$mangled_sym => $real_sym" >> $1.log + sed -i "s/$mangled_sym/$real_sym/g" $1 +done + +# HC kernels +#kernels=$(grep cxxamp_trampoline $1 | cut -d" " -f1 | sort | uniq) +#for mangled_sym in $kernels; do +# real_sym=$(c++filt $(c++filt $mangled_sym | cut -d: -f3 | sed 's/_functor//g') | cut -d\( -f1 | cut -d" " -f2) +# sed -i "s/$mangled_sym/$real_sym/g" $1 +#done diff --git a/samples/0_Intro/square/Makefile b/samples/0_Intro/square/Makefile index 89921c2072..1e8cdba080 100644 --- a/samples/0_Intro/square/Makefile +++ b/samples/0_Intro/square/Makefile @@ -1,7 +1,4 @@ HIP_PATH?= $(wildcard /opt/rocm/hip) -ifeq (,$(HIP_PATH)) - HIP_PATH=../../.. -endif HIPCC=$(HIP_PATH)/bin/hipcc all: square.hip.out @@ -11,9 +8,10 @@ square.cuda.out : square.cu #hipify square.cu > square.cpp # Then review & finish port in square.cpp +# square.hip.out: square.hipref.cpp - $(HIPCC) square.hipref.cpp -o $@ + $(HIPCC) $(CXXFLAGS) square.hipref.cpp -o $@ diff --git a/samples/2_Cookbook/2_CodeXL_ATP/MatrixTranspose.cpp b/samples/2_Cookbook/2_CodeXL_ATP/MatrixTranspose.cpp deleted file mode 100644 index f2aea146e4..0000000000 --- a/samples/2_Cookbook/2_CodeXL_ATP/MatrixTranspose.cpp +++ /dev/null @@ -1,172 +0,0 @@ -/* -Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include - -// hip header file -#include "hip/hip_runtime.h" - -#define WIDTH 1024 - -#define NUM (WIDTH*WIDTH) - -#define THREADS_PER_BLOCK_X 4 -#define THREADS_PER_BLOCK_Y 4 -#define THREADS_PER_BLOCK_Z 1 - -// Device (Kernel) function, it must be void -// hipLaunchParm provides the execution configuration -__global__ void matrixTranspose(hipLaunchParm lp, - float *out, - float *in, - const int width) -{ - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; - - out[y * width + x] = in[x * width + y]; -} - -// CPU implementation of matrix transpose -void matrixTransposeCPUReference( - float * output, - float * input, - const unsigned int width) -{ - for(unsigned int j=0; j < width; j++) - { - for(unsigned int i=0; i < width; i++) - { - output[i*width + j] = input[j*width + i]; - } - } -} - -int main() { - - float* Matrix; - float* TransposeMatrix; - float* cpuTransposeMatrix; - - float* gpuMatrix; - float* gpuTransposeMatrix; - - hipDeviceProp_t devProp; - hipGetDeviceProperties(&devProp, 0); - - std::cout << "Device name " << devProp.name << std::endl; - - hipEvent_t start, stop; - hipEventCreate(&start); - hipEventCreate(&stop); - float eventMs = 1.0f; - - int i; - int errors; - - Matrix = (float*)malloc(NUM * sizeof(float)); - TransposeMatrix = (float*)malloc(NUM * sizeof(float)); - cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); - - // initialize the input data - for (i = 0; i < NUM; i++) { - Matrix[i] = (float)i*10.0f; - } - - // allocate the memory on the device side - hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); - hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); - - // Record the start event - hipEventRecord(start, NULL); - - // Memory transfer from host to device - hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice); - - // Record the stop event - hipEventRecord(stop, NULL); - hipEventSynchronize(stop); - - hipEventElapsedTime(&eventMs, start, stop); - - printf ("hipMemcpyHostToDevice time taken = %6.3fms\n", eventMs); - - // Record the start event - hipEventRecord(start, NULL); - - // Lauching kernel from host - hipLaunchKernel(matrixTranspose, - dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y), - dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), - 0, 0, - gpuTransposeMatrix , gpuMatrix, WIDTH); - - // Record the stop event - hipEventRecord(stop, NULL); - hipEventSynchronize(stop); - - hipEventElapsedTime(&eventMs, start, stop); - - printf ("kernel Execution time = %6.3fms\n", eventMs); - - // Record the start event - hipEventRecord(start, NULL); - - // Memory transfer from device to host - hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); - - // Record the stop event - hipEventRecord(stop, NULL); - hipEventSynchronize(stop); - - hipEventElapsedTime(&eventMs, start, stop); - - printf ("hipMemcpyDeviceToHost time taken = %6.3fms\n", eventMs); - - // CPU MatrixTranspose computation - matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); - - // verify the results - errors = 0; - double eps = 1.0E-6; - for (i = 0; i < NUM; i++) { - if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) { - errors++; - } - } - if (errors!=0) { - printf("FAILED: %d errors\n",errors); - } else { - printf ("PASSED!\n"); - } - - //free the resources on device side - hipFree(gpuMatrix); - hipFree(gpuTransposeMatrix); - - //free the resources on host side - free(Matrix); - free(TransposeMatrix); - free(cpuTransposeMatrix); - - return errors; -} diff --git a/samples/2_Cookbook/2_CodeXL_ATP/Makefile b/samples/2_Cookbook/2_Profiler/Makefile similarity index 52% rename from samples/2_Cookbook/2_CodeXL_ATP/Makefile rename to samples/2_Cookbook/2_Profiler/Makefile index d3630a1c19..4b9a063f38 100644 --- a/samples/2_Cookbook/2_CodeXL_ATP/Makefile +++ b/samples/2_Cookbook/2_Profiler/Makefile @@ -1,10 +1,12 @@ HIP_PATH?= $(wildcard /opt/rocm/hip) -ifeq (,$(HIP_PATH)) - HIP_PATH=../../.. -endif HIPCC=$(HIP_PATH)/bin/hipcc + +HIPPROFILER=/opt/rocm/bin/rocm-profiler +PROFILER_OPT=-A -o MT.atp -e HIP_PROFILE_API=1 +HIPPROFILER_POST_CMD=$(HIP_PATH)/bin/hipdemangleatp MT.atp + TARGET=hcc SOURCES = MatrixTranspose.cpp @@ -15,9 +17,12 @@ EXECUTABLE=./MatrixTranspose .PHONY: test -all: $(EXECUTABLE) test +all: $(EXECUTABLE) profile + -CXXFLAGS =-g + +OPT =-g +CXXFLAGS =$(OPT) CXX=$(HIPCC) @@ -25,7 +30,12 @@ $(EXECUTABLE): $(OBJECTS) $(HIPCC) $(OBJECTS) -o $@ -test: $(EXECUTABLE) +profile: $(EXECUTABLE) + $(HIPPROFILER) $(PROFILER_OPT) $(EXECUTABLE) + $(HIPPROFILER_POST_CMD) + + +run: $(EXECUTABLE) $(EXECUTABLE) diff --git a/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp b/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp new file mode 100644 index 0000000000..7500957dfc --- /dev/null +++ b/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp @@ -0,0 +1,195 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include + +// hip header file +#include "hip/hip_runtime.h" +#include "hip/hip_profile.h" + +#define WIDTH 1024 + +#define NUM (WIDTH*WIDTH) + +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width) +{ + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + + out[y * width + x] = in[x * width + y]; +} + +// CPU implementation of matrix transpose +void matrixTransposeCPUReference( + float * output, + float * input, + const unsigned int width) +{ + for(unsigned int j=0; j < width; j++) + { + for(unsigned int i=0; i < width; i++) + { + output[i*width + j] = input[j*width + i]; + } + } +} + +int main() { + + //HIP_SCOPED_MARKER(__func__, "MainFunc"); + HIP_BEGIN_MARKER(__func__, "MainFunc"); + + float* Matrix; + float* TransposeMatrix; + float* cpuTransposeMatrix; + + float* gpuMatrix; + float* gpuTransposeMatrix; + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + + std::cout << "Device name " << devProp.name << std::endl; + + hipEvent_t start, stop; + float eventMs = 1.0f; + { + // Show example of how to create a "scoped marker". + // The scoped marker records the time spent inside the { scope } of the marker - the begin timestamp is at the + // beginning of the code scope, and the end is recorded when the SCOPE exits. This can be viewed in CodeXL + // timeline relative to other GPU and CPU events. + // This marker captures the time spent in setup including host allocation, initialization, and device memory allocation. + HIP_SCOPED_MARKER("Setup", "App"); + + hipEventCreate(&start); + hipEventCreate(&stop); + + + Matrix = (float*)malloc(NUM * sizeof(float)); + TransposeMatrix = (float*)malloc(NUM * sizeof(float)); + cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (int i = 0; i < NUM; i++) { + Matrix[i] = (float)i*10.0f; + } + + + + // allocate the memory on the device side + hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); + hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); + } + + { + HIP_SCOPED_MARKER("Loop", "App"); + + // Record the start event + hipEventRecord(start, NULL); + + // Memory transfer from host to device + hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("hipMemcpyHostToDevice time taken = %6.3fms\n", eventMs); + + // Record the start event + hipEventRecord(start, NULL); + + // Lauching kernel from host + hipLaunchKernel(matrixTranspose, + dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, 0, + gpuTransposeMatrix , gpuMatrix, WIDTH); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + hipEventElapsedTime(&eventMs, start, stop); + } + + int errors = 0; + { + HIP_SCOPED_MARKER("Teardown", "App"); + + + printf ("kernel Execution time = %6.3fms\n", eventMs); + + // Record the start event + hipEventRecord(start, NULL); + + // Memory transfer from device to host + hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("hipMemcpyDeviceToHost time taken = %6.3fms\n", eventMs); + + // CPU MatrixTranspose computation + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); + + // verify the results + double eps = 1.0E-6; + for (int i = 0; i < NUM; i++) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) { + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + } + + HIP_END_MARKER(); + + return errors; +} diff --git a/samples/2_Cookbook/2_CodeXL_ATP/Readme.md b/samples/2_Cookbook/2_Profiler/Readme.md similarity index 83% rename from samples/2_Cookbook/2_CodeXL_ATP/Readme.md rename to samples/2_Cookbook/2_Profiler/Readme.md index de1a800572..92a8be228e 100644 --- a/samples/2_Cookbook/2_CodeXL_ATP/Readme.md +++ b/samples/2_Cookbook/2_Profiler/Readme.md @@ -1,6 +1,6 @@ ## Using hipEvents to measure performance ### -This tutorial is follow-up of the previous two tutorial where we learn how to write our first hip program, in which we compute Matrix Transpose and in second one, we added feature to measure time taken for memory transfer and kernel execution. In this tutorial, we won't make amy changes to the source code. We'll explain how to use the codexl/rocm-profiler for hip timeline tracing. +This tutorial is follow-up of the previous two tutorial where we learn how to write our first hip program, in which we compute Matrix Transpose and in second one, we added feature to measure time taken for memory transfer and kernel execution. In this tutorial, we'll explain how to use the codexl/rocm-profiler for hip timeline tracing. Also, we will augment the source code with additional markers so we can see the high-level application flow alongside the information that CodeXL automatically collects. ## Introduction: @@ -24,15 +24,11 @@ HIP can generate markers at function being/end which are displayed on the CodeXL 1. Install ROCm-Profiler Installing HIP from the rocm pre-built packages, installs the ROCm-Profiler as well. Alternatively, you can build ROCm-Profiler using the instructions given below. -2. Build HIP with ATP markers enabled HIP pre-built packages are enabled with ATP marker support by default. To enable ATP marker support when building HIP from source, use the option -DCOMPILE_HIP_ATP_MARKER=1 during the cmake configure step. -3. Set HIP_ATP_MARKER -`export HIP_ATP_MARKER=1` - -4. Recompile the target application - -5. Run with profiler enabled to generate ATP file. -`/opt/rocm/bin/rocm-profiler -o -A ` +2. Run with profiler enabled to generate ATP file. +(These steps are also captured in the Makefile) +The HIP_PROFILE_API enables display of the HIP APIs on the CodeXL trimeline view. +`/opt/rocm/bin/rocm-profiler -o -A -e HIP_PROFILE_API=1 ` ##Using HIP_TRACE_API From e5ef8a2fd70a95a217da5e118fafbfce99bd2ea0 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 27 Oct 2016 20:38:32 -0500 Subject: [PATCH 19/65] HIPCC adds paths to profiler include and libs by default. Users who desire otherwise can set HIP_ATP_MARKER=0. Also remove old unused hipcc_explicit_lib option. Change-Id: I2bf07ba880329e7a3b1365dd33a3b2be6794370f --- bin/hipcc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/bin/hipcc b/bin/hipcc index d5e40b8961..5f82966f3c 100755 --- a/bin/hipcc +++ b/bin/hipcc @@ -79,7 +79,7 @@ if ($HIP_PLATFORM eq "hcc") { $ROCM_PATH=$ENV{'ROCM_PATH'} // "/opt/rocm"; - $HIP_ATP_MARKER=$ENV{'HIP_ATP_MARKER'}; + $HIP_ATP_MARKER=$ENV{'HIP_ATP_MARKER'} // 1; $marker_path = "$ROCM_PATH/profiler/CXLActivityLogger"; $ROCM_TARGET=$ENV{'ROCM_TARGET'} // "fiji"; @@ -273,10 +273,9 @@ foreach $arg (@ARGV) # Process HIPCC options here: if ($arg =~ m/^--hipcc/) { $swallowArg = 1; - if ($arg eq "--hipcc_explicit_lib") { - # Some environments (ie cmake tests) already link their own hip_hcc.o, so don't add here: - $needHipHcc = 0; - } + #if $arg eq "--hipcc_profile") { # Example argument here, hipcc + # + #} } else { push (@options, $arg); } From ef8eac9b66b79b202b7b82a1ec028a318a2d9c29 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 27 Oct 2016 21:25:56 -0500 Subject: [PATCH 20/65] Add two levels of HIP_PROFILE_API (1=short,2=long) Change-Id: I7ef98589f8731fb879db109fd573c62b489f2b61 --- src/hip_hcc.cpp | 11 +++++++++-- src/hip_hcc.h | 3 ++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 1c52ceee58..e0a979b1e9 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -1324,7 +1324,7 @@ void ihipInit() READ_ENV_I(release, HIP_TRACE_API, 0, "Trace each HIP API call. Print function name and return code to stderr as program executes."); READ_ENV_S(release, HIP_TRACE_API_COLOR, 0, "Color to use for HIP_API. None/Red/Green/Yellow/Blue/Magenta/Cyan/White"); - READ_ENV_I(release, HIP_PROFILE_API, 0, "Add HIP API markers to ATP file generated with CodeXL"); + READ_ENV_I(release, HIP_PROFILE_API, 0, "Add HIP API markers to ATP file generated with CodeXL. 0x1=short API name, 0x2=full API name including args."); READ_ENV_S(release, HIP_DB_START_API, 0, "Comma-separted list of tid.api_seq_num for when to start debug and profiling."); READ_ENV_S(release, HIP_DB_STOP_API, 0, "Comma-separated list of tid.api_seq_num for when to stop debug and profiling."); @@ -1346,6 +1346,7 @@ void ihipInit() if (HIP_PROFILE_API && !COMPILE_HIP_ATP_MARKER) { fprintf (stderr, "warning: env var HIP_PROFILE_API=0x%x but COMPILE_HIP_ATP_MARKER=0. (perhaps enable COMPILE_HIP_ATP_MARKER in src code before compiling?)\n", HIP_PROFILE_API); + HIP_PROFILE_API = 0; } if (HIP_DB) { @@ -1474,7 +1475,13 @@ void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, c << " sharedMem:+" << lp->dynamic_group_mem_bytes << " " << *stream; - MARKER_BEGIN(os.str().c_str(), "HIP"); + if (HIP_PROFILE_API == 0x1) { + MARKER_BEGIN(os.str().c_str(), "HIP"); + } else if (HIP_PROFILE_API == 0x2) { + std::string shortAtpString("hipLaunchKernel:"); + shortAtpString += kernelName; + MARKER_BEGIN(shortAtpString.c_str(), "HIP"); + } if (COMPILE_HIP_DB && HIP_TRACE_API) { std::cerr << API_COLOR << os.str() << API_COLOR_END << std::endl; diff --git a/src/hip_hcc.h b/src/hip_hcc.h index c415ad3e16..e40fa29f7b 100644 --- a/src/hip_hcc.h +++ b/src/hip_hcc.h @@ -182,7 +182,8 @@ extern void recordApiTrace(std::string *fullStr, const std::string &apiStr); std::string apiStr = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')';\ std::string fullStr;\ recordApiTrace(&fullStr, apiStr);\ - MARKER_BEGIN(fullStr.c_str(), "HIP");\ + if (HIP_PROFILE_API == 0x1) {MARKER_BEGIN(__func__, "HIP") }\ + else if (HIP_PROFILE_API == 0x2) {MARKER_BEGIN(fullStr.c_str(), "HIP"); }\ }\ } #else From 32b086400e2688aac228db037b5604dfe9394894 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 27 Oct 2016 21:26:28 -0500 Subject: [PATCH 21/65] show how to use variety of HIP_PROFILE features Change-Id: I6edd66ac4c068b64e1dc3787d7f1f69ab3238469 --- .../2_Cookbook/2_Profiler/MatrixTranspose.cpp | 175 ++++++++++-------- 1 file changed, 95 insertions(+), 80 deletions(-) diff --git a/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp b/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp index 7500957dfc..b6a6b141d2 100644 --- a/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp +++ b/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp @@ -34,6 +34,8 @@ THE SOFTWARE. #define THREADS_PER_BLOCK_Y 4 #define THREADS_PER_BLOCK_Z 1 +#define ITERATIONS 10 + // Device (Kernel) function, it must be void // hipLaunchParm provides the execution configuration __global__ void matrixTranspose(hipLaunchParm lp, @@ -62,10 +64,72 @@ void matrixTransposeCPUReference( } } -int main() { - //HIP_SCOPED_MARKER(__func__, "MainFunc"); - HIP_BEGIN_MARKER(__func__, "MainFunc"); +// Use a separate function to demonstrate how to use function name as part of scoped marker: +void runGPU(float *Matrix, float *TransposeMatrix, + float* gpuMatrix, float* gpuTransposeMatrix) { + + // __func__ is a standard C++ macro which expands to the name of the function, in this case "runGPU" + HIP_SCOPED_MARKER(__func__, "MyGroup"); + + for (int i=0; i eps ) { - errors++; - } - } - if (errors!=0) { - printf("FAILED: %d errors\n",errors); - } else { - printf ("PASSED!\n"); - } + // verify the results + double eps = 1.0E-6; + for (int i = 0; i < NUM; i++) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) { + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } - //free the resources on device side - hipFree(gpuMatrix); - hipFree(gpuTransposeMatrix); + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); - //free the resources on host side - free(Matrix); - free(TransposeMatrix); - free(cpuTransposeMatrix); - } + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); HIP_END_MARKER(); From bb58f4f6fcb47366880ed5e7c58f5964df9ed285 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 27 Oct 2016 22:05:52 -0500 Subject: [PATCH 22/65] Add initial hipProfileStart/Stop And modify sample to show how to use. Still needs some work to understand interaction with CXL. Change-Id: I2579824d2dd7863ea23874d34f0dabb3cb305d3e --- include/hip/hcc_detail/hip_runtime_api.h | 17 +++++++++-- include/hip/nvcc_detail/hip_runtime_api.h | 14 +++++++++ samples/2_Cookbook/2_Profiler/Makefile | 7 +++++ .../2_Cookbook/2_Profiler/MatrixTranspose.cpp | 27 +++++++++++++++-- src/hip_hcc.cpp | 29 ++++++++++++++++++- 5 files changed, 89 insertions(+), 5 deletions(-) diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index 8c575eedc0..b62d0c4957 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -1737,11 +1737,24 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, * * @warning The cudaProfilerInitialize API format for "configFile" is not supported. * - * On AMD platforms, hipProfilerStart and hipProfilerStop require installation of AMD's GPU - * perf counter API and defining GPU_PERF */ +// TODO - expand descriptions: +/** + * @brief Start recording of profiling information + * @warning : hipProfilerStart API is under development. + */ +hipError_t hipProfilerStart(); + + +/** + * @brief Stop recording of profiling information. + * @warning : hipProfilerStop API is under development. + */ +hipError_t hipProfilerStop(); + + /** * @} */ diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h index 1436008dd1..f4a9fd8e1b 100644 --- a/include/hip/nvcc_detail/hip_runtime_api.h +++ b/include/hip/nvcc_detail/hip_runtime_api.h @@ -643,6 +643,18 @@ inline static hipError_t hipMemcpyPeerAsync ( void* dst, int dstDevice, const v return hipCUDAErrorTohipError(cudaMemcpyPeerAsync(dst, dstDevice, src, srcDevice, count, stream)); } +// Profile APIs: +inline hipError_t hipProfilerStart() +{ + return hipCUDAErrorTohipError(cudaProfileStart()); +} + +inline hipError_t hipProfilerStop() +{ + return hipCUDAErrorTohipError(cudaProfileStop()); +} + + inline static hipError_t hipSetDeviceFlags (unsigned int flags) { return hipCUDAErrorTohipError(cudaSetDeviceFlags(flags)); @@ -802,6 +814,8 @@ inline static hipError_t hipModuleLaunchKernel(hipFunction_t f, sharedMemBytes, stream, kernelParams, extra)); } + + #ifdef __cplusplus } #endif diff --git a/samples/2_Cookbook/2_Profiler/Makefile b/samples/2_Cookbook/2_Profiler/Makefile index 4b9a063f38..db2d008182 100644 --- a/samples/2_Cookbook/2_Profiler/Makefile +++ b/samples/2_Cookbook/2_Profiler/Makefile @@ -35,6 +35,13 @@ profile: $(EXECUTABLE) $(HIPPROFILER_POST_CMD) +# Pass option to control start and stop iterations for profiling - see MatrixTranspose.cpp for implementation: +# Note we start profiler in --startdisabled mode - no timing collected until app enabled it via hipProfilerStart() +profile_trigger: $(EXECUTABLE) + $(HIPPROFILER) $(PROFILER_OPT) --startdisabled $(EXECUTABLE) 3 6 + $(HIPPROFILER_POST_CMD) + + run: $(EXECUTABLE) $(EXECUTABLE) diff --git a/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp b/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp index b6a6b141d2..3747bb4ec5 100644 --- a/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp +++ b/samples/2_Cookbook/2_Profiler/MatrixTranspose.cpp @@ -36,6 +36,10 @@ THE SOFTWARE. #define ITERATIONS 10 +// Cmdline parms to control start and stop triggers +int startTriggerIteration=-1; +int stopTriggerIteration=-1; + // Device (Kernel) function, it must be void // hipLaunchParm provides the execution configuration __global__ void matrixTranspose(hipLaunchParm lp, @@ -74,6 +78,13 @@ void runGPU(float *Matrix, float *TransposeMatrix, for (int i=0; i= 2) { + startTriggerIteration = atoi(argv[1]); + printf ("info : will start tracing at iteration:%d\n", startTriggerIteration); + } + if (argc >= 3) { + stopTriggerIteration = atoi(argv[2]); + printf ("info : will stop tracing at iteration:%d\n", stopTriggerIteration); + } float* Matrix; float* TransposeMatrix; @@ -166,6 +186,8 @@ int main() { // allocate the memory on the device side hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); + + // FYI, the scoped-marker will be destroyed here when the scope exits, and will record its "end" timestamp. } runGPU(Matrix, TransposeMatrix, gpuMatrix, gpuTransposeMatrix); @@ -204,7 +226,8 @@ int main() { free(TransposeMatrix); free(cpuTransposeMatrix); - HIP_END_MARKER(); + // This ends the last marker started in this thread, in this case "Check&TearDown" + HIP_END_MARKER(); return errors; } diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index e0a979b1e9..7d79f0a930 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -63,7 +63,9 @@ int HIP_LAUNCH_BLOCKING = 0; int HIP_PRINT_ENV = 0; int HIP_TRACE_API= 0; std::string HIP_TRACE_API_COLOR("green"); -int HIP_PROFILE_API= 0; +int HIP_PROFILE_API= 0;S + +// TODO - DB_START/STOP need more testing. std::string HIP_DB_START_API; std::string HIP_DB_STOP_API; int HIP_DB= 0; @@ -1891,6 +1893,31 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes } } +//------------------------------------------------------------------------------------------------- +//------------------------------------------------------------------------------------------------- +//Profiler, really these should live elsewhere: +hipError_t hipProfilerStart() +{ + HIP_INIT_API(); +#if COMPILE_HIP_ATP_MARKER + amdtResumeProfiling(AMDT_ALL_PROFILING); +#endif + + return ihipLogStatus(hipSuccess); +}; + + +hipError_t hipProfilerStop() +{ + HIP_INIT_API(); +#if COMPILE_HIP_ATP_MARKER + amdtStopProfiling(AMDT_ALL_PROFILING); +#endif + + return ihipLogStatus(hipSuccess); +}; + + //------------------------------------------------------------------------------------------------- //------------------------------------------------------------------------------------------------- // HCC-specific accessor functions: From 024d9ab09031d10c13e41fbb17df88e21d1468bc Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 27 Oct 2016 23:08:18 -0500 Subject: [PATCH 23/65] Print short hipLaunchKernel correctly. Change-Id: I6ca03d7c707cd03d6982199830213953d5855f17 --- bin/hipdemangleatp | 3 +++ src/hip_hcc.cpp | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/bin/hipdemangleatp b/bin/hipdemangleatp index b6734a9e24..d5061ed7d2 100755 --- a/bin/hipdemangleatp +++ b/bin/hipdemangleatp @@ -16,3 +16,6 @@ done # real_sym=$(c++filt $(c++filt $mangled_sym | cut -d: -f3 | sed 's/_functor//g') | cut -d\( -f1 | cut -d" " -f2) # sed -i "s/$mangled_sym/$real_sym/g" $1 #done +# +#sed -e "s/^/_/g; s/_EC_/$/g" < test.txt | c++filt + diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 7d79f0a930..c125829739 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -63,7 +63,7 @@ int HIP_LAUNCH_BLOCKING = 0; int HIP_PRINT_ENV = 0; int HIP_TRACE_API= 0; std::string HIP_TRACE_API_COLOR("green"); -int HIP_PROFILE_API= 0;S +int HIP_PROFILE_API= 0; // TODO - DB_START/STOP need more testing. std::string HIP_DB_START_API; @@ -1478,11 +1478,11 @@ void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, c << " " << *stream; if (HIP_PROFILE_API == 0x1) { - MARKER_BEGIN(os.str().c_str(), "HIP"); - } else if (HIP_PROFILE_API == 0x2) { std::string shortAtpString("hipLaunchKernel:"); shortAtpString += kernelName; MARKER_BEGIN(shortAtpString.c_str(), "HIP"); + } else if (HIP_PROFILE_API == 0x2) { + MARKER_BEGIN(os.str().c_str(), "HIP"); } if (COMPILE_HIP_DB && HIP_TRACE_API) { From 6fa9bc3b61d1d78a3414511fd658281399d27165 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 28 Oct 2016 14:12:13 +0530 Subject: [PATCH 24/65] hipcc: Update flags for Fedora support Change-Id: I90be7768410e491b4f11c3b0f08470246d781d80 --- bin/hipcc | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bin/hipcc b/bin/hipcc index 2f3cd46c62..f5692e6fca 100755 --- a/bin/hipcc +++ b/bin/hipcc @@ -116,9 +116,16 @@ if ($HIP_PLATFORM eq "hcc") { } else { $HIPLDFLAGS .= " -Wl,--defsym=_binary_kernel_spir_end=1 -Wl,--defsym=_binary_kernel_spir_start=1 -Wl,--defsym=_binary_kernel_cl_start=1 -Wl,--defsym=_binary_kernel_cl_end=1"; } + if ($HOST_OSNAME eq "fedora") { + $HIPCXXFLAGS .= " -I/usr/local/include/c++/v1"; + } # Satisfy HCC dependencies - $HIPLDFLAGS .= " -lc++abi -lsupc++"; + if ($HOST_OSNAME eq "fedora") { + $HIPLDFLAGS .= " -lc++abi"; + } else { + $HIPLDFLAGS .= " -lc++abi -lsupc++"; + } $HIPLDFLAGS .= " -L$HSA_PATH/lib -L$ROCM_PATH/lib -lhsa-runtime64 -lhc_am -lhsakmt"; # Handle ROCm target platform From c5a2ad496476b42c1425b97b8a4ebf8b2c2a1165 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 28 Oct 2016 14:12:53 +0530 Subject: [PATCH 25/65] CMakeLists.txt: Update include paths needed for Fedora support Change-Id: Ib84f9dba30d2c64f344d6f8e85ddbe15f30af1a0 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ea2e0f877..8958eaa090 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -164,7 +164,7 @@ if(HIP_PLATFORM STREQUAL "hcc") set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -DHIP_VERSION_MAJOR=${HIP_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_VERSION_MINOR} -DHIP_VERSION_PATCH=${HIP_VERSION_PATCH}") # Add remaining flags - set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -fPIC -hc -I${HCC_HOME}/include -I${HSA_PATH}/include -I/opt/rocm/libhsakmt/include -stdlib=libc++") + set(HIP_HCC_BUILD_FLAGS "${HIP_HCC_BUILD_FLAGS} -fPIC -hc -I${HCC_HOME}/include -I${HSA_PATH}/include -I/opt/rocm/libhsakmt/include -I/usr/local/include/c++/v1 -stdlib=libc++") # Set compiler and compiler flags set(CMAKE_CXX_COMPILER "${HCC_HOME}/bin/hcc") From c39ddad215f98d93b0fef6786427be2275bbb081 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 28 Oct 2016 15:46:59 +0530 Subject: [PATCH 26/65] hipdemangleatp: Try handling HC kernels as well Change-Id: Ie438ddd28e5bc6067fcd682df849d3183046b40a --- bin/hipdemangleatp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/bin/hipdemangleatp b/bin/hipdemangleatp index d5061ed7d2..f979f59434 100755 --- a/bin/hipdemangleatp +++ b/bin/hipdemangleatp @@ -6,16 +6,14 @@ kernels=$(grep grid_launch_parm $1 | cut -d" " -f1 | sort | uniq) for mangled_sym in $kernels; do real_sym=$(c++filt -p $(c++filt _$mangled_sym | cut -d: -f3 | sed 's/_functor//g')) - echo "$mangled_sym => $real_sym" >> $1.log + #echo "$mangled_sym => $real_sym" >> $1.log sed -i "s/$mangled_sym/$real_sym/g" $1 done # HC kernels -#kernels=$(grep cxxamp_trampoline $1 | cut -d" " -f1 | sort | uniq) -#for mangled_sym in $kernels; do -# real_sym=$(c++filt $(c++filt $mangled_sym | cut -d: -f3 | sed 's/_functor//g') | cut -d\( -f1 | cut -d" " -f2) -# sed -i "s/$mangled_sym/$real_sym/g" $1 -#done -# -#sed -e "s/^/_/g; s/_EC_/$/g" < test.txt | c++filt - +kernels=$(grep cxxamp_trampoline $1 | cut -d" " -f1 | sort | uniq) +for mangled_sym in $kernels; do + real_sym=$(echo $mangled_sym | sed "s/^/_/g; s/_EC_/_$/g" | c++filt -p | cut -d\( -f1 | cut -d" " -f1 --complement) + #echo "$mangled_sym => $real_sym" >> $1.log + sed -i "s/$mangled_sym/$real_sym/g" $1 +done From 87a2e8f12b6983200afad11c9b06ff5956300ff4 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 28 Oct 2016 07:08:27 -0500 Subject: [PATCH 27/65] add hip_profile.h Change-Id: Id43a4336db53567020584cb7842baf5c1649fd8e --- include/hip/hip_profile.h | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 include/hip/hip_profile.h diff --git a/include/hip/hip_profile.h b/include/hip/hip_profile.h new file mode 100644 index 0000000000..489143adfd --- /dev/null +++ b/include/hip/hip_profile.h @@ -0,0 +1,38 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#pragma once + +#if not defined (ENABLE_HIP_PROFILE) +#define ENABLE_HIP_PROFILE 1 +#endif + +#if defined(__HIP_PLATFORM_HCC__) and (ENABLE_HIP_PROFILE==1) +#include +#define HIP_SCOPED_MARKER(markerName, group) amdtScopedMarker __scopedMarker(markerName, group, nullptr); +#define HIP_BEGIN_MARKER(markerName, group) amdtBeginMarker(markerName, group, nullptr); +#define HIP_END_MARKER() amdtEndMarker(); +#else +#define HIP_SCOPED_MARKER(markerName, group) +#define HIP_BEGIN_MARKER(markerName, group) +#define HIP_END_MARKER() +#endif From b0f37d0d2ed2c6c0bff4a2c6fc05cfd56c822e39 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 28 Oct 2016 18:32:13 +0300 Subject: [PATCH 28/65] * [HIPIFY] Initial Profiler support. CUDA Driver API porting to HIP: + cuProfilerStart, cuProfilerStop. - cuProfilerInitialize & cudaProfilerInitialize - unsupported yet by HIP. --- hipify-clang/src/Cuda2Hip.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index 33e38b2c4b..7a64db715f 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -423,7 +423,11 @@ struct cuda2hipMap { cuda2hipRename["cuMemHostRegister_v2"] = {"hipHostRegister", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemHostUnregister"] = {"hipHostUnregister", CONV_MEM, API_DRIVER}; - + // Profiler + // unsupported yet by HIP + // cuda2hipRename["cuProfilerInitialize"] = {"hipProfilerInitialize", CONV_OTHER, API_DRIVER}; + cuda2hipRename["cuProfilerStart"] = {"hipProfilerStart", CONV_OTHER, API_DRIVER}; + cuda2hipRename["cuProfilerStop"] = {"hipProfilerStop", CONV_OTHER, API_DRIVER}; /////////////////////////////// CUDA RT API /////////////////////////////// // Error API From 1b64c167f6f9666e0c45059d9dc8b2605e94ffa6 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 28 Oct 2016 20:05:51 +0300 Subject: [PATCH 29/65] [HIPIFY] wrap kernel name with HIP_KERNEL_NAME macros... only in case of commas in it. --- hipify-clang/src/Cuda2Hip.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index 7a64db715f..87a69e8cb9 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -1610,7 +1610,11 @@ class Cuda2HipCallback : public MatchFinder::MatchCallback { } } XStr.clear(); - OS << "hipLaunchKernel(HIP_KERNEL_NAME(" << calleeName << "),"; + if (calleeName.find(',') != StringRef::npos) { + SmallString<128> tmpData; + calleeName = Twine("HIP_KERNEL_NAME(" + calleeName + ")").toStringRef(tmpData); + } + OS << "hipLaunchKernel(" << calleeName << ","; const CallExpr *config = launchKernel->getConfig(); DEBUG(dbgs() << "Kernel config arguments:" << "\n"); SourceManager *SM = Result.SourceManager; From 2d15d0741c84f92e662c5a92381b3332d1093f14 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Tue, 1 Nov 2016 10:57:48 +0530 Subject: [PATCH 30/65] Added hipDeviceGetByPCIBusId in hip/hcc path Change-Id: I3cca0dc533d0281689d8a407c7da16ca1ba6a3a8 --- include/hip/hcc_detail/hip_runtime_api.h | 10 ++++++++++ src/hip_device.cpp | 20 ++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index b62d0c4957..8eecd1650f 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -1591,6 +1591,16 @@ hipError_t hipDeviceGetName(char *name,int len,hipDevice_t device); */ hipError_t hipDeviceGetPCIBusId (int *pciBusId,int len,hipDevice_t device); +/** + * @brief Returns a handle to a compute device. + * @param [out] device handle + * @param [in] PCI Bus ID + * + * @returns #hipSuccess, #hipErrorInavlidDevice, #hipErrorInvalidValue + */ +hipError_t hipDeviceGetByPCIBusId ( int* device,const int* pciBusId ); + + /** * @brief Returns the total amount of memory on the device. * @param [out] bytes diff --git a/src/hip_device.cpp b/src/hip_device.cpp index 9d577f5313..29ab0805b8 100644 --- a/src/hip_device.cpp +++ b/src/hip_device.cpp @@ -340,8 +340,28 @@ hipError_t hipDeviceTotalMem (size_t *bytes,hipDevice_t device) return ihipLogStatus(e); } +hipError_t hipDeviceGetByPCIBusId (int* device, const int* pciBusId ) +{ + HIP_INIT_API(device,pciBusId); + hipDeviceProp_t tempProp; + int deviceCount; + hipError_t e = hipErrorInvalidValue; + hipGetDeviceCount( &deviceCount ); + *device = 0; + for (int i=0; i< deviceCount; i++) { + hipGetDeviceProperties( &tempProp, i ); + if(tempProp.pciBusID == *pciBusId) { + *device =i; + e = hipSuccess; + break; + } + } + return ihipLogStatus(e); +} + hipError_t hipChooseDevice( int* device, const hipDeviceProp_t* prop ) { + HIP_INIT_API(device,prop); hipDeviceProp_t tempProp; int deviceCount; int inPropCount=0; From f48c53534efe766f0cf6565b285ab58a7ce3be2a Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 1 Nov 2016 16:37:33 -0500 Subject: [PATCH 31/65] added inter thread data movement intrinsics Change-Id: I2a8a8ed49429cb7f96439bd28c4b83b5142737df --- include/hip/hcc_detail/hip_runtime.h | 9 +++++++++ src/hip_hcc.cpp | 26 ++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index 1bda07eb7d..65bf42516f 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -560,6 +560,15 @@ extern "C" __device__ void __threadfence(void); */ __device__ void __threadfence_system(void) __attribute__((deprecated("Provided for compile-time compatibility, not yet functional"))); +__device__ unsigned __hip_ds_bpermute(int index, unsigned src); +__device__ float __hip_ds_bpermutef(int index, float src); +__device__ unsigned __hip_ds_permute(int index, unsigned src); +__device__ float __hip_ds_permutef(int index, float src); + +__device__ unsigned __hip_ds_swizzle(unsigned int src, int pattern); +__device__ float __hip_ds_swizzlef(float src, int pattern); + +__device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl); // doxygen end Fence Fence /** diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index c125829739..15fd3a8237 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -180,7 +180,33 @@ __device__ void* __hip_hc_free(void *ptr) return nullptr; } +__device__ unsigned __hip_ds_bpermute(int index, unsigned src) { + return hc::__amdgcn_ds_bpermute(index, src); +} + +__device__ float __hip_ds_bpermutef(int index, float src) { + return hc::__amdgcn_ds_bpermute(index, src); +} + +__device__ unsigned __hip_ds_permute(int index, unsigned src) { + return hc::__amdgcn_ds_permute(index, src); +} + +__device__ float __hip_ds_permutef(int index, float src) { + return hc::__amdgcn_ds_permute(index, src); +} + +__device__ unsigned __hip_ds_swizzle(unsigned int src, int pattern) { + return hc::__amdgcn_ds_swizzle(src, pattern); +} +__device__ float __hip_ds_swizzlef(float src, int pattern) { + return hc::__amdgcn_ds_swizzle(src, pattern); +} + +__device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask, bool bound_ctrl) { + return hc::__amdgcn_move_dpp(src, dpp_ctrl, row_mask, bank_mask, bound_ctrl); +} //================================================================================================= // Thread-local storage: //================================================================================================= From 4817131cdcd3fdc0aeedb374dac78ca8b21d6af7 Mon Sep 17 00:00:00 2001 From: pensun Date: Tue, 1 Nov 2016 20:30:56 -0500 Subject: [PATCH 32/65] Update hipStreamNonBlocking to use cuda define on NV path Change-Id: I74ea09db99d602ba1c5f192b36ff7f2781176e6a --- include/hip/nvcc_detail/hip_runtime_api.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h index f4a9fd8e1b..cb4a3f8676 100644 --- a/include/hip/nvcc_detail/hip_runtime_api.h +++ b/include/hip/nvcc_detail/hip_runtime_api.h @@ -84,8 +84,8 @@ typedef CUfunction hipFunction_t; typedef CUdeviceptr hipDeviceptr_t; // Flags that can be used with hipStreamCreateWithFlags -#define hipStreamDefault 0x00 ///< Default stream creation flags -#define hipStreamNonBlocking 0x01 ///< Stream does not implicitly synchronize with null stream +#define hipStreamDefault cudaStreamDefault +#define hipStreamNonBlocking cudaStreamNonBlocking //typedef cudaChannelFormatDesc hipChannelFormatDesc; #define hipChannelFormatDesc cudaChannelFormatDesc @@ -644,13 +644,13 @@ inline static hipError_t hipMemcpyPeerAsync ( void* dst, int dstDevice, const v } // Profile APIs: -inline hipError_t hipProfilerStart() -{ +inline hipError_t hipProfilerStart() +{ return hipCUDAErrorTohipError(cudaProfileStart()); } -inline hipError_t hipProfilerStop() -{ +inline hipError_t hipProfilerStop() +{ return hipCUDAErrorTohipError(cudaProfileStop()); } From 774de273d0eebd2223547c56d3774cfaaedb0d8e Mon Sep 17 00:00:00 2001 From: pensun Date: Wed, 2 Nov 2016 16:08:27 -0500 Subject: [PATCH 33/65] Update document for workaround suggestion on threadfence_system() Change-Id: Icccab8270604a0e578a8614b9afb3f95372f4966 --- docs/markdown/hip_kernel_language.md | 3 +-- include/hip/hcc_detail/hip_runtime.h | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/markdown/hip_kernel_language.md b/docs/markdown/hip_kernel_language.md index 3b13cde08a..d629063f73 100644 --- a/docs/markdown/hip_kernel_language.md +++ b/docs/markdown/hip_kernel_language.md @@ -233,8 +233,7 @@ typedef struct dim3 { ## Memory-Fence Instructions HIP supports __threadfence() and __threadfence_block(). -Applications that use threadfence_system can disable the L1 and L2 caches on the GPU by: -"export HSA_DISABLE_CACHE=1". See the hip_porting_guide.md#threadfence_system for more information. +Support for threadfence_system() is under development. ## Synchronization Functions The __syncthreads() built-in function is supported in HIP. The __syncthreads_count(int), __syncthreads_and(int) and __syncthreads_or(int) functions are under development. diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index 65bf42516f..73fc022362 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -556,7 +556,7 @@ extern "C" __device__ void __threadfence(void); * * @param void * - * @warning __threadfence_system is a stub and map to no-op, application should set "export HSA_DISABLE_CACHE=1" to disable both L1 and L2 caches. + * @warning __threadfence_system is a stub and map to no-op. */ __device__ void __threadfence_system(void) __attribute__((deprecated("Provided for compile-time compatibility, not yet functional"))); From f0bb817c11cd0131dc675c3032846fbe8d7466dd Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 4 Nov 2016 06:05:00 -0500 Subject: [PATCH 34/65] Pre-pend HIP_PATH/lib to linker, so we find developer object code Previously might pick up libs from /opt/rocm/lib. Change-Id: Ia7adb345defe433d5952aa61706fe03fd7cbcd35 --- bin/hipcc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bin/hipcc b/bin/hipcc index 5f82966f3c..fa0a3dc467 100755 --- a/bin/hipcc +++ b/bin/hipcc @@ -315,12 +315,13 @@ if ($setStdLib eq 0 and $HIP_PLATFORM eq 'hcc') if ($needHipHcc) { $HIP_LIB_TYPE = $hipConfig{'HIP_LIB_TYPE'} // 0; + # TODO - remove the old sea-of-objects solution: if ($HIP_LIB_TYPE eq 0) { - $HIPLDFLAGS .= " $HIP_PATH/lib/device_util.cpp.o $HIP_PATH/lib/hip_device.cpp.o $HIP_PATH/lib/hip_error.cpp.o $HIP_PATH/lib/hip_event.cpp.o $HIP_PATH/lib/hip_hcc.cpp.o $HIP_PATH/lib/hip_memory.cpp.o $HIP_PATH/lib/hip_peer.cpp.o $HIP_PATH/lib/hip_stream.cpp.o $HIP_PATH/lib/hip_ldg.cpp.o $HIP_PATH/lib/hip_fp16.cpp.o $HIP_PATH/lib/hip_context.cpp.o $HIP_PATH/lib/hip_module.cpp.o"; + substr($HIPLDFLAGS,0,0) = " $HIP_PATH/lib/device_util.cpp.o $HIP_PATH/lib/hip_device.cpp.o $HIP_PATH/lib/hip_error.cpp.o $HIP_PATH/lib/hip_event.cpp.o $HIP_PATH/lib/hip_hcc.cpp.o $HIP_PATH/lib/hip_memory.cpp.o $HIP_PATH/lib/hip_peer.cpp.o $HIP_PATH/lib/hip_stream.cpp.o $HIP_PATH/lib/hip_ldg.cpp.o $HIP_PATH/lib/hip_fp16.cpp.o $HIP_PATH/lib/hip_context.cpp.o $HIP_PATH/lib/hip_module.cpp.o "; } elsif ($HIP_LIB_TYPE eq 1) { - $HIPLDFLAGS .= " -L$HIP_PATH/lib -lhip_hcc" ; + substr($HIPLDFLAGS,0,0) = " -L$HIP_PATH/lib -lhip_hcc " ; } else { - $HIPLDFLAGS .= " -L$HIP_PATH/lib -Wl,--rpath=$HIP_PATH/lib -lhip_hcc"; + substr($HIPLDFLAGS,0,0) = " -L$HIP_PATH/lib -Wl,--rpath=$HIP_PATH/lib -lhip_hcc "; } } From 5832349e5ec8b05537d245511bac6d98823f9022 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 4 Nov 2016 06:34:07 -0500 Subject: [PATCH 35/65] Print non-peers too Change-Id: I2a6905edcdf144aa732ae3120c17780477f232ac --- samples/1_Utils/hipInfo/hipInfo.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/samples/1_Utils/hipInfo/hipInfo.cpp b/samples/1_Utils/hipInfo/hipInfo.cpp index 0403162bd1..42a879e732 100644 --- a/samples/1_Utils/hipInfo/hipInfo.cpp +++ b/samples/1_Utils/hipInfo/hipInfo.cpp @@ -133,6 +133,15 @@ void printDeviceProp (int deviceId) } } cout << endl; + cout << setw(w1) << "non-peers: "; + for (int i=0; i Date: Fri, 4 Nov 2016 08:51:16 -0500 Subject: [PATCH 36/65] Add debug for Peer APIs. Enable PeerMemcpy APIs by default. Change-Id: I46e39a9e7b07686a78484c1f3b5495b08e052fbb --- include/hip/hcc_detail/hip_runtime_api.h | 6 ++- src/hip_hcc.cpp | 50 ++++++++++++++---------- src/hip_hcc.h | 19 +++++---- src/hip_peer.cpp | 39 ++++++++++++------ 4 files changed, 70 insertions(+), 44 deletions(-) diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index 8eecd1650f..82eba49771 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -1278,7 +1278,11 @@ hipError_t hipDeviceEnablePeerAccess (int peerDeviceId, unsigned int flags); hipError_t hipDeviceDisablePeerAccess (int peerDeviceId); -#ifdef PEER_NON_UNIFIED +#ifndef USE_PEER_NON_UNIFIED +#define USE_PEER_NON_UNIFIED 1 +#endif + +#if USE_PEER_NON_UNIFIED==1 /** * @brief Copies memory from one device to memory on another device. * diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 15fd3a8237..74b0fcbd28 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -73,6 +73,8 @@ int HIP_VISIBLE_DEVICES = 0; /* Contains a comma-separated sequence of GPU ident int HIP_NUM_KERNELS_INFLIGHT = 128; int HIP_WAIT_MODE = 0; +int HIP_FORCE_P2P_HOST = 0; + @@ -540,7 +542,7 @@ void ihipCtxCriticalBase_t::recomputePeerAgents() template<> -bool ihipCtxCriticalBase_t::isPeer(const ihipCtx_t *peer) +bool ihipCtxCriticalBase_t::isPeerWatcher(const ihipCtx_t *peer) { auto match = std::find(_peers.begin(), _peers.end(), peer); return (match != std::end(_peers)); @@ -548,12 +550,14 @@ bool ihipCtxCriticalBase_t::isPeer(const ihipCtx_t *peer) template<> -bool ihipCtxCriticalBase_t::addPeer(ihipCtx_t *peer) +bool ihipCtxCriticalBase_t::addPeerWatcher(const ihipCtx_t *thisCtx, ihipCtx_t *peerWatcher) { - auto match = std::find(_peers.begin(), _peers.end(), peer); + auto match = std::find(_peers.begin(), _peers.end(), peerWatcher); if (match == std::end(_peers)) { // Not already a peer, let's update the list: - _peers.push_back(peer); + tprintf(DB_COPY, "addPeerWatcher. Allocations on %s now visible to peerWatcher %s.\n", + thisCtx->toString().c_str(), peerWatcher->toString().c_str()); + _peers.push_back(peerWatcher); recomputePeerAgents(); return true; } @@ -564,12 +568,14 @@ bool ihipCtxCriticalBase_t::addPeer(ihipCtx_t *peer) template<> -bool ihipCtxCriticalBase_t::removePeer(ihipCtx_t *peer) +bool ihipCtxCriticalBase_t::removePeerWatcher(const ihipCtx_t *thisCtx, ihipCtx_t *peerWatcher) { - auto match = std::find(_peers.begin(), _peers.end(), peer); + auto match = std::find(_peers.begin(), _peers.end(), peerWatcher); if (match != std::end(_peers)) { // Found a valid peer, let's remove it. - _peers.remove(peer); + tprintf(DB_COPY, "removePeerWatcher. Allocations on %s no longer visible to former peerWatcher %s.\n", + thisCtx->toString().c_str(), peerWatcher->toString().c_str()); + _peers.remove(peerWatcher); recomputePeerAgents(); return true; } else { @@ -579,16 +585,17 @@ bool ihipCtxCriticalBase_t::removePeer(ihipCtx_t *peer) template<> -void ihipCtxCriticalBase_t::resetPeers(ihipCtx_t *thisDevice) +void ihipCtxCriticalBase_t::resetPeerWatchers(ihipCtx_t *thisCtx) { + tprintf(DB_COPY, "resetPeerWatchers for context=%s\n", thisCtx->toString().c_str()); _peers.clear(); _peerCnt = 0; - addPeer(thisDevice); // peer-list always contains self agent. + addPeerWatcher(thisCtx, thisCtx); // peer-list always contains self agent. } template<> -void ihipCtxCriticalBase_t::printPeers(FILE *f) const +void ihipCtxCriticalBase_t::printPeerWatchers(FILE *f) const { for (auto iter = _peers.begin(); iter!=_peers.end(); iter++) { fprintf (f, "%s ", (*iter)->toString().c_str()); @@ -993,7 +1000,7 @@ void ihipCtx_t::locked_reset() // Reset peer list to just me: - crit->resetPeers(this); + crit->resetPeerWatchers(this); // Reset and release all memory stored in the tracker: // Reset will remove peer mapping so don't need to do this explicitly. @@ -1360,7 +1367,7 @@ void ihipInit() READ_ENV_I(release, HIP_WAIT_MODE, 0, "Force synchronization mode. 1= force yield, 2=force spin, 0=defaults specified in application"); - + READ_ENV_I(release, HIP_FORCE_P2P_HOST, 0, "Force use of host/staging copy for peer-to-peer copiecopies"); READ_ENV_I(release, HIP_NUM_KERNELS_INFLIGHT, 128, "Max number of inflight kernels per stream before active synchronization is forced."); // Some flags have both compile-time and runtime flags - generate a warning if user enables the runtime flag but the compile-time flag is disabled. @@ -1726,14 +1733,14 @@ void ihipSetTs(hipEvent_t e) // So we check dstCtx's and srcCtx's peerList to see if the both include thisCtx. bool ihipStream_t::canSeePeerMemory(const ihipCtx_t *thisCtx, ihipCtx_t *dstCtx, ihipCtx_t *srcCtx) { - tprintf (DB_COPY1, "Checking if direct copy can be used. thisCtx:%s; dstCtx:%s ; srcCtx:%s\n", + tprintf (DB_COPY, "Checking if direct copy can be used. thisCtx:%s; dstCtx:%s ; srcCtx:%s\n", thisCtx->toString().c_str(), dstCtx->toString().c_str(), srcCtx->toString().c_str()); // Use blocks to control scope of critical sections. { LockedAccessor_CtxCrit_t ctxCrit(dstCtx->criticalData()); tprintf(DB_SYNC, "dstCrit lock succeeded\n"); - if (!ctxCrit->isPeer(thisCtx)) { + if (!ctxCrit->isPeerWatcher(thisCtx)) { return false; }; } @@ -1741,7 +1748,7 @@ bool ihipStream_t::canSeePeerMemory(const ihipCtx_t *thisCtx, ihipCtx_t *dstCtx, { LockedAccessor_CtxCrit_t ctxCrit(srcCtx->criticalData()); tprintf(DB_SYNC, "srcCrit lock succeeded\n"); - if (!ctxCrit->isPeer(thisCtx)) { + if (!ctxCrit->isPeerWatcher(thisCtx)) { return false; }; } @@ -1832,13 +1839,13 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, if (hcCopyDir == hc::hcMemcpyDeviceToDevice) { if (!canSeePeerMemory(ctx, ihipGetPrimaryCtx(dstPtrInfo._appId), ihipGetPrimaryCtx(srcPtrInfo._appId))) { forceHostCopyEngine = true; - tprintf (DB_COPY1, "Forcing use of host copy engine.\n"); + tprintf (DB_COPY, "Forcing use of host copy engine.\n"); } else { - tprintf (DB_COPY1, "Will use SDMA engine on streamDevice=%s.\n", ctx->toString().c_str()); + tprintf (DB_COPY, "Will use SDMA engine on streamDevice=%s.\n", ctx->toString().c_str()); } }; - tprintf (DB_COPY1, "locked_copy dir=%s dst=%p src=%p sz=%zu\n", memcpyStr(kind), src, dst, sizeBytes); + tprintf (DB_COPY, "locked_copy dir=%s dst=%p src=%p sz=%zu\n", memcpyStr(kind), src, dst, sizeBytes); { LockedAccessor_StreamCrit_t crit (_criticalData); @@ -1859,12 +1866,12 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes const ihipCtx_t *ctx = this->getCtx(); if ((ctx == nullptr) || (ctx->getDevice() == nullptr)) { - tprintf (DB_COPY1, "locked_copyAsync bad ctx or device\n"); + tprintf (DB_COPY, "locked_copyAsync bad ctx or device\n"); throw ihipException(hipErrorInvalidDevice); } if (kind == hipMemcpyHostToHost) { - tprintf (DB_COPY1, "locked_copyAsync: H2H with memcpy"); + tprintf (DB_COPY, "locked_copyAsync: H2H with memcpy"); // TODO - consider if we want to perhaps use the GPU SDMA engines anyway, to avoid the host-side sync here and keep everything flowing on the GPU. /* As this is a CPU op, we need to wait until all @@ -1890,7 +1897,7 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes copyEngineCanSeeSrcAndDest = canSeePeerMemory(ctx, ihipGetPrimaryCtx(dstPtrInfo._appId), ihipGetPrimaryCtx(srcPtrInfo._appId)); } - tprintf (DB_COPY1, "locked_copyAsync: async memcpy dstTracked=%d srcTracked=%d copyEngineCanSeeSrcAndDest=%d\n", + tprintf (DB_COPY, "locked_copyAsync: async memcpy dstTracked=%d srcTracked=%d copyEngineCanSeeSrcAndDest=%d\n", dstTracked, srcTracked, copyEngineCanSeeSrcAndDest); @@ -1915,6 +1922,7 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes } else { // TODO - call copy_ext directly here? locked_copySync(dst, src, sizeBytes, kind); + //crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceHostCopyEngine); } } } diff --git a/src/hip_hcc.h b/src/hip_hcc.h index e40fa29f7b..ad22789fda 100644 --- a/src/hip_hcc.h +++ b/src/hip_hcc.h @@ -43,6 +43,7 @@ THE SOFTWARE. //static const int debug = 0; extern const int release; +// TODO - this blocks both kernels and memory ops. Perhaps should have separate env var for kernels? extern int HIP_LAUNCH_BLOCKING; extern int HIP_PRINT_ENV; @@ -225,9 +226,8 @@ extern void recordApiTrace(std::string *fullStr, const std::string &apiStr); #define DB_API 0 /* 0x01 - shortcut to enable HIP_TRACE_API on single switch */ #define DB_SYNC 1 /* 0x02 - trace synchronization pieces */ #define DB_MEM 2 /* 0x04 - trace memory allocation / deallocation */ -#define DB_COPY1 3 /* 0x08 - trace memory copy commands. . */ +#define DB_COPY 3 /* 0x08 - trace memory copy and peer commands. . */ #define DB_SIGNAL 4 /* 0x10 - trace signal pool commands */ -#define DB_COPY2 5 /* 0x20 - trace memory copy commands. Detailed. */ #define DB_MAX_FLAG 5 // When adding a new debug flag, also add to the char name table below. // @@ -242,9 +242,8 @@ static const DbName dbName [] = {KGRN, "api"}, // not used, {KYEL, "sync"}, {KCYN, "mem"}, - {KMAG, "copy1"}, + {KMAG, "copy"}, {KRED, "signal"}, - {KNRM, "copy2"}, }; @@ -596,11 +595,11 @@ class ihipCtxCriticalBase_t : LockedBase // Peer Accessor classes: - bool isPeer(const ihipCtx_t *peer); // returns True if peer has access to memory physically located on this device. - bool addPeer(ihipCtx_t *peer); - bool removePeer(ihipCtx_t *peer); - void resetPeers(ihipCtx_t *thisDevice); - void printPeers(FILE *f) const; + bool isPeerWatcher(const ihipCtx_t *peer); // returns True if peer has access to memory physically located on this device. + bool addPeerWatcher(const ihipCtx_t *thisCtx, ihipCtx_t *peer); + bool removePeerWatcher(const ihipCtx_t *thisCtx, ihipCtx_t *peer); + void resetPeerWatchers(ihipCtx_t *thisDevice); + void printPeerWatchers(FILE *f) const; uint32_t peerCnt() const { return _peerCnt; }; hsa_agent_t *peerAgents() const { return _peerAgents; }; @@ -750,7 +749,7 @@ inline std::ostream& operator<<(std::ostream& os, const hipEvent_t& e) inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) { os << "ctx:" << static_cast (c) - << " dev:" << c->getDevice()->_deviceId; + << ".dev:" << c->getDevice()->_deviceId; return os; } diff --git a/src/hip_peer.cpp b/src/hip_peer.cpp index e66a0d2971..95ea4719a9 100644 --- a/src/hip_peer.cpp +++ b/src/hip_peer.cpp @@ -35,22 +35,22 @@ THE SOFTWARE. // public APIs are thin wrappers which call into this internal implementations. // TODO - actually not yet - currently the integer deviceId flavors just call the context APIs. need to fix. -/** - * HCC returns 0 in *canAccessPeer ; Need to update this function when RT supports P2P - */ -//--- -hipError_t hipDeviceCanAccessPeer (int* canAccessPeer, hipCtx_t thisCtx, hipCtx_t peerCtx) -{ - HIP_INIT_API(canAccessPeer, thisCtx, peerCtx); + +hipError_t ihipDeviceCanAccessPeer (int* canAccessPeer, hipCtx_t thisCtx, hipCtx_t peerCtx) +{ hipError_t err = hipSuccess; if ((thisCtx != NULL) && (peerCtx != NULL)) { if (thisCtx == peerCtx) { *canAccessPeer = 0; + tprintf(DB_COPY, "Can't be peer to self. (this=%s, peer=%s)\n", + thisCtx->toString().c_str(), peerCtx->toString().c_str()); } else { - *canAccessPeer = peerCtx->getDevice()->_acc.get_is_peer(thisCtx->getDevice()->_acc); + *canAccessPeer = peerCtx->getDevice()->_acc.get_is_peer(thisCtx->getDevice()->_acc); + tprintf(DB_COPY, "deviceCanAccessPeer this=%s peer=%s canAccessPeer=%d\n", + thisCtx->toString().c_str(), peerCtx->toString().c_str(), *canAccessPeer); } } else { @@ -58,8 +58,19 @@ hipError_t hipDeviceCanAccessPeer (int* canAccessPeer, hipCtx_t thisCtx, hipCtx_ err = hipErrorInvalidDevice; } + return err; +} + + +/** + * HCC returns 0 in *canAccessPeer ; Need to update this function when RT supports P2P + */ +//--- +hipError_t hipDeviceCanAccessPeer (int* canAccessPeer, hipCtx_t thisCtx, hipCtx_t peerCtx) +{ + HIP_INIT_API(canAccessPeer, thisCtx, peerCtx); - return ihipLogStatus(err); + return ihipLogStatus(ihipDeviceCanAccessPeer(canAccessPeer, thisCtx, peerCtx)); } @@ -80,8 +91,10 @@ hipError_t ihipDisablePeerAccess (hipCtx_t peerCtx) err = hipErrorInvalidDevice; // Can't disable peer access to self. } else { LockedAccessor_CtxCrit_t peerCrit(peerCtx->criticalData()); - bool changed = peerCrit->removePeer(thisCtx); + bool changed = peerCrit->removePeerWatcher(peerCtx, thisCtx); if (changed) { + tprintf(DB_COPY, "device %s disable access to memory allocated on peer:%s\n", + thisCtx->toString().c_str(), peerCtx->toString().c_str()); // Update the peers for all memory already saved in the tracker: am_memtracker_update_peers(peerCtx->getDevice()->_acc, peerCrit->peerCnt(), peerCrit->peerAgents()); } else { @@ -112,8 +125,10 @@ hipError_t ihipEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags) } else if ((thisCtx != NULL) && (peerCtx != NULL)) { LockedAccessor_CtxCrit_t peerCrit(peerCtx->criticalData()); // Add thisCtx to peerCtx's access list so that new allocations on peer will be made visible to this device: - bool isNewPeer = peerCrit->addPeer(thisCtx); + bool isNewPeer = peerCrit->addPeerWatcher(peerCtx, thisCtx); if (isNewPeer) { + tprintf(DB_COPY, "device=%s can now see all memory allocated on peer=%s\n", + thisCtx->toString().c_str(), peerCtx->toString().c_str()); am_memtracker_update_peers(peerCtx->getDevice()->_acc, peerCrit->peerCnt(), peerCrit->peerAgents()); } else { err = hipErrorPeerAccessAlreadyEnabled; @@ -158,7 +173,7 @@ hipError_t hipMemcpyPeerAsync (void* dst, hipCtx_t dstDevice, const void* src, h hipError_t hipDeviceCanAccessPeer (int* canAccessPeer, int deviceId, int peerDeviceId) { HIP_INIT_API(canAccessPeer, deviceId, peerDeviceId); - return hipDeviceCanAccessPeer(canAccessPeer, ihipGetPrimaryCtx(deviceId), ihipGetPrimaryCtx(peerDeviceId)); + return ihipDeviceCanAccessPeer(canAccessPeer, ihipGetPrimaryCtx(deviceId), ihipGetPrimaryCtx(peerDeviceId)); } From 5d793848329581c74fe941e3a2cffc303625eee2 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 4 Nov 2016 09:37:56 -0500 Subject: [PATCH 37/65] Refactor resolve-mem step1 Change-Id: I7b8b2bbb56d7b31a97b48ebd42002641cd07a460 --- src/hip_hcc.cpp | 103 +++++++++++++++++++++++++++--------------------- src/hip_hcc.h | 4 +- 2 files changed, 61 insertions(+), 46 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 74b0fcbd28..d74307918e 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -1759,7 +1759,7 @@ bool ihipStream_t::canSeePeerMemory(const ihipCtx_t *thisCtx, ihipCtx_t *dstCtx, #define CASE_STRING(X) case X: return #X ;break; -const char* memcpyStr(unsigned memKind) +const char* hipMemcpyStr(unsigned memKind) { switch (memKind) { CASE_STRING(hipMemcpyHostToHost); @@ -1771,38 +1771,71 @@ const char* memcpyStr(unsigned memKind) }; } +const char* hcMemcpyStr(hc::hcCommandKind memKind) +{ + using namespace hc; + switch (memKind) { + CASE_STRING(hcMemcpyHostToHost); + CASE_STRING(hcMemcpyHostToDevice); + CASE_STRING(hcMemcpyDeviceToHost); + CASE_STRING(hcMemcpyDeviceToDevice); + //CASE_STRING(hcMemcpyDefault); + default : return ("unknown memcpyKind"); + }; +} + // Resolve hipMemcpyDefault to a known type. // TODO - review why is this so complicated, does this need srcTracked and dstTracked? -unsigned ihipStream_t::resolveMemcpyDirection(bool srcTracked, bool dstTracked, bool srcInDeviceMem, bool dstInDeviceMem) +unsigned ihipStream_t::resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem) { hipMemcpyKind kind = hipMemcpyDefault; - if(!srcTracked && !dstTracked) - { - kind = hipMemcpyHostToHost; - } - if(!srcTracked && dstTracked) - { - if(dstInDeviceMem) { kind = hipMemcpyHostToDevice; } - else{ kind = hipMemcpyHostToHost; } - } - if (srcTracked && !dstTracked) { - if(srcInDeviceMem) { kind = hipMemcpyDeviceToHost; } - else { kind = hipMemcpyHostToHost; } - } - if (srcTracked && dstTracked) { - if(srcInDeviceMem && dstInDeviceMem) { kind = hipMemcpyDeviceToDevice; } - if(srcInDeviceMem && !dstInDeviceMem) { kind = hipMemcpyDeviceToHost; } - if(!srcInDeviceMem && !dstInDeviceMem) { kind = hipMemcpyHostToHost; } - if(!srcInDeviceMem && dstInDeviceMem) { kind = hipMemcpyHostToDevice; } - } + + if( srcInDeviceMem && dstInDeviceMem) { kind = hipMemcpyDeviceToDevice; } + if( srcInDeviceMem && !dstInDeviceMem) { kind = hipMemcpyDeviceToHost; } + if(!srcInDeviceMem && !dstInDeviceMem) { kind = hipMemcpyHostToHost; } + if(!srcInDeviceMem && dstInDeviceMem) { kind = hipMemcpyHostToDevice; } assert (kind != hipMemcpyDefault); return kind; } +// hipMemKind must be "resolved" to a specific direction - cannot be default. +void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo, + hc::hcCommandKind *hcCopyDir, bool *forceHostCopyEngine) +{ + ihipCtx_t *ctx = this->getCtx(); + + if (hipMemKind == hipMemcpyDefault) { + hipMemKind = resolveMemcpyDirection(srcPtrInfo->_isInDeviceMem, dstPtrInfo->_isInDeviceMem); + } + + switch (hipMemKind) { + case hipMemcpyHostToHost: *hcCopyDir = hc::hcMemcpyHostToHost; break; + case hipMemcpyHostToDevice: *hcCopyDir = hc::hcMemcpyHostToDevice; break; + case hipMemcpyDeviceToHost: *hcCopyDir = hc::hcMemcpyDeviceToHost; break; + case hipMemcpyDeviceToDevice: *hcCopyDir = hc::hcMemcpyDeviceToDevice; break; + default: throw ihipException(hipErrorRuntimeOther); + }; + + + // If this is P2P access, we need to check to see if the copy agent (specified by the stream where the copy is enqueued) + // has peer access enabled to both the source and dest. If this is true, then the copy agent can see both pointers + // and we can perform the access with the copy engine from the current stream. If not true, then we will copy through the host. (*forceHostCopyEngine=true). + *forceHostCopyEngine = false; + if (*hcCopyDir == hc::hcMemcpyDeviceToDevice) { + if (!canSeePeerMemory(ctx, ihipGetPrimaryCtx(dstPtrInfo->_appId), ihipGetPrimaryCtx(srcPtrInfo->_appId))) { + *forceHostCopyEngine = true; + tprintf (DB_COPY, "Forcing use of host copy engine.\n"); + } else { + tprintf (DB_COPY, "Will use SDMA engine on streamDevice=%s.\n", ctx->toString().c_str()); + } + }; +} + + // TODO - remove kind parm from here or use it below? void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, unsigned kind, bool resolveOn) { @@ -1819,33 +1852,13 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, bool dstTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) == AM_SUCCESS); bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS); - if (kind == hipMemcpyDefault) { - kind = resolveMemcpyDirection(srcTracked, dstTracked, srcPtrInfo._isInDeviceMem, dstPtrInfo._isInDeviceMem); - } - hc::hcCommandKind hcCopyDir; - switch (kind) { - case hipMemcpyHostToHost: hcCopyDir = hc::hcMemcpyHostToHost; break; - case hipMemcpyHostToDevice: hcCopyDir = hc::hcMemcpyHostToDevice; break; - case hipMemcpyDeviceToHost: hcCopyDir = hc::hcMemcpyDeviceToHost; break; - case hipMemcpyDeviceToDevice: hcCopyDir = hc::hcMemcpyDeviceToDevice; break; - default: throw ihipException(hipErrorRuntimeOther); - }; + hc::hcCommandKind hcCopyDir; + bool forceHostCopyEngine; + resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, &forceHostCopyEngine); - // If this is P2P access, we need to check to see if the copy agent (specified by the stream where the copy is enqueued) - // has peer access enabled to both the source and dest. If this is true, then the copy agent can see both pointers - // and we can perform the access with the copy engine from the current stream. If not true, then we will copy through the host. (forceHostCopyEngine=true). - bool forceHostCopyEngine = false; - if (hcCopyDir == hc::hcMemcpyDeviceToDevice) { - if (!canSeePeerMemory(ctx, ihipGetPrimaryCtx(dstPtrInfo._appId), ihipGetPrimaryCtx(srcPtrInfo._appId))) { - forceHostCopyEngine = true; - tprintf (DB_COPY, "Forcing use of host copy engine.\n"); - } else { - tprintf (DB_COPY, "Will use SDMA engine on streamDevice=%s.\n", ctx->toString().c_str()); - } - }; - tprintf (DB_COPY, "locked_copy dir=%s dst=%p src=%p sz=%zu\n", memcpyStr(kind), src, dst, sizeBytes); + tprintf (DB_COPY, "locked_copy dir=%s dst=%p src=%p sz=%zu\n", hcMemcpyStr(hcCopyDir), src, dst, sizeBytes); { LockedAccessor_StreamCrit_t crit (_criticalData); diff --git a/src/hip_hcc.h b/src/hip_hcc.h index ad22789fda..ecec00485d 100644 --- a/src/hip_hcc.h +++ b/src/hip_hcc.h @@ -489,7 +489,9 @@ class ihipStream_t { // The unsigned return is hipMemcpyKind - unsigned resolveMemcpyDirection(bool srcTracked, bool dstTracked, bool srcInDeviceMem, bool dstInDeviceMem); + unsigned resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem); + void resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo, + hc::hcCommandKind *hcCopyDir, bool *forceHostCopyEngine); bool canSeePeerMemory(const ihipCtx_t *thisCtx, ihipCtx_t *dstCtx, ihipCtx_t *srcCtx); From ff3298f40ec7e7490f77bd6738a6ba4ccd68df94 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 4 Nov 2016 16:13:32 -0500 Subject: [PATCH 38/65] Expand hipP2PSimple testing. Cover cases where P2P is used for H2D copies, where host is pinned but not accessible to the copy agent. Change-Id: I9464b787228b40f93473708c3fde9726e1986365 --- tests/src/hipPeerToPeer_simple.cpp | 123 ++++++++++++++++++++++++++--- 1 file changed, 111 insertions(+), 12 deletions(-) diff --git a/tests/src/hipPeerToPeer_simple.cpp b/tests/src/hipPeerToPeer_simple.cpp index 2c1a3cc339..c2fc2e065c 100644 --- a/tests/src/hipPeerToPeer_simple.cpp +++ b/tests/src/hipPeerToPeer_simple.cpp @@ -50,6 +50,16 @@ void help(char *argv[]) }; +static hipError_t myHipMemcpy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream, bool async) +{ + if (async) { + return hipMemcpyAsync(dest, src, sizeBytes, kind, stream); + } else { + return hipMemcpy(dest, src, sizeBytes, kind); + }; +} + + void parseMyArguments(int argc, char *argv[]) { int more_argc = HipTest::parseStandardArguments(argc, argv, false); @@ -104,9 +114,9 @@ void setupPeerTests() //--- // Test which enables peer2peer first, then allocates the memory. -void enablePeerFirst() +void enablePeerFirst(bool useAsyncCopy) { - printf ("\n==testing: %s\n", __func__); + printf ("\n==testing: %s useAsyncCopy=%d\n", __func__, useAsyncCopy); setupPeerTests(); @@ -147,11 +157,11 @@ void enablePeerFirst() // NOTE : if p_mirrorPeers=0 and p_memcpyWithPeer=1, then peer device does not have mapping for A_d1 and we need to use a // a host staging copy for the P2P access. HIPCHECK (hipSetDevice(p_memcpyWithPeer ? g_peerDevice : g_currentDevice)); - HIPCHECK (hipMemcpy(A_d1, A_d0, Nbytes, hipMemcpyDefault)); // This is P2P copy. + HIPCHECK (myHipMemcpy(A_d1, A_d0, Nbytes, hipMemcpyDefault, 0/*stream*/, useAsyncCopy)); // This is P2P copy. // Copy data back to host: HIPCHECK (hipSetDevice(g_peerDevice)); - HIPCHECK (hipMemcpy(A_h, A_d1, Nbytes, hipMemcpyDeviceToHost)); + HIPCHECK (myHipMemcpy(A_h, A_d1, Nbytes, hipMemcpyDeviceToHost, 0/*stream*/, useAsyncCopy)); // Check host data: for (int i=0; i Date: Fri, 4 Nov 2016 18:40:10 -0500 Subject: [PATCH 39/65] Set forceHostCopyEngine for other copy dirs. Support HIP_FORCE_P2P_HOST Also: more debug for copy and P2p. Change-Id: I87030c525410e041b2a00baaf6c68e6c0977ff42 --- src/hip_hcc.cpp | 43 ++++++++++++++++++++++++++++--------------- src/hip_hcc.h | 1 + src/hip_memory.cpp | 28 +++++++++++++++++++--------- 3 files changed, 48 insertions(+), 24 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index d74307918e..d3f87a15c9 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -1141,7 +1141,7 @@ void ihipReadEnv_Callback(void *var_ptr, const char *var_name1, const char *var_ env = getenv(var_name2); } - std::string var_string = "TBD"; + std::string var_string = "0"; if (env) { var_string = setterCallback(var_ptr, env); } @@ -1828,9 +1828,14 @@ void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPoi if (*hcCopyDir == hc::hcMemcpyDeviceToDevice) { if (!canSeePeerMemory(ctx, ihipGetPrimaryCtx(dstPtrInfo->_appId), ihipGetPrimaryCtx(srcPtrInfo->_appId))) { *forceHostCopyEngine = true; - tprintf (DB_COPY, "Forcing use of host copy engine.\n"); + tprintf (DB_COPY, "P2P D2D : copy engine cannot see both host and device pointers - forcing copy through staging buffers.\n"); } else { - tprintf (DB_COPY, "Will use SDMA engine on streamDevice=%s.\n", ctx->toString().c_str()); + if (HIP_FORCE_P2P_HOST ) { + *forceHostCopyEngine = true; + tprintf (DB_COPY, "P2P D2D. Copy engine can see src and dst but HIP_FORCE_P2P_HOST=0, forcing copy through staging buffers.\n"); + } else { + tprintf (DB_COPY, "P2P D2D. Copy engine can see src and dst, Will use SDMA engine on streamDevice=%s.\n", ctx->toString().c_str()); + } } }; } @@ -1858,15 +1863,19 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, &forceHostCopyEngine); - tprintf (DB_COPY, "locked_copy dir=%s dst=%p src=%p sz=%zu\n", hcMemcpyStr(hcCopyDir), src, dst, sizeBytes); { LockedAccessor_StreamCrit_t crit (_criticalData); #if DISABLE_COPY_EXT #warning ("Disabled copy_ext path, P2P host staging copies will not work") + tprintf (DB_COPY, "copySync copyEngine_dev:%d dst=%p(home_dev:%d) src=%p(home_dev:%d) sz=%zu dstTracked=%d srcTracked=%d dir=%s forceHostCopyEngine=%d. Call HCC copy\n", + ctx->getDeviceNum(), dst, dstPtrInfo._appId, src, srcPtrInfo._appId, sizeBytes, dstTracked, srcTracked, hcMemcpyStr(hcCopyDir), forceHostCopyEngine); // Note - peer-to-peer copies which require host staging will not work in this path. crit->_av.copy(src, dst, sizeBytes); #else + // If srcTracked == dstTracked =1 and forceHostCopyEngine=0 then we wil use async SDMA. (assuming HCC implementation doesn't override somehow) + tprintf (DB_COPY, "copySync copyEngine_dev:%d dst=%p(home_dev:%d) src=%p(home_dev:%d) sz=%zu dstTracked=%d srcTracked=%d dir=%s forceHostCopyEngine=%d. Call HCC copy_ext.\n", + ctx->getDeviceNum(), dst, dstPtrInfo._appId, src, srcPtrInfo._appId, sizeBytes, dstTracked, srcTracked, hcMemcpyStr(hcCopyDir), forceHostCopyEngine); crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceHostCopyEngine); #endif } @@ -1904,19 +1913,18 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS); - bool copyEngineCanSeeSrcAndDest = true; - if ((kind == hipMemcpyDeviceToDevice) || - ((kind == hipMemcpyDefault) && srcTracked && dstTracked)) { - copyEngineCanSeeSrcAndDest = canSeePeerMemory(ctx, ihipGetPrimaryCtx(dstPtrInfo._appId), ihipGetPrimaryCtx(srcPtrInfo._appId)); - } + hc::hcCommandKind hcCopyDir; + bool forceHostCopyEngine; + resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, &forceHostCopyEngine); - tprintf (DB_COPY, "locked_copyAsync: async memcpy dstTracked=%d srcTracked=%d copyEngineCanSeeSrcAndDest=%d\n", - dstTracked, srcTracked, copyEngineCanSeeSrcAndDest); + + tprintf (DB_COPY, "copyAsync dst=%p(home_dev:%d) src=%p(home_dev:%d) sz=%zu dstTracked=%d srcTracked=%d dir=%s forceHostCopyEngine=%d\n", + dst, dstPtrInfo._appId, src, srcPtrInfo._appId, sizeBytes, dstTracked, srcTracked, hcMemcpyStr(hcCopyDir), forceHostCopyEngine); // "tracked" really indicates if the pointer's virtual address is available in the GPU address space. // If both pointers are not tracked, we need to fall back to a sync copy. - if (dstTracked && srcTracked && copyEngineCanSeeSrcAndDest) { + if (dstTracked && srcTracked && !forceHostCopyEngine) { LockedAccessor_StreamCrit_t crit(_criticalData); // Perform asynchronous copy: @@ -1933,9 +1941,14 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes } } else { - // TODO - call copy_ext directly here? - locked_copySync(dst, src, sizeBytes, kind); - //crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceHostCopyEngine); + LockedAccessor_StreamCrit_t crit(_criticalData); +#if DISABLE_COPY_EXT +#warning ("Disabled copy_ext path, P2P host staging copies will not work") + // Note - peer-to-peer copies which require host staging will not work in this path. + crit->_av.copy(src, dst, sizeBytes); +#else + crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceHostCopyEngine); +#endif } } } diff --git a/src/hip_hcc.h b/src/hip_hcc.h index ecec00485d..30512a9bd2 100644 --- a/src/hip_hcc.h +++ b/src/hip_hcc.h @@ -657,6 +657,7 @@ class ihipCtx_t ihipCtxCritical_t &criticalData() { return _criticalData; }; // TODO, move private. Fix P2P. const ihipDevice_t *getDevice() const { return _device; }; + int getDeviceNum() const { return _device->_deviceId; }; // TODO - review uses of getWriteableDevice(), can these be converted to getDevice() ihipDevice_t *getWriteableDevice() const { return _device; }; diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 4f45370c5a..85b22d86d0 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -123,23 +123,26 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) hip_status = hipErrorMemoryAllocation; } else { hc::am_memtracker_update(*ptr, device->_deviceId, 0); + int peerCnt=0; { LockedAccessor_CtxCrit_t crit(ctx->criticalData()); // the peerCnt always stores self so make sure the trace actually - if (crit->peerCnt() > 1) { + peerCnt = crit->peerCnt(); + if (peerCnt > 1) { hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr); } } + tprintf(DB_MEM, " allocated %p (size=%zu) on dev:%d and allowed %d other peer(s) access\n", *ptr, sizeBytes, device->_deviceId, peerCnt-1); } } else { hip_status = hipErrorMemoryAllocation; } - //printf (" hipMalloc allocated %p\n", *ptr); return ihipLogStatus(hip_status); } + hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { HIP_INIT_API(ptr, sizeBytes, flags); @@ -153,26 +156,28 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) auto device = ctx->getWriteableDevice(); if((flags == hipHostMallocDefault)|| (flags == hipHostMallocPortable)){ *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned); - if(sizeBytes < 1 && (*ptr == NULL)){ + if (sizeBytes < 1 && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } else { hc::am_memtracker_update(*ptr, device->_deviceId, amHostPinned); } - tprintf(DB_MEM, " %s: pinned ptr=%p\n", __func__, *ptr); - } else if(flags & hipHostMallocMapped){ + tprintf(DB_MEM, "allocated pinned host ptr=%p on dev=%d\n", *ptr, device->_deviceId); + } else if(flags & hipHostMallocMapped) { *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned); - if(sizeBytes && (*ptr == NULL)){ + if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; - }else{ + } else { hc::am_memtracker_update(*ptr, device->_deviceId, flags); + int peerCnt=0; { LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - if (crit->peerCnt()) { + peerCnt = crit->peerCnt(); + if (peerCnt) { hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr); } } + tprintf(DB_MEM, "allocated pinned host ptr=%p on dev=%d, allow access to %d peer(s)\n", *ptr, device->_deviceId, peerCnt); } - tprintf(DB_MEM, " %s: pinned ptr=%p\n", __func__, *ptr); } } return ihipLogStatus(hip_status); @@ -355,6 +360,8 @@ hipError_t hipHostRegister(void *hostPtr, size_t sizeBytes, unsigned int flags) vecAcc.push_back(ihipGetDevice(i)->_acc); } am_status = hc::am_memory_host_lock(device->_acc, hostPtr, sizeBytes, &vecAcc[0], vecAcc.size()); + + tprintf(DB_MEM, " %s registered ptr=%p\n", __func__, hostPtr); if(am_status == AM_SUCCESS){ hip_status = hipSuccess; } else { @@ -378,6 +385,7 @@ hipError_t hipHostUnregister(void *hostPtr) }else{ auto device = ctx->getWriteableDevice(); am_status_t am_status = hc::am_memory_host_unlock(device->_acc, hostPtr); + tprintf(DB_MEM, " %s unregistered ptr=%p\n", __func__, hostPtr); if(am_status != AM_SUCCESS){ hip_status = hipErrorHostMemoryNotRegistered; } @@ -399,6 +407,7 @@ hipError_t hipMemcpyToSymbol(const char* symbolName, const void *src, size_t cou hc::accelerator acc = ctx->getDevice()->_acc; void *ptr = acc.get_symbol_address(symbolName); + tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, ptr); if(ptr == nullptr) { @@ -428,6 +437,7 @@ hipError_t hipMemcpyToSymbolAsync(const char* symbolName, const void *src, size_ hc::accelerator acc = ctx->getDevice()->_acc; void *ptr = acc.get_symbol_address(symbolName); + tprintf(DB_MEM, " symbol '%s' resolved to address:%p\n", symbolName, ptr); if(ptr == nullptr) { From d728819d1773e5a124a9af0eeb56d224049d7973 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sun, 6 Nov 2016 03:22:36 -0600 Subject: [PATCH 40/65] Improve Peer support and testing. Change-Id: Icadc65988aaf145a265587ab0357c5bf4d26f3eb --- src/hip_hcc.cpp | 99 ++++++++++++++++-------------- src/hip_hcc.h | 2 +- tests/src/hipPeerToPeer_simple.cpp | 41 +++++++++++-- 3 files changed, 91 insertions(+), 51 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index d3f87a15c9..278ada0c94 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -1731,26 +1731,35 @@ void ihipSetTs(hipEvent_t e) // Returns true if thisCtx can see the memory allocated on dstCtx and srcCtx. // The peer-list for a context controls which contexts have access to the memory allocated on that context. // So we check dstCtx's and srcCtx's peerList to see if the both include thisCtx. -bool ihipStream_t::canSeePeerMemory(const ihipCtx_t *thisCtx, ihipCtx_t *dstCtx, ihipCtx_t *srcCtx) -{ - tprintf (DB_COPY, "Checking if direct copy can be used. thisCtx:%s; dstCtx:%s ; srcCtx:%s\n", - thisCtx->toString().c_str(), dstCtx->toString().c_str(), srcCtx->toString().c_str()); - - // Use blocks to control scope of critical sections. - { - LockedAccessor_CtxCrit_t ctxCrit(dstCtx->criticalData()); - tprintf(DB_SYNC, "dstCrit lock succeeded\n"); - if (!ctxCrit->isPeerWatcher(thisCtx)) { - return false; - }; +bool ihipStream_t::canSeePeerMemory(const ihipCtx_t *thisCtx, const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo) +{ + + if (dstPtrInfo->_appId != -1) { + // TODO - pointer-info stores a deviceID not a context,may have some unusual side-effects here: + ihipCtx_t *dstCtx = ihipGetPrimaryCtx(dstPtrInfo->_appId); + if (thisCtx != dstCtx) { + // Only checks peer list if contexts are different + LockedAccessor_CtxCrit_t ctxCrit(dstCtx->criticalData()); + //tprintf(DB_SYNC, "dstCrit lock succeeded\n"); + if (!ctxCrit->isPeerWatcher(thisCtx)) { + return false; + }; + } } + - { - LockedAccessor_CtxCrit_t ctxCrit(srcCtx->criticalData()); - tprintf(DB_SYNC, "srcCrit lock succeeded\n"); - if (!ctxCrit->isPeerWatcher(thisCtx)) { - return false; - }; + + if (srcPtrInfo->_appId != -1) { + // TODO - pointer-info stores a deviceID not a context,may have some unusual side-effects here: + ihipCtx_t *srcCtx = ihipGetPrimaryCtx(srcPtrInfo->_appId); + if (thisCtx != srcCtx) { + // Only checks peer list if contexts are different + LockedAccessor_CtxCrit_t ctxCrit(srcCtx->criticalData()); + //tprintf(DB_SYNC, "srcCrit lock succeeded\n"); + if (!ctxCrit->isPeerWatcher(thisCtx)) { + return false; + }; + } } return true; @@ -1804,7 +1813,7 @@ unsigned ihipStream_t::resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDev // hipMemKind must be "resolved" to a specific direction - cannot be default. void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo, - hc::hcCommandKind *hcCopyDir, bool *forceHostCopyEngine) + hc::hcCommandKind *hcCopyDir, bool *forceP2PCopyEngine) { ihipCtx_t *ctx = this->getCtx(); @@ -1823,21 +1832,19 @@ void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPoi // If this is P2P access, we need to check to see if the copy agent (specified by the stream where the copy is enqueued) // has peer access enabled to both the source and dest. If this is true, then the copy agent can see both pointers - // and we can perform the access with the copy engine from the current stream. If not true, then we will copy through the host. (*forceHostCopyEngine=true). - *forceHostCopyEngine = false; - if (*hcCopyDir == hc::hcMemcpyDeviceToDevice) { - if (!canSeePeerMemory(ctx, ihipGetPrimaryCtx(dstPtrInfo->_appId), ihipGetPrimaryCtx(srcPtrInfo->_appId))) { - *forceHostCopyEngine = true; - tprintf (DB_COPY, "P2P D2D : copy engine cannot see both host and device pointers - forcing copy through staging buffers.\n"); + // and we can perform the access with the copy engine from the current stream. If not true, then we will copy through the host. (*forceP2PCopyEngine=true). + *forceP2PCopyEngine = false; + if (!canSeePeerMemory(ctx, dstPtrInfo, srcPtrInfo)) { + *forceP2PCopyEngine = true; + tprintf (DB_COPY, "P2P: copy engine(dev:%d) cannot see both host and device pointers - forcing copy through staging buffers.\n", ctx->getDeviceNum()); + } else { + if (HIP_FORCE_P2P_HOST ) { + *forceP2PCopyEngine = true; + tprintf (DB_COPY, "P2P. Copy engine (dev:%d) can see src and dst but HIP_FORCE_P2P_HOST=0, forcing copy through staging buffers.\n", ctx->getDeviceNum()); } else { - if (HIP_FORCE_P2P_HOST ) { - *forceHostCopyEngine = true; - tprintf (DB_COPY, "P2P D2D. Copy engine can see src and dst but HIP_FORCE_P2P_HOST=0, forcing copy through staging buffers.\n"); - } else { - tprintf (DB_COPY, "P2P D2D. Copy engine can see src and dst, Will use SDMA engine on streamDevice=%s.\n", ctx->toString().c_str()); - } + tprintf (DB_COPY, "P2P. Copy engine (dev:%d) can see src and dst.\n", ctx->getDeviceNum()); } - }; + } } @@ -1859,8 +1866,8 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, hc::hcCommandKind hcCopyDir; - bool forceHostCopyEngine; - resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, &forceHostCopyEngine); + bool forceP2PCopyEngine; + resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, &forceP2PCopyEngine); @@ -1868,15 +1875,15 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, LockedAccessor_StreamCrit_t crit (_criticalData); #if DISABLE_COPY_EXT #warning ("Disabled copy_ext path, P2P host staging copies will not work") - tprintf (DB_COPY, "copySync copyEngine_dev:%d dst=%p(home_dev:%d) src=%p(home_dev:%d) sz=%zu dstTracked=%d srcTracked=%d dir=%s forceHostCopyEngine=%d. Call HCC copy\n", - ctx->getDeviceNum(), dst, dstPtrInfo._appId, src, srcPtrInfo._appId, sizeBytes, dstTracked, srcTracked, hcMemcpyStr(hcCopyDir), forceHostCopyEngine); + tprintf (DB_COPY, "copySync copyEngine_dev:%d dst=%p(home_dev:%d) src=%p(home_dev:%d) sz=%zu dstTracked=%d srcTracked=%d dir=%s forceP2PCopyEngine=%d. Call HCC copy\n", + ctx->getDeviceNum(), dst, dstPtrInfo._appId, src, srcPtrInfo._appId, sizeBytes, dstTracked, srcTracked, hcMemcpyStr(hcCopyDir), forceP2PCopyEngine); // Note - peer-to-peer copies which require host staging will not work in this path. crit->_av.copy(src, dst, sizeBytes); #else - // If srcTracked == dstTracked =1 and forceHostCopyEngine=0 then we wil use async SDMA. (assuming HCC implementation doesn't override somehow) - tprintf (DB_COPY, "copySync copyEngine_dev:%d dst=%p(home_dev:%d) src=%p(home_dev:%d) sz=%zu dstTracked=%d srcTracked=%d dir=%s forceHostCopyEngine=%d. Call HCC copy_ext.\n", - ctx->getDeviceNum(), dst, dstPtrInfo._appId, src, srcPtrInfo._appId, sizeBytes, dstTracked, srcTracked, hcMemcpyStr(hcCopyDir), forceHostCopyEngine); - crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceHostCopyEngine); + // If srcTracked == dstTracked =1 and forceP2PCopyEngine=0 then we wil use async SDMA. (assuming HCC implementation doesn't override somehow) + tprintf (DB_COPY, "copySync copyEngine_dev:%d dst=%p(home_dev:%d) src=%p(home_dev:%d) sz=%zu dstTracked=%d srcTracked=%d dir=%s forceP2PCopyEngine=%d. Call HCC copy_ext.\n", + ctx->getDeviceNum(), dst, dstPtrInfo._appId, src, srcPtrInfo._appId, sizeBytes, dstTracked, srcTracked, hcMemcpyStr(hcCopyDir), forceP2PCopyEngine); + crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceP2PCopyEngine); #endif } } @@ -1914,17 +1921,17 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes hc::hcCommandKind hcCopyDir; - bool forceHostCopyEngine; - resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, &forceHostCopyEngine); + bool forceP2PCopyEngine; + resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, &forceP2PCopyEngine); - tprintf (DB_COPY, "copyAsync dst=%p(home_dev:%d) src=%p(home_dev:%d) sz=%zu dstTracked=%d srcTracked=%d dir=%s forceHostCopyEngine=%d\n", - dst, dstPtrInfo._appId, src, srcPtrInfo._appId, sizeBytes, dstTracked, srcTracked, hcMemcpyStr(hcCopyDir), forceHostCopyEngine); + tprintf (DB_COPY, "copyAsync dst=%p(home_dev:%d) src=%p(home_dev:%d) sz=%zu dstTracked=%d srcTracked=%d dir=%s forceP2PCopyEngine=%d\n", + dst, dstPtrInfo._appId, src, srcPtrInfo._appId, sizeBytes, dstTracked, srcTracked, hcMemcpyStr(hcCopyDir), forceP2PCopyEngine); // "tracked" really indicates if the pointer's virtual address is available in the GPU address space. // If both pointers are not tracked, we need to fall back to a sync copy. - if (dstTracked && srcTracked && !forceHostCopyEngine) { + if (dstTracked && srcTracked && !forceP2PCopyEngine) { LockedAccessor_StreamCrit_t crit(_criticalData); // Perform asynchronous copy: @@ -1947,7 +1954,7 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes // Note - peer-to-peer copies which require host staging will not work in this path. crit->_av.copy(src, dst, sizeBytes); #else - crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceHostCopyEngine); + crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceP2PCopyEngine); #endif } } diff --git a/src/hip_hcc.h b/src/hip_hcc.h index 30512a9bd2..4ebf002a58 100644 --- a/src/hip_hcc.h +++ b/src/hip_hcc.h @@ -493,7 +493,7 @@ class ihipStream_t { void resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo, hc::hcCommandKind *hcCopyDir, bool *forceHostCopyEngine); - bool canSeePeerMemory(const ihipCtx_t *thisCtx, ihipCtx_t *dstCtx, ihipCtx_t *srcCtx); + bool canSeePeerMemory(const ihipCtx_t *thisCtx, const hc::AmPointerInfo *dstInfo, const hc::AmPointerInfo *srcInfo); private: // Data diff --git a/tests/src/hipPeerToPeer_simple.cpp b/tests/src/hipPeerToPeer_simple.cpp index c2fc2e065c..a0bf6abac1 100644 --- a/tests/src/hipPeerToPeer_simple.cpp +++ b/tests/src/hipPeerToPeer_simple.cpp @@ -53,7 +53,9 @@ void help(char *argv[]) static hipError_t myHipMemcpy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream, bool async) { if (async) { - return hipMemcpyAsync(dest, src, sizeBytes, kind, stream); + hipError_t e = hipMemcpyAsync(dest, src, sizeBytes, kind, stream); + //HIPCHECK(hipStreamSynchronize(stream)); + return (e); } else { return hipMemcpy(dest, src, sizeBytes, kind); }; @@ -84,6 +86,19 @@ void parseMyArguments(int argc, char *argv[]) }; }; +void syncBothDevices() +{ + int saveDevice; + HIPCHECK(hipGetDevice(&saveDevice)); + HIPCHECK(hipSetDevice(g_currentDevice)); + HIPCHECK(hipDeviceSynchronize()); + + HIPCHECK(hipSetDevice(g_peerDevice)); + HIPCHECK(hipDeviceSynchronize()); + + HIPCHECK(hipSetDevice(saveDevice)); +} + // Sets globals g_currentDevice, g_peerDevice void setupPeerTests() @@ -162,6 +177,9 @@ void enablePeerFirst(bool useAsyncCopy) // Copy data back to host: HIPCHECK (hipSetDevice(g_peerDevice)); HIPCHECK (myHipMemcpy(A_h, A_d1, Nbytes, hipMemcpyDeviceToHost, 0/*stream*/, useAsyncCopy)); + HIPCHECK(hipDeviceSynchronize()); + + HIPCHECK (hipSetDevice(g_currentDevice)); // Check host data: for (int i=0; i Date: Sun, 6 Nov 2016 04:07:51 -0600 Subject: [PATCH 41/65] Update gitignore for some common output files Change-Id: I9cd60f042af4dba07fe0fdbd2ee442936ff8c7bd --- .gitignore | 1 + samples/0_Intro/hcc_dialects/.gitignore | 5 +++++ samples/0_Intro/module_api/.gitignore | 5 +++++ 3 files changed, 11 insertions(+) create mode 100644 samples/0_Intro/hcc_dialects/.gitignore create mode 100644 samples/0_Intro/module_api/.gitignore diff --git a/.gitignore b/.gitignore index 67fa40f563..22cd23f2c6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .* +!.gitignore *.o *.exe *.swp diff --git a/samples/0_Intro/hcc_dialects/.gitignore b/samples/0_Intro/hcc_dialects/.gitignore new file mode 100644 index 0000000000..bce1cdf193 --- /dev/null +++ b/samples/0_Intro/hcc_dialects/.gitignore @@ -0,0 +1,5 @@ +vadd_amp_arrayview +vadd_hc_am +vadd_hc_array +vadd_hc_arrayview +vadd_hip diff --git a/samples/0_Intro/module_api/.gitignore b/samples/0_Intro/module_api/.gitignore new file mode 100644 index 0000000000..c1d81e043f --- /dev/null +++ b/samples/0_Intro/module_api/.gitignore @@ -0,0 +1,5 @@ +runKernel.hip.out +vcpy_isa.code +vcpy_isa.hsaco +vcpy_kernel.co +vcpy_kernel.code From 7106dc679616b984880e28f726e16f739beb044b Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sun, 6 Nov 2016 04:26:28 -0600 Subject: [PATCH 42/65] Fix tests to read warpSize from device props. Change-Id: I9583577793afad49f9eb1ee9069bd4c6963a6023 --- tests/src/deviceLib/hip_anyall.cpp | 14 +++++++++----- tests/src/deviceLib/hip_ballot.cpp | 8 +++++--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/src/deviceLib/hip_anyall.cpp b/tests/src/deviceLib/hip_anyall.cpp index aa025dad43..a562b7810e 100644 --- a/tests/src/deviceLib/hip_anyall.cpp +++ b/tests/src/deviceLib/hip_anyall.cpp @@ -45,11 +45,15 @@ int main(int argc, char *argv[]) { int warpSize, pshift; hipDeviceProp_t devProp; hipGetDeviceProperties(&devProp, 0); - if(strncmp(devProp.name,"Fiji",1)==0) -{ warpSize =64; - pshift =6; -} - else {warpSize =32; pshift=5;} + warpSize = devProp.warpSize; + + int w = warpSize; + pshift = 0; + while (w >>= 1) ++pshift; + + printf ("warpSize=%d pshift=%d\n", warpSize, pshift); + + int anycount =0; int allcount =0; int Num_Threads_per_Block = 1024; diff --git a/tests/src/deviceLib/hip_ballot.cpp b/tests/src/deviceLib/hip_ballot.cpp index d6df069351..236ceb57fe 100644 --- a/tests/src/deviceLib/hip_ballot.cpp +++ b/tests/src/deviceLib/hip_ballot.cpp @@ -48,9 +48,11 @@ int main(int argc, char *argv[]) hipDeviceProp_t devProp; hipGetDeviceProperties(&devProp, 0); - if(strncmp(devProp.name,"Fiji",1)==0) - {warpSize = 64; pshift =6;} - else {warpSize =32; pshift =5;} + warpSize = devProp.warpSize; + + int w = warpSize; + pshift = 0; + while (w >>= 1) ++pshift; unsigned int Num_Threads_per_Block = 512; unsigned int Num_Blocks_per_Grid = 1; From 7c68768a0d86a1efd5293b78126850e91486623e Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 8 Nov 2016 16:31:56 +0530 Subject: [PATCH 43/65] Update release notes for 1.0 release Change-Id: I74fa2b41afc334a76c309b125c27aa141cd59554 --- RELEASE.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/RELEASE.md b/RELEASE.md index d1f79bc3c6..a7c770f611 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -13,8 +13,25 @@ Upcoming: ## Revision History: +=================================================================================================== +Release:1.0 +Date: 2016.11.8 +- Initial implementation for FindHIP.cmake +- HIP library now installs as a static library by default +- Added support for HIP context and HIP module APIs +- Major changes to HIP signal & memory management implementation +- Support for complex data type and math functions +- clang-hipify is now known as hipify-clang +- Added several new HIP samples +- Preliminary support for new APIs: hipMemcpyToSymbol, hipDeviceGetLimit, hipRuntimeGetVersion +- Added support for async memcpy driver API (for example hipMemcpyHtoDAsync) +- Support for memory management device functions: malloc, free, memcpy & memset +- Removed deprecated HIP runtime header locations. Please include "hip/hip_runtime.h" instead of "hip_runtime.h". You can use `find . -type f -exec sed -i 's:#include "hip_runtime.h":#include "hip/hip_runtime.h":g' {} +` to replace all such references + + =================================================================================================== Release:0.92.00 +Date: 2016.8.14 - hipLaunchKernel supports one-dimensional grid and/or block dims, without explicit cast to dim3 type (actually in 0.90.00) - fp16 software support - Support for Hawaii dGPUs using environment variable ROCM_TARGET=hawaii From 76c3c20da6e8c446ab128f784bb5f5e33cb7e2d7 Mon Sep 17 00:00:00 2001 From: pensun Date: Wed, 9 Nov 2016 11:33:23 -0600 Subject: [PATCH 44/65] fix for hipcallback function on NV path Change-Id: If80c0cfe60b1f3b1a71627b5f3f79503cba4d491 --- include/hip/nvcc_detail/hip_runtime_api.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h index cb4a3f8676..9d2e12d8af 100644 --- a/include/hip/nvcc_detail/hip_runtime_api.h +++ b/include/hip/nvcc_detail/hip_runtime_api.h @@ -171,8 +171,8 @@ inline static cudaMemcpyKind hipMemcpyKindToCudaMemcpyKind(hipMemcpyKind kind) { /** * Stream CallBack struct */ -typedef void(* hipStreamCallback_t)(hipStream_t stream, hipError_t status, void* userData); - +#define HIPRT_CB CUDART_CB +typedef void(HIPRT_CB * hipStreamCallback_t)(hipStream_t stream, hipError_t status, void* userData); inline static hipError_t hipInit(unsigned int flags) { return hipCUResultTohipError(cuInit(flags)); @@ -589,8 +589,7 @@ inline static hipError_t hipStreamQuery(hipStream_t stream) inline static hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback, void *userData, unsigned int flags) { - return hipCUDAErrorTohipError(cudaStreamAddCallback(cudaStream_t stream, - cudaStreamCallback_t callback, void *userData, unsigned int flags)); + return hipCUDAErrorTohipError(cudaStreamAddCallback(stream, (cudaStreamCallback_t)callback, userData, flags)); } inline static hipError_t hipDriverGetVersion(int *driverVersion) From 57cd3c8244dc33bdaef92d562f607bc81f7d98e6 Mon Sep 17 00:00:00 2001 From: pensun Date: Wed, 9 Nov 2016 15:44:01 -0600 Subject: [PATCH 45/65] fix hipProfiler* apis on NV path Change-Id: I6adca6151fef3a9b35348163eb6bd13f5c414172 --- include/hip/nvcc_detail/hip_runtime_api.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h index 9d2e12d8af..924c44b79f 100644 --- a/include/hip/nvcc_detail/hip_runtime_api.h +++ b/include/hip/nvcc_detail/hip_runtime_api.h @@ -24,6 +24,7 @@ THE SOFTWARE. #include #include +#include #ifdef __cplusplus extern "C" { @@ -645,12 +646,12 @@ inline static hipError_t hipMemcpyPeerAsync ( void* dst, int dstDevice, const v // Profile APIs: inline hipError_t hipProfilerStart() { - return hipCUDAErrorTohipError(cudaProfileStart()); + return hipCUDAErrorTohipError(cudaProfilerStart()); } inline hipError_t hipProfilerStop() { - return hipCUDAErrorTohipError(cudaProfileStop()); + return hipCUDAErrorTohipError(cudaProfilerStop()); } From f7e9f12bf1081fba0558766ead0e73c34aeb7df2 Mon Sep 17 00:00:00 2001 From: pensun Date: Tue, 8 Nov 2016 01:01:26 -0600 Subject: [PATCH 46/65] Add option to alloc fingrained system memory Change-Id: Ia13c8e058cb988b5857e75a590a4d67411362ae1 --- src/hip_memory.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 85b22d86d0..e59e6c261d 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -154,6 +154,16 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) if(ctx){ // am_alloc requires writeable __acc, perhaps could be refactored? auto device = ctx->getWriteableDevice(); + // If HIP_COHERENT_HOST_ALLOC is defined, we always alloc coherent host system memroy + #ifdef HIP_COHERENT_HOST_ALLOC + *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned); + if(sizeBytes < 1 && (*ptr == NULL)){ + hip_status = hipErrorMemoryAllocation; + } else { + hc::am_memtracker_update(*ptr, device->_deviceId, amHostCoherent); + } + tprintf(DB_MEM, " %s: pinned ptr=%p\n", __func__, *ptr); + #else if((flags == hipHostMallocDefault)|| (flags == hipHostMallocPortable)){ *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned); if (sizeBytes < 1 && (*ptr == NULL)) { @@ -179,6 +189,7 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) tprintf(DB_MEM, "allocated pinned host ptr=%p on dev=%d, allow access to %d peer(s)\n", *ptr, device->_deviceId, peerCnt); } } + #endif //HIP_COHERENT_HOST_ALLOC } return ihipLogStatus(hip_status); } From 24c621db5bbd056f495ca7f809b2ac940015960f Mon Sep 17 00:00:00 2001 From: pensun Date: Wed, 9 Nov 2016 20:54:19 -0600 Subject: [PATCH 47/65] Add documentation on threadfence_system workaround guidelines. Change-Id: I9636a3808798f3dabe992285ce5652187cee6eb8 --- docs/markdown/hip_kernel_language.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/markdown/hip_kernel_language.md b/docs/markdown/hip_kernel_language.md index d629063f73..f84868987c 100644 --- a/docs/markdown/hip_kernel_language.md +++ b/docs/markdown/hip_kernel_language.md @@ -233,7 +233,11 @@ typedef struct dim3 { ## Memory-Fence Instructions HIP supports __threadfence() and __threadfence_block(). -Support for threadfence_system() is under development. +HIP provides workaround for threadfence_system() under HCC path. +To enable the workaround, HIP should be built with environment variable HIP_COHERENT_HOST_ALLOC enabled. +In addition,the kernels that use __threadfence_system() should be modified as follows: +- The kernel should only operate on finegrained system memory; which should be allocated with hipHostMalloc(). +- Remove all memcpy for those allocated finegrained system memory regions. ## Synchronization Functions The __syncthreads() built-in function is supported in HIP. The __syncthreads_count(int), __syncthreads_and(int) and __syncthreads_or(int) functions are under development. From 4d7ac1e09187bb35b68612643573a577a81027da Mon Sep 17 00:00:00 2001 From: pensun Date: Wed, 9 Nov 2016 21:38:43 -0600 Subject: [PATCH 48/65] resolve conflicts for git pull Change-Id: Ie353b831e2241bc28042069b6cc7405257e871e1 --- CMakeLists.txt | 10 ++++++++++ src/hip_hcc.h | 22 ++++++++++++++-------- src/hip_memory.cpp | 10 +++++----- 3 files changed, 29 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8958eaa090..59d3507c20 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -142,6 +142,16 @@ if(NOT DEFINED COMPILE_HIP_ATP_MARKER) endif() add_to_config(_buildInfo COMPILE_HIP_ATP_MARKER) +# Check if we need to force finegrained system memory allocation +if(NOT DEFINED HIP_COHERENT_HOST_ALLOC) + if(NOT DEFINED ENV{HIP_COHERENT_HOST_ALLOC}) + set(HIP_COHERENT_HOST_ALLOC 0) + else() + set(HIP_COHERENT_HOST_ALLOC $ENV{HIP_COHERENT_HOST_ALLOC}) + endif() +endif() +add_to_config(_buildInfo HIP_COHERENT_HOST_ALLOC) + ############################# # Build steps ############################# diff --git a/src/hip_hcc.h b/src/hip_hcc.h index 4ebf002a58..f18d68473d 100644 --- a/src/hip_hcc.h +++ b/src/hip_hcc.h @@ -75,7 +75,7 @@ class ShortTid { int _shortTid; // monotonically increasing API sequence number for this threa. - uint64_t _apiSeqNum; + uint64_t _apiSeqNum; }; struct ProfTrigger { @@ -155,6 +155,12 @@ extern const char *API_COLOR_END; #endif +// Compile code that force hipHostMalloc only allocates finegrained system memory. +#ifndef HIP_COHERENT_HOST_ALLOC +#define HIP_COHERENT_HOST_ALLOC 0 +#endif + + // Compile support for trace markers that are displayed on CodeXL GUI at start/stop of each function boundary. // TODO - currently we print the trace message at the beginning. if we waited, we could also include return codes, and any values returned @@ -169,8 +175,8 @@ extern const char *API_COLOR_END; // Swallow scoped markers: #define MARKER_BEGIN(markerName,group) #define MARKER_END() -#define RESUME_PROFILING -#define STOP_PROFILING +#define RESUME_PROFILING +#define STOP_PROFILING #endif @@ -246,7 +252,7 @@ static const DbName dbName [] = {KRED, "signal"}, }; - + #if COMPILE_HIP_DB #define tprintf(trace_level, ...) {\ @@ -467,7 +473,7 @@ class ihipStream_t { void launchModuleKernel(hc::accelerator_view av, hsa_signal_t signal, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t gridDimX, uint32_t gridDimY, uint32_t gridDimZ, - uint32_t groupSegmentSize, uint32_t sharedMemBytes, + uint32_t groupSegmentSize, uint32_t sharedMemBytes, void *kernarg, size_t kernSize, uint64_t kernel); @@ -490,7 +496,7 @@ class ihipStream_t { // The unsigned return is hipMemcpyKind unsigned resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem); - void resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo, + void resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo, hc::hcCommandKind *hcCopyDir, bool *forceHostCopyEngine); bool canSeePeerMemory(const ihipCtx_t *thisCtx, const hc::AmPointerInfo *dstInfo, const hc::AmPointerInfo *srcInfo); @@ -662,7 +668,7 @@ class ihipCtx_t // TODO - review uses of getWriteableDevice(), can these be converted to getDevice() ihipDevice_t *getWriteableDevice() const { return _device; }; - std::string toString() const; + std::string toString() const; public: // Data // The NULL stream is used if no other stream is specified. @@ -751,7 +757,7 @@ inline std::ostream& operator<<(std::ostream& os, const hipEvent_t& e) inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) { - os << "ctx:" << static_cast (c) + os << "ctx:" << static_cast (c) << ".dev:" << c->getDevice()->_deviceId; return os; } diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index e59e6c261d..ee05c6b00a 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -126,7 +126,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) int peerCnt=0; { LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - // the peerCnt always stores self so make sure the trace actually + // the peerCnt always stores self so make sure the trace actually peerCnt = crit->peerCnt(); if (peerCnt > 1) { hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr); @@ -154,8 +154,8 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) if(ctx){ // am_alloc requires writeable __acc, perhaps could be refactored? auto device = ctx->getWriteableDevice(); - // If HIP_COHERENT_HOST_ALLOC is defined, we always alloc coherent host system memroy - #ifdef HIP_COHERENT_HOST_ALLOC + // If HIP_COHERENT_HOST_ALLOC is defined, we always alloc coherent host system memroy +#if HIP_COHERENT_HOST_ALLOC *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned); if(sizeBytes < 1 && (*ptr == NULL)){ hip_status = hipErrorMemoryAllocation; @@ -163,7 +163,7 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) hc::am_memtracker_update(*ptr, device->_deviceId, amHostCoherent); } tprintf(DB_MEM, " %s: pinned ptr=%p\n", __func__, *ptr); - #else +#else if((flags == hipHostMallocDefault)|| (flags == hipHostMallocPortable)){ *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned); if (sizeBytes < 1 && (*ptr == NULL)) { @@ -189,7 +189,7 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) tprintf(DB_MEM, "allocated pinned host ptr=%p on dev=%d, allow access to %d peer(s)\n", *ptr, device->_deviceId, peerCnt); } } - #endif //HIP_COHERENT_HOST_ALLOC +#endif //HIP_COHERENT_HOST_ALLOC } return ihipLogStatus(hip_status); } From 36024deb3af35c5d1642f608b4574234804a9438 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 10 Nov 2016 11:27:28 +0530 Subject: [PATCH 49/65] hcc_dialects/Makefile: use clamp-config Change-Id: I86df82f75b75125825e22d0545209a19386d9936 --- samples/0_Intro/hcc_dialects/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/samples/0_Intro/hcc_dialects/Makefile b/samples/0_Intro/hcc_dialects/Makefile index 3b5ceca7f0..4a514b6691 100644 --- a/samples/0_Intro/hcc_dialects/Makefile +++ b/samples/0_Intro/hcc_dialects/Makefile @@ -5,8 +5,8 @@ OPT=-O2 HCC_CFLAGS= `$(HCC_HOME)/bin/hcc-config --cxxflags` ${OPT} HCC_LDFLAGS= `$(HCC_HOME)/bin/hcc-config --ldflags` ${OPT} -CPPAMP_CFLAGS= -std=c++amp -stdlib=libc++ -I$(HCC_HOME)/include -CPPAMP_LDFLAGS= -std=c++amp -L$(HCC_HOME)/lib -Wl,--rpath=$(HCC_HOME)/lib -lc++ -lc++abi -ldl -lpthread -Wl,--whole-archive -lmcwamp -Wl,--no-whole-archive +CPPAMP_CFLAGS= `$(HCC_HOME)/bin/clamp-config --cxxflags` +CPPAMP_LDFLAGS= `$(HCC_HOME)/bin/clamp-config --ldflags` HIP_PATH?= $(wildcard /opt/rocm/hip) ifeq (,$(HIP_PATH)) From 669d734624325b7203e29c314107cd9722dd344c Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 10 Nov 2016 11:34:00 +0530 Subject: [PATCH 50/65] hipcc: Default to HIP_LIB_TYPE=1 Change-Id: I83b05accd76f7bc94bd724c66ae060fa0095bc8d --- bin/hipcc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipcc b/bin/hipcc index 3441ac764b..1504d68141 100755 --- a/bin/hipcc +++ b/bin/hipcc @@ -320,7 +320,7 @@ if ($setStdLib eq 0 and $HIP_PLATFORM eq 'hcc') } if ($needHipHcc) { - $HIP_LIB_TYPE = $hipConfig{'HIP_LIB_TYPE'} // 0; + $HIP_LIB_TYPE = $hipConfig{'HIP_LIB_TYPE'} // 1; # TODO - remove the old sea-of-objects solution: if ($HIP_LIB_TYPE eq 0) { From fcb94863f7c4c04d7cb072d6acb3c1eebe9fc939 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Thu, 10 Nov 2016 11:35:40 +0530 Subject: [PATCH 51/65] hipDeviceGetByPCIBusId support for HIP/NVCC Change-Id: I8f82890e88d2a15f592bff192179e7d5c5362722 --- include/hip/nvcc_detail/hip_runtime_api.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h index 0d73bae199..0d15dfcb01 100644 --- a/include/hip/nvcc_detail/hip_runtime_api.h +++ b/include/hip/nvcc_detail/hip_runtime_api.h @@ -764,6 +764,11 @@ inline static hipError_t hipDeviceGetPCIBusId(int *pciBusId,int len,hipDevice_t return hipCUResultTohipError(cuDeviceGetPCIBusId((char*)pciBusId,len,device)); } +inline static hipError_t hipDeviceGetByPCIBusId(int* device, const int *pciBusId) +{ + return hipCUDAErrorTohipError(cudaDeviceGetByPCIBusId(device,(char*)pciBusId)); +} + inline static hipError_t hipDeviceGetLimit(size_t *pValue, hipLimit_t limit) { return hipCUDAErrorTohipError(cudaDeviceGetLimit(pValue, limit)); From a12d5a8989b688c76e09afe8145d3505d999c969 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 10 Nov 2016 21:26:34 +0530 Subject: [PATCH 52/65] CMakeLists.txt: Cascade CMAKE_BUILD_TYPE to tests Change-Id: I53a3ea951c1fd57e43a02381a457c1dedc1a34f7 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 59d3507c20..e5c3d51c6a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -347,7 +347,7 @@ endif() add_custom_target(install_for_test COMMAND "${CMAKE_COMMAND}" --build . --target install WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) execute_process(COMMAND getconf _NPROCESSORS_ONLN OUTPUT_VARIABLE DASH_JAY OUTPUT_STRIP_TRAILING_WHITESPACE) -add_custom_target(test COMMAND ${CMAKE_COMMAND} . +add_custom_target(test COMMAND ${CMAKE_COMMAND} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} . COMMAND make -j ${DASH_JAY} COMMAND make test WORKING_DIRECTORY ${BUILD_DIR} From 1ec5761a1148d68f3d2697fe84baca90f1511cfd Mon Sep 17 00:00:00 2001 From: pensun Date: Thu, 10 Nov 2016 11:54:59 -0600 Subject: [PATCH 53/65] Update depreciated information for threadfence_system() Change-Id: Id13d2f81edb51eb42b896a5c06913d59ec907c55 --- include/hip/hcc_detail/hip_runtime.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index 73fc022362..b1edef18d7 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -558,7 +558,7 @@ extern "C" __device__ void __threadfence(void); * * @warning __threadfence_system is a stub and map to no-op. */ -__device__ void __threadfence_system(void) __attribute__((deprecated("Provided for compile-time compatibility, not yet functional"))); +__device__ void __threadfence_system(void) __attribute__((deprecated("Provided with workaround configuration, see hip_kernel_language.md for details"))); __device__ unsigned __hip_ds_bpermute(int index, unsigned src); __device__ float __hip_ds_bpermutef(int index, float src); From 2dea3a0b1a172efddb13ddbf0cfbbf85cbd475ff Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sun, 6 Nov 2016 10:36:08 -0600 Subject: [PATCH 54/65] Improve memory debug Change-Id: I0f033139aa4e4b47039eb016e404009127bd0a44 --- src/hip_hcc.h | 3 ++- src/hip_memory.cpp | 30 ++++++++++++++++++++++++------ 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/src/hip_hcc.h b/src/hip_hcc.h index f18d68473d..d8b7030e4f 100644 --- a/src/hip_hcc.h +++ b/src/hip_hcc.h @@ -613,6 +613,8 @@ class ihipCtxCriticalBase_t : LockedBase hsa_agent_t *peerAgents() const { return _peerAgents; }; + // TODO - move private + std::list _peers; // list of enabled peer devices. friend class LockedAccessor; private: @@ -624,7 +626,6 @@ class ihipCtxCriticalBase_t : LockedBase // These reflect the currently Enabled set of peers for this GPU: // Enabled peers have permissions to access the memory physically allocated on this device. // Note the peers always contain the self agent for easy interfacing with HSA APIs. - std::list _peers; // list of enabled peer devices. uint32_t _peerCnt; // number of enabled peers hsa_agent_t *_peerAgents; // efficient packed array of enabled agents (to use for allocations.) private: diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index ee05c6b00a..5be319d9ed 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -119,6 +119,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) const unsigned am_flags = 0; *ptr = hc::am_alloc(sizeBytes, device->_acc, am_flags); + if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } else { @@ -128,11 +129,23 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) LockedAccessor_CtxCrit_t crit(ctx->criticalData()); // the peerCnt always stores self so make sure the trace actually peerCnt = crit->peerCnt(); - if (peerCnt > 1) { - hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr); + tprintf(DB_MEM, " allocated device_mem ptr:%p size:%zu on dev:%d and allowed %d other peer(s) access\n", + *ptr, sizeBytes, device->_deviceId, peerCnt-1); + if (peerCnt > 1) { + + //printf ("peer self access\n"); + + // TODOD - remove me: + for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) { + tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":""); + }; + + hsa_status_t e = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr); + if (e != HSA_STATUS_SUCCESS) { + hip_status = hipErrorMemoryAllocation; + } } } - tprintf(DB_MEM, " allocated %p (size=%zu) on dev:%d and allowed %d other peer(s) access\n", *ptr, sizeBytes, device->_deviceId, peerCnt-1); } } else { hip_status = hipErrorMemoryAllocation; @@ -153,9 +166,14 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) if(ctx){ // am_alloc requires writeable __acc, perhaps could be refactored? + // TODO-P1 - Review and test this logic. Seems : + // hipHostMallocPortable should map the host memory into all contexts, regardless of peer status. + // peer mappings should always be honored. + // hipHostMallocMapped should be ignored on ROCM - all memory is mapped to host. auto device = ctx->getWriteableDevice(); // If HIP_COHERENT_HOST_ALLOC is defined, we always alloc coherent host system memroy #if HIP_COHERENT_HOST_ALLOC + // TODOD - let's make this an environment variable *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned); if(sizeBytes < 1 && (*ptr == NULL)){ hip_status = hipErrorMemoryAllocation; @@ -164,14 +182,14 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) } tprintf(DB_MEM, " %s: pinned ptr=%p\n", __func__, *ptr); #else - if((flags == hipHostMallocDefault)|| (flags == hipHostMallocPortable)){ + if ((flags == hipHostMallocDefault) || (flags == hipHostMallocPortable)) { *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned); if (sizeBytes < 1 && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } else { hc::am_memtracker_update(*ptr, device->_deviceId, amHostPinned); } - tprintf(DB_MEM, "allocated pinned host ptr=%p on dev=%d\n", *ptr, device->_deviceId); + tprintf(DB_MEM, "allocated pinned_host ptr:%p size:%zu on dev:%d\n", *ptr, sizeBytes, device->_deviceId); } else if(flags & hipHostMallocMapped) { *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned); if (sizeBytes && (*ptr == NULL)) { @@ -186,7 +204,7 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr); } } - tprintf(DB_MEM, "allocated pinned host ptr=%p on dev=%d, allow access to %d peer(s)\n", *ptr, device->_deviceId, peerCnt); + tprintf(DB_MEM, "allocated pinned_host ptr:%p size:%zu on dev:%d, allow access to %d peer(s)\n", *ptr, sizeBytes, device->_deviceId, peerCnt); } } #endif //HIP_COHERENT_HOST_ALLOC From ced9d72d9450a93f5622b140d8d5f04c3ff73b0c Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 10 Nov 2016 10:49:44 -0600 Subject: [PATCH 55/65] Refactor copy and P2P logic. Prefer use of source-engine for DMA copies, even if user submits copy in a stream attached to a different device. The stream is now used only for synchronization, and HIP makes the most optimal decision for which engine to perform the copy - typically the source copy engine. HIP now makes decision on which engine should perform the copy and passes this to HCC using new apis. HIP has additional information about peer visibility and will make a decision which agent should perform the copy . Change-Id: I0cf4cfebeae256e6ca795f08a7ed7130f4857d1f --- .vimrc | 5 +- src/hip_hcc.cpp | 145 ++++++++++++++++++++++----------------------- src/hip_hcc.h | 12 +++- src/hip_memory.cpp | 135 +++++++++++++++-------------------------- src/hip_peer.cpp | 10 ++-- 5 files changed, 134 insertions(+), 173 deletions(-) diff --git a/.vimrc b/.vimrc index ed64acd347..019afa57e6 100644 --- a/.vimrc +++ b/.vimrc @@ -1,4 +1 @@ -:set tabstop=4 -:set shiftwidth=4 -:set expandtab -:set smartindent +:set makeprg=make\ -C\ build.hcc-LC.db diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 278ada0c94..b35c2db7ec 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -1731,35 +1731,34 @@ void ihipSetTs(hipEvent_t e) // Returns true if thisCtx can see the memory allocated on dstCtx and srcCtx. // The peer-list for a context controls which contexts have access to the memory allocated on that context. // So we check dstCtx's and srcCtx's peerList to see if the both include thisCtx. -bool ihipStream_t::canSeePeerMemory(const ihipCtx_t *thisCtx, const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo) -{ - - if (dstPtrInfo->_appId != -1) { - // TODO - pointer-info stores a deviceID not a context,may have some unusual side-effects here: - ihipCtx_t *dstCtx = ihipGetPrimaryCtx(dstPtrInfo->_appId); - if (thisCtx != dstCtx) { - // Only checks peer list if contexts are different - LockedAccessor_CtxCrit_t ctxCrit(dstCtx->criticalData()); - //tprintf(DB_SYNC, "dstCrit lock succeeded\n"); - if (!ctxCrit->isPeerWatcher(thisCtx)) { - return false; - }; - } +// TODO- change these to use dst and src ptr info. +bool ihipStream_t::chooseDirectPeerToPeer(const ihipCtx_t *copyEngineCtx, const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo) +{ + + // Make sure this is a device-to-device copy with all memory available to the requested copy engine + // + // TODO - pointer-info stores a deviceID not a context,may have some unusual side-effects here: + ihipCtx_t *dstCtx = ihipGetPrimaryCtx(dstPtrInfo->_appId); + if (copyEngineCtx != dstCtx) { + // Only checks peer list if contexts are different + LockedAccessor_CtxCrit_t ctxCrit(dstCtx->criticalData()); + //tprintf(DB_SYNC, "dstCrit lock succeeded\n"); + if (!ctxCrit->isPeerWatcher(copyEngineCtx)) { + return false; + }; } - - if (srcPtrInfo->_appId != -1) { - // TODO - pointer-info stores a deviceID not a context,may have some unusual side-effects here: - ihipCtx_t *srcCtx = ihipGetPrimaryCtx(srcPtrInfo->_appId); - if (thisCtx != srcCtx) { - // Only checks peer list if contexts are different - LockedAccessor_CtxCrit_t ctxCrit(srcCtx->criticalData()); - //tprintf(DB_SYNC, "srcCrit lock succeeded\n"); - if (!ctxCrit->isPeerWatcher(thisCtx)) { - return false; - }; - } + + // TODO - pointer-info stores a deviceID not a context,may have some unusual side-effects here: + ihipCtx_t *srcCtx = ihipGetPrimaryCtx(srcPtrInfo->_appId); + if (copyEngineCtx != srcCtx) { + // Only checks peer list if contexts are different + LockedAccessor_CtxCrit_t ctxCrit(srcCtx->criticalData()); + //tprintf(DB_SYNC, "srcCrit lock succeeded\n"); + if (!ctxCrit->isPeerWatcher(copyEngineCtx)) { + return false; + }; } return true; @@ -1812,14 +1811,16 @@ unsigned ihipStream_t::resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDev // hipMemKind must be "resolved" to a specific direction - cannot be default. -void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo, - hc::hcCommandKind *hcCopyDir, bool *forceP2PCopyEngine) +void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind, + const hc::AmPointerInfo *dstPtrInfo, + const hc::AmPointerInfo *srcPtrInfo, + hc::hcCommandKind *hcCopyDir, + ihipCtx_t **copyDevice) { - ihipCtx_t *ctx = this->getCtx(); + // Ignore what the user tells us and always resolve the direction: + // Some apps apparently rely on this. + hipMemKind = resolveMemcpyDirection(srcPtrInfo->_isInDeviceMem, dstPtrInfo->_isInDeviceMem); - if (hipMemKind == hipMemcpyDefault) { - hipMemKind = resolveMemcpyDirection(srcPtrInfo->_isInDeviceMem, dstPtrInfo->_isInDeviceMem); - } switch (hipMemKind) { case hipMemcpyHostToHost: *hcCopyDir = hc::hcMemcpyHostToHost; break; @@ -1829,20 +1830,24 @@ void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPoi default: throw ihipException(hipErrorRuntimeOther); }; - - // If this is P2P access, we need to check to see if the copy agent (specified by the stream where the copy is enqueued) - // has peer access enabled to both the source and dest. If this is true, then the copy agent can see both pointers - // and we can perform the access with the copy engine from the current stream. If not true, then we will copy through the host. (*forceP2PCopyEngine=true). - *forceP2PCopyEngine = false; - if (!canSeePeerMemory(ctx, dstPtrInfo, srcPtrInfo)) { - *forceP2PCopyEngine = true; - tprintf (DB_COPY, "P2P: copy engine(dev:%d) cannot see both host and device pointers - forcing copy through staging buffers.\n", ctx->getDeviceNum()); + if (srcPtrInfo->_isInDeviceMem) { + *copyDevice = ihipGetPrimaryCtx(srcPtrInfo->_appId); + } else if (dstPtrInfo->_isInDeviceMem) { + *copyDevice = ihipGetPrimaryCtx(dstPtrInfo->_appId); } else { - if (HIP_FORCE_P2P_HOST ) { - *forceP2PCopyEngine = true; - tprintf (DB_COPY, "P2P. Copy engine (dev:%d) can see src and dst but HIP_FORCE_P2P_HOST=0, forcing copy through staging buffers.\n", ctx->getDeviceNum()); + *copyDevice = nullptr; + } + + if (hipMemKind == hipMemcpyDeviceToDevice) { + if (chooseDirectPeerToPeer(*copyDevice, dstPtrInfo, srcPtrInfo)) { + if (HIP_FORCE_P2P_HOST ) { + tprintf (DB_COPY, "P2P. Copy engine (dev:%d) can see src and dst but HIP_FORCE_P2P_HOST=0, forcing copy through staging buffers.\n", (*copyDevice)->getDeviceNum()); + } else { + tprintf (DB_COPY, "P2P. Copy engine (dev:%d) can see src and dst.\n", (*copyDevice)->getDeviceNum()); + } } else { - tprintf (DB_COPY, "P2P. Copy engine (dev:%d) can see src and dst.\n", ctx->getDeviceNum()); + *copyDevice = nullptr; + tprintf (DB_COPY, "P2P: copy engine(dev:%d) cannot see both host and device pointers - forcing copy through staging buffers.\n", (*copyDevice)->getDeviceNum()); } } } @@ -1866,25 +1871,20 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, hc::hcCommandKind hcCopyDir; - bool forceP2PCopyEngine; - resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, &forceP2PCopyEngine); + ihipCtx_t *copyDevice; + resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, ©Device); + // copy_ext will use copy-engine to perform the copy. nullptr then { LockedAccessor_StreamCrit_t crit (_criticalData); -#if DISABLE_COPY_EXT -#warning ("Disabled copy_ext path, P2P host staging copies will not work") - tprintf (DB_COPY, "copySync copyEngine_dev:%d dst=%p(home_dev:%d) src=%p(home_dev:%d) sz=%zu dstTracked=%d srcTracked=%d dir=%s forceP2PCopyEngine=%d. Call HCC copy\n", - ctx->getDeviceNum(), dst, dstPtrInfo._appId, src, srcPtrInfo._appId, sizeBytes, dstTracked, srcTracked, hcMemcpyStr(hcCopyDir), forceP2PCopyEngine); - // Note - peer-to-peer copies which require host staging will not work in this path. - crit->_av.copy(src, dst, sizeBytes); -#else - // If srcTracked == dstTracked =1 and forceP2PCopyEngine=0 then we wil use async SDMA. (assuming HCC implementation doesn't override somehow) - tprintf (DB_COPY, "copySync copyEngine_dev:%d dst=%p(home_dev:%d) src=%p(home_dev:%d) sz=%zu dstTracked=%d srcTracked=%d dir=%s forceP2PCopyEngine=%d. Call HCC copy_ext.\n", - ctx->getDeviceNum(), dst, dstPtrInfo._appId, src, srcPtrInfo._appId, sizeBytes, dstTracked, srcTracked, hcMemcpyStr(hcCopyDir), forceP2PCopyEngine); - crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceP2PCopyEngine); -#endif + tprintf (DB_COPY, "copySync copyDev:%d dst=%p(home_dev:%d, tracked:%d, isDevMem:%d) src=%p(home_dev:%d, tracked:%d, isDevMem:%d) sz=%zu dir=%s\n", + copyDevice ? copyDevice->getDeviceNum():-1, + dst, dstPtrInfo._appId, dstTracked, dstPtrInfo._isInDeviceMem, + src, srcPtrInfo._appId, srcTracked, srcPtrInfo._isInDeviceMem, + sizeBytes, hcMemcpyStr(hcCopyDir)); + crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? ©Device->getDevice()->_acc : nullptr); } } @@ -1921,22 +1921,26 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes hc::hcCommandKind hcCopyDir; - bool forceP2PCopyEngine; - resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, &forceP2PCopyEngine); - + ihipCtx_t *copyDevice; + resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, ©Device); - tprintf (DB_COPY, "copyAsync dst=%p(home_dev:%d) src=%p(home_dev:%d) sz=%zu dstTracked=%d srcTracked=%d dir=%s forceP2PCopyEngine=%d\n", - dst, dstPtrInfo._appId, src, srcPtrInfo._appId, sizeBytes, dstTracked, srcTracked, hcMemcpyStr(hcCopyDir), forceP2PCopyEngine); + tprintf (DB_COPY, "copyASync copyEngine_dev:%d dst=%p(home_dev:%d, tracked:%d, isDevMem:%d) src=%p(home_dev:%d, tracked:%d, isDevMem:%d) sz=%zu dir=%s . \n", + copyDevice->getDeviceNum(), + dst, dstPtrInfo._appId, dstTracked, dstPtrInfo._isInDeviceMem, + src, srcPtrInfo._appId, srcTracked, srcPtrInfo._isInDeviceMem, + sizeBytes, hcMemcpyStr(hcCopyDir)); // "tracked" really indicates if the pointer's virtual address is available in the GPU address space. // If both pointers are not tracked, we need to fall back to a sync copy. - if (dstTracked && srcTracked && !forceP2PCopyEngine) { + if (dstTracked && srcTracked && copyDevice) { LockedAccessor_StreamCrit_t crit(_criticalData); - // Perform asynchronous copy: + // Perform fast asynchronous copy: try { - crit->_av.copy_async(src, dst, sizeBytes); + printf ("forcing copy to use synchronous path: !!!!!\n"); + //crit->_av.copy_async_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice); + crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, ©Device->getDevice()->_acc ); } catch (Kalmar::runtime_exception) { throw ihipException(hipErrorRuntimeOther); }; @@ -1949,17 +1953,12 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes } else { LockedAccessor_StreamCrit_t crit(_criticalData); -#if DISABLE_COPY_EXT -#warning ("Disabled copy_ext path, P2P host staging copies will not work") - // Note - peer-to-peer copies which require host staging will not work in this path. - crit->_av.copy(src, dst, sizeBytes); -#else - crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceP2PCopyEngine); -#endif + crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? ©Device->getDevice()->_acc : nullptr); } } } + //------------------------------------------------------------------------------------------------- //------------------------------------------------------------------------------------------------- //Profiler, really these should live elsewhere: diff --git a/src/hip_hcc.h b/src/hip_hcc.h index d8b7030e4f..ca759ba78d 100644 --- a/src/hip_hcc.h +++ b/src/hip_hcc.h @@ -496,10 +496,10 @@ class ihipStream_t { // The unsigned return is hipMemcpyKind unsigned resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem); - void resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo, - hc::hcCommandKind *hcCopyDir, bool *forceHostCopyEngine); + void resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo, + hc::hcCommandKind *hcCopyDir, ihipCtx_t **copyDevice); - bool canSeePeerMemory(const ihipCtx_t *thisCtx, const hc::AmPointerInfo *dstInfo, const hc::AmPointerInfo *srcInfo); + bool chooseDirectPeerToPeer(const ihipCtx_t *thisCtx, const hc::AmPointerInfo *dstInfo, const hc::AmPointerInfo *srcInfo); private: // Data @@ -764,4 +764,10 @@ inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) } +// Helper functions that are used across src files: +namespace hip_internal { + hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream); +}; + + #endif diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 5be319d9ed..2f1eb1e27f 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -164,47 +164,50 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) auto ctx = ihipGetTlsDefaultCtx(); - if(ctx){ - // am_alloc requires writeable __acc, perhaps could be refactored? - // TODO-P1 - Review and test this logic. Seems : - // hipHostMallocPortable should map the host memory into all contexts, regardless of peer status. - // peer mappings should always be honored. - // hipHostMallocMapped should be ignored on ROCM - all memory is mapped to host. - auto device = ctx->getWriteableDevice(); - // If HIP_COHERENT_HOST_ALLOC is defined, we always alloc coherent host system memroy + if (sizeBytes == 0) { + hip_status = hipSuccess; + // TODO - should size of 0 return err or be siliently ignored? + } else if ((ctx==nullptr) || (ptr == nullptr)) { + hip_status = hipErrorInvalidValue; + } else { + unsigned trueFlags = flags; + if (flags == hipHostMallocDefault) { + trueFlags = hipHostMallocMapped | hipHostMallocWriteCombined; + } + + const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | hipHostMallocWriteCombined; + + if (flags & ~supportedFlags) { + hip_status = hipErrorInvalidValue; + } else { #if HIP_COHERENT_HOST_ALLOC - // TODOD - let's make this an environment variable - *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned); - if(sizeBytes < 1 && (*ptr == NULL)){ - hip_status = hipErrorMemoryAllocation; - } else { - hc::am_memtracker_update(*ptr, device->_deviceId, amHostCoherent); - } - tprintf(DB_MEM, " %s: pinned ptr=%p\n", __func__, *ptr); -#else - if ((flags == hipHostMallocDefault) || (flags == hipHostMallocPortable)) { + // TODO - let's make this an environment variable *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned); - if (sizeBytes < 1 && (*ptr == NULL)) { + if(sizeBytes < 1 && (*ptr == NULL)){ hip_status = hipErrorMemoryAllocation; } else { - hc::am_memtracker_update(*ptr, device->_deviceId, amHostPinned); + hc::am_memtracker_update(*ptr, device->_deviceId, amHostCoherent); } - tprintf(DB_MEM, "allocated pinned_host ptr:%p size:%zu on dev:%d\n", *ptr, sizeBytes, device->_deviceId); - } else if(flags & hipHostMallocMapped) { + tprintf(DB_MEM, " %s: pinned ptr=%p\n", __func__, *ptr); +#else + // TODO - am_alloc requires writeable __acc, perhaps could be refactored? + // TODO - hipHostMallocMapped is be ignored on ROCM - all memory is mapped to host address space as WC. + auto device = ctx->getWriteableDevice(); *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned); - if (sizeBytes && (*ptr == NULL)) { + if (*ptr == NULL) { hip_status = hipErrorMemoryAllocation; } else { hc::am_memtracker_update(*ptr, device->_deviceId, flags); + // TODO-hipHostMallocPortable should map the host memory into all contexts, regardless of peer status. int peerCnt=0; { LockedAccessor_CtxCrit_t crit(ctx->criticalData()); peerCnt = crit->peerCnt(); - if (peerCnt) { + if (peerCnt > 1) { hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr); } } - tprintf(DB_MEM, "allocated pinned_host ptr:%p size:%zu on dev:%d, allow access to %d peer(s)\n", *ptr, sizeBytes, device->_deviceId, peerCnt); + tprintf(DB_MEM, "allocated pinned_host ptr:%p size:%zu on dev:%d and allow access to %d other peer(s)\n", *ptr, sizeBytes, device->_deviceId, peerCnt-1); } } #endif //HIP_COHERENT_HOST_ALLOC @@ -595,10 +598,13 @@ hipError_t hipMemcpyHtoH(void* dst, void* src, size_t sizeBytes) return ihipLogStatus(e); } -hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) -{ - HIP_INIT_API(dst, src, sizeBytes, kind, stream); + +// Internal copy sync: +namespace hip_internal { + +hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) +{ hipError_t e = hipSuccess; stream = ihipSyncAndResolveStream(stream); @@ -617,86 +623,39 @@ hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcp e = hipErrorInvalidValue; } - return ihipLogStatus(e); + return e; } +} // end namespace hip_internal -hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dst, void* src, size_t sizeBytes, hipStream_t stream) + +hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) { - HIP_INIT_API(dst, src, sizeBytes, stream); + HIP_INIT_API(dst, src, sizeBytes, kind, stream); - hipError_t e = hipSuccess; + return ihipLogStatus(hip_internal::memcpyAsync(dst, src, sizeBytes, kind, stream)); - stream = ihipSyncAndResolveStream(stream); +} - hipMemcpyKind kind = hipMemcpyHostToDevice; - if ((dst == NULL) || (src == NULL)) { - e= hipErrorInvalidValue; - } else if (stream) { - try { - stream->locked_copyAsync((void*)dst, src, sizeBytes, kind); - } - catch (ihipException ex) { - e = ex._code; - } - } else { - e = hipErrorInvalidValue; - } +hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dst, void* src, size_t sizeBytes, hipStream_t stream) +{ + HIP_INIT_API(dst, src, sizeBytes, stream); - return ihipLogStatus(e); + return ihipLogStatus(hip_internal::memcpyAsync(dst, src, sizeBytes, hipMemcpyHostToDevice, stream)); } hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t sizeBytes, hipStream_t stream) { HIP_INIT_API(dst, src, sizeBytes, stream); - hipError_t e = hipSuccess; - - hipMemcpyKind kind = hipMemcpyDeviceToDevice; - - stream = ihipSyncAndResolveStream(stream); - - - if ((dst == NULL) || (src == NULL)) { - e= hipErrorInvalidValue; - } else if (stream) { - try { - stream->locked_copyAsync((void*)dst, (void*)src, sizeBytes, kind); - } - catch (ihipException ex) { - e = ex._code; - } - } else { - e = hipErrorInvalidValue; - } - - return ihipLogStatus(e); + return ihipLogStatus(hip_internal::memcpyAsync(dst, src, sizeBytes, hipMemcpyDeviceToDevice, stream)); } hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t sizeBytes, hipStream_t stream) { HIP_INIT_API(dst, src, sizeBytes, stream); - hipError_t e = hipSuccess; - - stream = ihipSyncAndResolveStream(stream); - - hipMemcpyKind kind = hipMemcpyDeviceToHost; - - if ((dst == NULL) || (src == NULL)) { - e= hipErrorInvalidValue; - } else if (stream) { - try { - stream->locked_copyAsync(dst, (void*)src, sizeBytes, kind); - } - catch (ihipException ex) { - e = ex._code; - } - } else { - e = hipErrorInvalidValue; - } - - return ihipLogStatus(e); + return ihipLogStatus(hip_internal::memcpyAsync(dst, src, sizeBytes, hipMemcpyDeviceToHost, stream)); } // TODO - review and optimize diff --git a/src/hip_peer.cpp b/src/hip_peer.cpp index 95ea4719a9..b0e4eeef52 100644 --- a/src/hip_peer.cpp +++ b/src/hip_peer.cpp @@ -149,7 +149,7 @@ hipError_t hipMemcpyPeer (void* dst, hipCtx_t dstCtx, const void* src, hipCtx_t // TODO - move to ihip memory copy implementaion. // HCC has a unified memory architecture so device specifiers are not required. - return hipMemcpy(dst, src, sizeBytes, hipMemcpyDefault); + return ihipLogStatus(hipMemcpy(dst, src, sizeBytes, hipMemcpyDefault)); }; @@ -160,7 +160,7 @@ hipError_t hipMemcpyPeerAsync (void* dst, hipCtx_t dstDevice, const void* src, h // TODO - move to ihip memory copy implementaion. // HCC has a unified memory architecture so device specifiers are not required. - return hipMemcpyAsync(dst, src, sizeBytes, hipMemcpyDefault, stream); + return ihipLogStatus(hip_internal::memcpyAsync(dst, src, sizeBytes, hipMemcpyDefault, stream)); }; @@ -173,7 +173,7 @@ hipError_t hipMemcpyPeerAsync (void* dst, hipCtx_t dstDevice, const void* src, h hipError_t hipDeviceCanAccessPeer (int* canAccessPeer, int deviceId, int peerDeviceId) { HIP_INIT_API(canAccessPeer, deviceId, peerDeviceId); - return ihipDeviceCanAccessPeer(canAccessPeer, ihipGetPrimaryCtx(deviceId), ihipGetPrimaryCtx(peerDeviceId)); + return ihipLogStatus(ihipDeviceCanAccessPeer(canAccessPeer, ihipGetPrimaryCtx(deviceId), ihipGetPrimaryCtx(peerDeviceId))); } @@ -196,14 +196,14 @@ hipError_t hipDeviceEnablePeerAccess (int peerDeviceId, unsigned int flags) hipError_t hipMemcpyPeer (void* dst, int dstDevice, const void* src, int srcDevice, size_t sizeBytes) { HIP_INIT_API(dst, dstDevice, src, srcDevice, sizeBytes); - return hipMemcpyPeer(dst, ihipGetPrimaryCtx(dstDevice), src, ihipGetPrimaryCtx(srcDevice), sizeBytes); + return ihipLogStatus(hipMemcpyPeer(dst, ihipGetPrimaryCtx(dstDevice), src, ihipGetPrimaryCtx(srcDevice), sizeBytes)); } hipError_t hipMemcpyPeerAsync (void* dst, int dstDevice, const void* src, int srcDevice, size_t sizeBytes, hipStream_t stream) { HIP_INIT_API(dst, dstDevice, src, srcDevice, sizeBytes, stream); - return hipMemcpyPeerAsync(dst, ihipGetPrimaryCtx(dstDevice), src, ihipGetPrimaryCtx(srcDevice), sizeBytes, stream); + return ihipLogStatus(hip_internal::memcpyAsync(dst, src, sizeBytes, hipMemcpyDefault, stream)); } hipError_t hipCtxEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags) From 8724273f28c18cf246c1dcaa7ea0a8431c556638 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 10 Nov 2016 10:53:10 -0600 Subject: [PATCH 56/65] Doc change only - add comments to test. Change-Id: Ie42087cf3c78e49337b18bb71f3f0e1e7950ee1b --- tests/README.md | 12 ++++++------ tests/src/hipFuncDeviceSynchronize.cpp | 7 ++++++- tests/src/hipHostGetFlags.cpp | 2 +- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/tests/README.md b/tests/README.md index 56bb4e7edd..223bd149dc 100644 --- a/tests/README.md +++ b/tests/README.md @@ -53,11 +53,11 @@ ctest -R Memcpy ### If a test fails - how to debug a test -Extract the commandline from the testing log: +Find the test and commandline that fail: (From the test build directory, perhaps hip/tests/build) -$ grep -A3 -m2 hipMemcpy-size Testing/Temporary/LastTest.log -36/47 Testing: hipMemcpy-size -36/47 Test: hipMemcpy-size -Command: "/home/bensander/git/compute/external/hip/hip/tests/b6.hcc-LC.debug/runtimeApi/memory/hipMemcpy" "--tests" "0x6" -Directory: /home/bensander/git/compute/external/hip/hip/tests/b6.hcc-LC.debug/runtimeApi/memory +grep -IR hipMemcpy-modes -IR ../tests/ +../tests/src/runtimeApi/memory/hipMemcpy.cpp: * RUN_NAMED: %t hipMemcpy-modes --tests 0x1 + + + diff --git a/tests/src/hipFuncDeviceSynchronize.cpp b/tests/src/hipFuncDeviceSynchronize.cpp index 6d05253536..930bc37b8b 100644 --- a/tests/src/hipFuncDeviceSynchronize.cpp +++ b/tests/src/hipFuncDeviceSynchronize.cpp @@ -35,6 +35,7 @@ THE SOFTWARE. __global__ void Iter(hipLaunchParm lp, int *Ad, int num){ int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + // Kernel loop designed to execute very slowly... ... ... so we can test timing-related behavior below if(tx == 0){ for(int i = 0; i Date: Thu, 10 Nov 2016 16:27:05 -0600 Subject: [PATCH 57/65] Enable async copy again. Also add HIP_FORCE_SYNC_COPY chicken bit. Change-Id: I76a385410494b99bf27305d3c08f55dd81987565 --- src/hip_hcc.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index b35c2db7ec..bf87b12ce1 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -75,6 +75,11 @@ int HIP_WAIT_MODE = 0; int HIP_FORCE_P2P_HOST = 0; +// Force async copies to actually use the synchronous copy interface. +int HIP_FORCE_SYNC_COPY = 0; + + + @@ -1368,6 +1373,7 @@ void ihipInit() READ_ENV_I(release, HIP_WAIT_MODE, 0, "Force synchronization mode. 1= force yield, 2=force spin, 0=defaults specified in application"); READ_ENV_I(release, HIP_FORCE_P2P_HOST, 0, "Force use of host/staging copy for peer-to-peer copiecopies"); + READ_ENV_I(release, HIP_FORCE_SYNC_COPY, 0, "Force all copies (even hipMemcpyAsync) to use sync copies"); READ_ENV_I(release, HIP_NUM_KERNELS_INFLIGHT, 128, "Max number of inflight kernels per stream before active synchronization is forced."); // Some flags have both compile-time and runtime flags - generate a warning if user enables the runtime flag but the compile-time flag is disabled. @@ -1933,14 +1939,16 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes // "tracked" really indicates if the pointer's virtual address is available in the GPU address space. // If both pointers are not tracked, we need to fall back to a sync copy. - if (dstTracked && srcTracked && copyDevice) { + if (dstTracked && srcTracked && copyDevice/*code below assumes this is !nullptr*/) { LockedAccessor_StreamCrit_t crit(_criticalData); - // Perform fast asynchronous copy: + // Perform fast asynchronous copy - we know copyDevice != NULL based on check above try { - printf ("forcing copy to use synchronous path: !!!!!\n"); - //crit->_av.copy_async_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice); - crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, ©Device->getDevice()->_acc ); + if (HIP_FORCE_SYNC_COPY) { + crit->_av.copy_ext (src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, ©Device->getDevice()->_acc); + } else { + crit->_av.copy_async_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, ©Device->getDevice()->_acc); + } } catch (Kalmar::runtime_exception) { throw ihipException(hipErrorRuntimeOther); }; From 65584e48de64890e4dcb4414521d289469464aef Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 10 Nov 2016 22:59:49 -0600 Subject: [PATCH 58/65] Use forceUnpinnedCopy to resolve P2p corner cases. Change-Id: I2aebb419881246cebb696bec87798635bc71acc2 --- src/hip_hcc.cpp | 115 ++++++++++++++++++++++++++++++------------------ src/hip_hcc.h | 9 ++-- 2 files changed, 77 insertions(+), 47 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index bf87b12ce1..fe69810615 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -48,7 +48,9 @@ THE SOFTWARE. #include "trace_helper.h" - +#ifndef USE_COPY_EXT_V2 +#define USE_COPY_EXT_V2 0 +#endif //================================================================================================= //Global variables: @@ -1734,37 +1736,44 @@ void ihipSetTs(hipEvent_t e) } -// Returns true if thisCtx can see the memory allocated on dstCtx and srcCtx. +// Returns true if copyEngineCtx can see the memory allocated on dstCtx and srcCtx. // The peer-list for a context controls which contexts have access to the memory allocated on that context. // So we check dstCtx's and srcCtx's peerList to see if the both include thisCtx. -// TODO- change these to use dst and src ptr info. -bool ihipStream_t::chooseDirectPeerToPeer(const ihipCtx_t *copyEngineCtx, const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo) +bool ihipStream_t::canSeeMemory(const ihipCtx_t *copyEngineCtx, const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo) { // Make sure this is a device-to-device copy with all memory available to the requested copy engine // // TODO - pointer-info stores a deviceID not a context,may have some unusual side-effects here: - ihipCtx_t *dstCtx = ihipGetPrimaryCtx(dstPtrInfo->_appId); - if (copyEngineCtx != dstCtx) { - // Only checks peer list if contexts are different - LockedAccessor_CtxCrit_t ctxCrit(dstCtx->criticalData()); - //tprintf(DB_SYNC, "dstCrit lock succeeded\n"); - if (!ctxCrit->isPeerWatcher(copyEngineCtx)) { - return false; - }; + if (dstPtrInfo->_sizeBytes == 0) { + return false; + } else { + ihipCtx_t *dstCtx = ihipGetPrimaryCtx(dstPtrInfo->_appId); + if (copyEngineCtx != dstCtx) { + // Only checks peer list if contexts are different + LockedAccessor_CtxCrit_t ctxCrit(dstCtx->criticalData()); + //tprintf(DB_SYNC, "dstCrit lock succeeded\n"); + if (!ctxCrit->isPeerWatcher(copyEngineCtx)) { + return false; + }; + } } // TODO - pointer-info stores a deviceID not a context,may have some unusual side-effects here: - ihipCtx_t *srcCtx = ihipGetPrimaryCtx(srcPtrInfo->_appId); - if (copyEngineCtx != srcCtx) { - // Only checks peer list if contexts are different - LockedAccessor_CtxCrit_t ctxCrit(srcCtx->criticalData()); - //tprintf(DB_SYNC, "srcCrit lock succeeded\n"); - if (!ctxCrit->isPeerWatcher(copyEngineCtx)) { - return false; - }; + if (srcPtrInfo->_sizeBytes == 0) { + return false; + } else { + ihipCtx_t *srcCtx = ihipGetPrimaryCtx(srcPtrInfo->_appId); + if (copyEngineCtx != srcCtx) { + // Only checks peer list if contexts are different + LockedAccessor_CtxCrit_t ctxCrit(srcCtx->criticalData()); + //tprintf(DB_SYNC, "srcCrit lock succeeded\n"); + if (!ctxCrit->isPeerWatcher(copyEngineCtx)) { + return false; + }; + } } return true; @@ -1801,7 +1810,6 @@ const char* hcMemcpyStr(hc::hcCommandKind memKind) // Resolve hipMemcpyDefault to a known type. -// TODO - review why is this so complicated, does this need srcTracked and dstTracked? unsigned ihipStream_t::resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem) { hipMemcpyKind kind = hipMemcpyDefault; @@ -1821,7 +1829,8 @@ void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo, hc::hcCommandKind *hcCopyDir, - ihipCtx_t **copyDevice) + ihipCtx_t **copyDevice, + bool *forceUnpinnedCopy) { // Ignore what the user tells us and always resolve the direction: // Some apps apparently rely on this. @@ -1844,17 +1853,18 @@ void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind, *copyDevice = nullptr; } - if (hipMemKind == hipMemcpyDeviceToDevice) { - if (chooseDirectPeerToPeer(*copyDevice, dstPtrInfo, srcPtrInfo)) { - if (HIP_FORCE_P2P_HOST ) { - tprintf (DB_COPY, "P2P. Copy engine (dev:%d) can see src and dst but HIP_FORCE_P2P_HOST=0, forcing copy through staging buffers.\n", (*copyDevice)->getDeviceNum()); - } else { - tprintf (DB_COPY, "P2P. Copy engine (dev:%d) can see src and dst.\n", (*copyDevice)->getDeviceNum()); - } + *forceUnpinnedCopy = false; + if (canSeeMemory(*copyDevice, dstPtrInfo, srcPtrInfo)) { + + if (HIP_FORCE_P2P_HOST ) { + *forceUnpinnedCopy = true; + tprintf (DB_COPY, "P2P. Copy engine (dev:%d) can see src and dst but HIP_FORCE_P2P_HOST=0, forcing copy through staging buffers.\n", (*copyDevice)->getDeviceNum()); } else { - *copyDevice = nullptr; - tprintf (DB_COPY, "P2P: copy engine(dev:%d) cannot see both host and device pointers - forcing copy through staging buffers.\n", (*copyDevice)->getDeviceNum()); + tprintf (DB_COPY, "P2P. Copy engine (dev:%d) can see src and dst.\n", (*copyDevice)->getDeviceNum()); } + } else { + *forceUnpinnedCopy = true; + tprintf (DB_COPY, "P2P: copy engine(dev:%d) cannot see both host and device pointers - forcing copy with unpinned engine.\n", (*copyDevice)->getDeviceNum()); } } @@ -1878,19 +1888,22 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, hc::hcCommandKind hcCopyDir; ihipCtx_t *copyDevice; - resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, ©Device); - - - // copy_ext will use copy-engine to perform the copy. nullptr then + bool forceUnpinnedCopy; + resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, ©Device, &forceUnpinnedCopy); { LockedAccessor_StreamCrit_t crit (_criticalData); - tprintf (DB_COPY, "copySync copyDev:%d dst=%p(home_dev:%d, tracked:%d, isDevMem:%d) src=%p(home_dev:%d, tracked:%d, isDevMem:%d) sz=%zu dir=%s\n", + tprintf (DB_COPY, "copySync copyDev:%d dst=%p(home_dev:%d, tracked:%d, isDevMem:%d) src=%p(home_dev:%d, tracked:%d, isDevMem:%d) sz=%zu dir=%s forceUnpinnedCopy=%d\n", copyDevice ? copyDevice->getDeviceNum():-1, dst, dstPtrInfo._appId, dstTracked, dstPtrInfo._isInDeviceMem, src, srcPtrInfo._appId, srcTracked, srcPtrInfo._isInDeviceMem, - sizeBytes, hcMemcpyStr(hcCopyDir)); - crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? ©Device->getDevice()->_acc : nullptr); + sizeBytes, hcMemcpyStr(hcCopyDir), forceUnpinnedCopy); + +#if USE_COPY_EXT_V2 + crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? ©Device->getDevice()->_acc : nullptr, forceUnpinnedCopy); +#else + crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceUnpinnedCopy); +#endif } } @@ -1928,26 +1941,36 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes hc::hcCommandKind hcCopyDir; ihipCtx_t *copyDevice; - resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, ©Device); + bool forceUnpinnedCopy; + resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, ©Device, &forceUnpinnedCopy); - tprintf (DB_COPY, "copyASync copyEngine_dev:%d dst=%p(home_dev:%d, tracked:%d, isDevMem:%d) src=%p(home_dev:%d, tracked:%d, isDevMem:%d) sz=%zu dir=%s . \n", + tprintf (DB_COPY, "copyASync copyEngine_dev:%d dst=%p(home_dev:%d, tracked:%d, isDevMem:%d) src=%p(home_dev:%d, tracked:%d, isDevMem:%d) sz=%zu dir=%s. forceUnpinnedCopy=%d \n", copyDevice->getDeviceNum(), dst, dstPtrInfo._appId, dstTracked, dstPtrInfo._isInDeviceMem, src, srcPtrInfo._appId, srcTracked, srcPtrInfo._isInDeviceMem, - sizeBytes, hcMemcpyStr(hcCopyDir)); + sizeBytes, hcMemcpyStr(hcCopyDir), forceUnpinnedCopy); // "tracked" really indicates if the pointer's virtual address is available in the GPU address space. // If both pointers are not tracked, we need to fall back to a sync copy. - if (dstTracked && srcTracked && copyDevice/*code below assumes this is !nullptr*/) { + if (dstTracked && srcTracked && !forceUnpinnedCopy && copyDevice/*code below assumes this is !nullptr*/) { LockedAccessor_StreamCrit_t crit(_criticalData); // Perform fast asynchronous copy - we know copyDevice != NULL based on check above try { if (HIP_FORCE_SYNC_COPY) { - crit->_av.copy_ext (src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, ©Device->getDevice()->_acc); +#if USE_COPY_EXT_V2 + crit->_av.copy_ext (src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, ©Device->getDevice()->_acc, forceUnpinnedCopy); +#else + crit->_av.copy_ext (src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceUnpinnedCopy); +#endif + } else { +#if USE_COPY_EXT_V2 crit->_av.copy_async_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, ©Device->getDevice()->_acc); +#else + crit->_av.copy_async(src, dst, sizeBytes); +#endif } } catch (Kalmar::runtime_exception) { throw ihipException(hipErrorRuntimeOther); @@ -1961,7 +1984,11 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes } else { LockedAccessor_StreamCrit_t crit(_criticalData); - crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? ©Device->getDevice()->_acc : nullptr); +#if USE_COPY_EXT_V2 + crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? ©Device->getDevice()->_acc : nullptr, forceUnpinnedCopy); +#else + crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceUnpinnedCopy); +#endif } } } diff --git a/src/hip_hcc.h b/src/hip_hcc.h index ca759ba78d..66502cb7a4 100644 --- a/src/hip_hcc.h +++ b/src/hip_hcc.h @@ -496,10 +496,13 @@ class ihipStream_t { // The unsigned return is hipMemcpyKind unsigned resolveMemcpyDirection(bool srcInDeviceMem, bool dstInDeviceMem); - void resolveHcMemcpyDirection(unsigned hipMemKind, const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo, - hc::hcCommandKind *hcCopyDir, ihipCtx_t **copyDevice); + void resolveHcMemcpyDirection(unsigned hipMemKind, + const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo, + hc::hcCommandKind *hcCopyDir, + ihipCtx_t **copyDevice, + bool *forceUnpinnedCopy); - bool chooseDirectPeerToPeer(const ihipCtx_t *thisCtx, const hc::AmPointerInfo *dstInfo, const hc::AmPointerInfo *srcInfo); + bool canSeeMemory(const ihipCtx_t *thisCtx, const hc::AmPointerInfo *dstInfo, const hc::AmPointerInfo *srcInfo); private: // Data From 1e5515ee9f43e7b4adedf9e23ff967f01284c4a1 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 10 Nov 2016 23:10:42 -0600 Subject: [PATCH 59/65] Add option to deny peer access. Also fix test. Change-Id: I1b247f6c4271442b008e560669bca4daf8eb94c7 --- src/hip_hcc.cpp | 5 +++-- src/hip_hcc.h | 1 + src/hip_peer.cpp | 14 ++++++++++---- tests/src/hipPeerToPeer_simple.cpp | 22 +++++++++++++--------- 4 files changed, 27 insertions(+), 15 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index fe69810615..5218f4e2eb 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -76,6 +76,7 @@ int HIP_NUM_KERNELS_INFLIGHT = 128; int HIP_WAIT_MODE = 0; int HIP_FORCE_P2P_HOST = 0; +int HIP_DENY_PEER_ACCESS = 0; // Force async copies to actually use the synchronous copy interface. int HIP_FORCE_SYNC_COPY = 0; @@ -1374,7 +1375,7 @@ void ihipInit() READ_ENV_I(release, HIP_WAIT_MODE, 0, "Force synchronization mode. 1= force yield, 2=force spin, 0=defaults specified in application"); - READ_ENV_I(release, HIP_FORCE_P2P_HOST, 0, "Force use of host/staging copy for peer-to-peer copiecopies"); + READ_ENV_I(release, HIP_FORCE_P2P_HOST, 0, "Force use of host/staging copy for peer-to-peer copies.1=always use copies, 2=always return false for hipDeviceCanAccessPeer"); READ_ENV_I(release, HIP_FORCE_SYNC_COPY, 0, "Force all copies (even hipMemcpyAsync) to use sync copies"); READ_ENV_I(release, HIP_NUM_KERNELS_INFLIGHT, 128, "Max number of inflight kernels per stream before active synchronization is forced."); @@ -1856,7 +1857,7 @@ void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind, *forceUnpinnedCopy = false; if (canSeeMemory(*copyDevice, dstPtrInfo, srcPtrInfo)) { - if (HIP_FORCE_P2P_HOST ) { + if (HIP_FORCE_P2P_HOST & 0x1) { *forceUnpinnedCopy = true; tprintf (DB_COPY, "P2P. Copy engine (dev:%d) can see src and dst but HIP_FORCE_P2P_HOST=0, forcing copy through staging buffers.\n", (*copyDevice)->getDeviceNum()); } else { diff --git a/src/hip_hcc.h b/src/hip_hcc.h index 66502cb7a4..0040263194 100644 --- a/src/hip_hcc.h +++ b/src/hip_hcc.h @@ -54,6 +54,7 @@ extern int HIP_DB; extern int HIP_STAGING_SIZE; /* size of staging buffers, in KB */ extern int HIP_STREAM_SIGNALS; /* number of signals to allocate at stream creation */ extern int HIP_VISIBLE_DEVICES; /* Contains a comma-separated sequence of GPU identifiers */ +extern int HIP_FORCE_P2P_HOST; //--- diff --git a/src/hip_peer.cpp b/src/hip_peer.cpp index b0e4eeef52..b7dca06e5f 100644 --- a/src/hip_peer.cpp +++ b/src/hip_peer.cpp @@ -43,13 +43,18 @@ hipError_t ihipDeviceCanAccessPeer (int* canAccessPeer, hipCtx_t thisCtx, hipCtx if ((thisCtx != NULL) && (peerCtx != NULL)) { + if (thisCtx == peerCtx) { *canAccessPeer = 0; - tprintf(DB_COPY, "Can't be peer to self. (this=%s, peer=%s)\n", + tprintf(DB_MEM, "Can't be peer to self. (this=%s, peer=%s)\n", thisCtx->toString().c_str(), peerCtx->toString().c_str()); + } else if (HIP_FORCE_P2P_HOST & 0x2) { + *canAccessPeer = false; + tprintf(DB_MEM, "HIP_FORCE_P2P_HOST denies peer access this=%s peer=%s canAccessPeer=%d\n", + thisCtx->toString().c_str(), peerCtx->toString().c_str(), *canAccessPeer); } else { *canAccessPeer = peerCtx->getDevice()->_acc.get_is_peer(thisCtx->getDevice()->_acc); - tprintf(DB_COPY, "deviceCanAccessPeer this=%s peer=%s canAccessPeer=%d\n", + tprintf(DB_MEM, "deviceCanAccessPeer this=%s peer=%s canAccessPeer=%d\n", thisCtx->toString().c_str(), peerCtx->toString().c_str(), *canAccessPeer); } @@ -58,6 +63,7 @@ hipError_t ihipDeviceCanAccessPeer (int* canAccessPeer, hipCtx_t thisCtx, hipCtx err = hipErrorInvalidDevice; } + return err; } @@ -93,7 +99,7 @@ hipError_t ihipDisablePeerAccess (hipCtx_t peerCtx) LockedAccessor_CtxCrit_t peerCrit(peerCtx->criticalData()); bool changed = peerCrit->removePeerWatcher(peerCtx, thisCtx); if (changed) { - tprintf(DB_COPY, "device %s disable access to memory allocated on peer:%s\n", + tprintf(DB_MEM, "device %s disable access to memory allocated on peer:%s\n", thisCtx->toString().c_str(), peerCtx->toString().c_str()); // Update the peers for all memory already saved in the tracker: am_memtracker_update_peers(peerCtx->getDevice()->_acc, peerCrit->peerCnt(), peerCrit->peerAgents()); @@ -127,7 +133,7 @@ hipError_t ihipEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags) // Add thisCtx to peerCtx's access list so that new allocations on peer will be made visible to this device: bool isNewPeer = peerCrit->addPeerWatcher(peerCtx, thisCtx); if (isNewPeer) { - tprintf(DB_COPY, "device=%s can now see all memory allocated on peer=%s\n", + tprintf(DB_MEM, "device=%s can now see all memory allocated on peer=%s\n", thisCtx->toString().c_str(), peerCtx->toString().c_str()); am_memtracker_update_peers(peerCtx->getDevice()->_acc, peerCrit->peerCnt(), peerCrit->peerAgents()); } else { diff --git a/tests/src/hipPeerToPeer_simple.cpp b/tests/src/hipPeerToPeer_simple.cpp index a0bf6abac1..1dfbdafdfc 100644 --- a/tests/src/hipPeerToPeer_simple.cpp +++ b/tests/src/hipPeerToPeer_simple.cpp @@ -45,7 +45,7 @@ void help(char *argv[]) { printf ("usage: %s [OPTIONS]\n", argv[0]); printf (" --memcpyWithPeer : Perform memcpy with peer.\n"); - printf (" --mirrorPeersi : Mirror memory onto both default device and peerdevice. If 0, memory is mapped only on the default device.\n"); + printf (" --mirrorPeers : Mirror memory onto both default device and peerdevice. If 0, memory is mapped only on the default device.\n"); printf (" --peerDevice N : Set peer device.\n"); }; @@ -175,6 +175,9 @@ void enablePeerFirst(bool useAsyncCopy) HIPCHECK (myHipMemcpy(A_d1, A_d0, Nbytes, hipMemcpyDefault, 0/*stream*/, useAsyncCopy)); // This is P2P copy. // Copy data back to host: + // Have to wait for previous operation to finish, since we are switching to another one: + HIPCHECK(hipDeviceSynchronize()); + HIPCHECK (hipSetDevice(g_peerDevice)); HIPCHECK (myHipMemcpy(A_h, A_d1, Nbytes, hipMemcpyDeviceToHost, 0/*stream*/, useAsyncCopy)); HIPCHECK(hipDeviceSynchronize()); @@ -241,12 +244,13 @@ void allocMemoryFirst(bool useAsyncCopy) HIPCHECK (hipSetDevice(p_memcpyWithPeer ? g_peerDevice : g_currentDevice)); HIPCHECK (myHipMemcpy(A_d1, A_d0, Nbytes, hipMemcpyDefault, 0/*stream*/, useAsyncCopy)); + syncBothDevices(); // TODO - remove me, should handle this in implementation. + // Copy data back to host: HIPCHECK (hipSetDevice(g_peerDevice)); HIPCHECK (myHipMemcpy(A_h, A_d1, Nbytes, hipMemcpyDeviceToHost, 0/*stream*/, useAsyncCopy)); - HIPCHECK(hipDeviceSynchronize()); - HIPCHECK (hipSetDevice(g_currentDevice)); + syncBothDevices(); // TODO - remove me, should handle this in implementation. //--- @@ -287,15 +291,15 @@ void testPeerHostToDevice(bool useAsyncCopy) size_t Nbytes = N*sizeof(char); - char *A_d0, *A_d1; + char *A_host_d0, *A_d1; char *A_h; A_h = (char*)malloc(Nbytes); // allocate and initialize memory on device0 HIPCHECK (hipSetDevice(g_currentDevice)); - HIPCHECK (hipHostMalloc(&A_d0, Nbytes) ); - HIPCHECK (hipMemset(A_d0, memsetval, Nbytes) ); + HIPCHECK (hipHostMalloc(&A_host_d0, Nbytes) ); + HIPCHECK (hipMemset(A_host_d0, memsetval, Nbytes) ); // allocate and initialize memory on peer device HIPCHECK (hipSetDevice(g_peerDevice)); @@ -314,15 +318,15 @@ void testPeerHostToDevice(bool useAsyncCopy) if (p_memcpyWithPeer) { // p_memcpyWithPeer=1 case is HostToDevice. // if p_mirrorPeers = 1, this is accelerated copy over PCIe. - // if p_mirrorPeers = 0, this should fall back to host (because peer can't see A_d0) + // if p_mirrorPeers = 0, this should fall back to host (because peer can't see A_host_d0) HIPCHECK (hipSetDevice(g_peerDevice)); - HIPCHECK (myHipMemcpy(A_d1, A_d0, Nbytes, hipMemcpyHostToDevice, 0/*stream*/, firstAsyncCopy)); // This is P2P copy. + HIPCHECK (myHipMemcpy(A_d1, A_host_d0, Nbytes, hipMemcpyHostToDevice, 0/*stream*/, firstAsyncCopy)); // This is P2P copy. } else { // p_memcpyWithPeer=0 case is HostToDevice. // if p_mirrorPeers = 1, this is accelerated copy over PCIe. // if p_mirrorPeers = 0, this should fall back to host (because device0 can't see A_d1) HIPCHECK (hipSetDevice(g_currentDevice)); - HIPCHECK (myHipMemcpy(A_d1, A_d0, Nbytes, hipMemcpyHostToDevice, 0/*stream*/, firstAsyncCopy)); // This is P2P copy. + HIPCHECK (myHipMemcpy(A_d1, A_host_d0, Nbytes, hipMemcpyHostToDevice, 0/*stream*/, firstAsyncCopy)); // This is P2P copy. } syncBothDevices(); From abf6872b2b5ba5d9be2eef8bb5d0f41a89092bdc Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 11 Nov 2016 12:25:23 -0600 Subject: [PATCH 60/65] fixed multi-dim module kernel launch Change-Id: Id1d81f2375d058979ab526433f905cf0ea3d23d6 --- src/hip_hcc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 5218f4e2eb..e5619f723e 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -525,7 +525,7 @@ void ihipStream_t::launchModuleKernel( (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) | (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); - uint16_t setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + uint16_t setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; uint32_t header32 = header | (setup << 16); __atomic_store_n((uint32_t*)(dispatch_packet), header32, __ATOMIC_RELEASE); From 50867efa10e0d807ffec39a40b9a1b03833fb3ca Mon Sep 17 00:00:00 2001 From: pensun Date: Fri, 11 Nov 2016 15:09:31 -0600 Subject: [PATCH 61/65] Add direct test case for threadfence_system workaround Change-Id: I5b21b590e957c901044741ac94e816cd8b1426f9 --- CMakeLists.txt | 9 -- src/hip_memory.cpp | 84 ++++++++++++------- .../memory/hipMemoryAllocateCoherent.cpp | 61 ++++++++++++++ .../hipMemoryAllocateCoherentDriver.cpp | 60 +++++++++++++ 4 files changed, 173 insertions(+), 41 deletions(-) create mode 100644 tests/src/runtimeApi/memory/hipMemoryAllocateCoherent.cpp create mode 100644 tests/src/runtimeApi/memory/hipMemoryAllocateCoherentDriver.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index e5c3d51c6a..ccd390fbe5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -142,15 +142,6 @@ if(NOT DEFINED COMPILE_HIP_ATP_MARKER) endif() add_to_config(_buildInfo COMPILE_HIP_ATP_MARKER) -# Check if we need to force finegrained system memory allocation -if(NOT DEFINED HIP_COHERENT_HOST_ALLOC) - if(NOT DEFINED ENV{HIP_COHERENT_HOST_ALLOC}) - set(HIP_COHERENT_HOST_ALLOC 0) - else() - set(HIP_COHERENT_HOST_ALLOC $ENV{HIP_COHERENT_HOST_ALLOC}) - endif() -endif() -add_to_config(_buildInfo HIP_COHERENT_HOST_ALLOC) ############################# # Build steps diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 2f1eb1e27f..672b9f2ee2 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -129,12 +129,12 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) LockedAccessor_CtxCrit_t crit(ctx->criticalData()); // the peerCnt always stores self so make sure the trace actually peerCnt = crit->peerCnt(); - tprintf(DB_MEM, " allocated device_mem ptr:%p size:%zu on dev:%d and allowed %d other peer(s) access\n", + tprintf(DB_MEM, " allocated device_mem ptr:%p size:%zu on dev:%d and allowed %d other peer(s) access\n", *ptr, sizeBytes, device->_deviceId, peerCnt-1); - if (peerCnt > 1) { - + if (peerCnt > 1) { + //printf ("peer self access\n"); - + // TODOD - remove me: for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) { tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":""); @@ -155,6 +155,20 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) return ihipLogStatus(hip_status); } +void ihipReadSingleEnv(int *var_ptr, const char *var_name1, const char *description) +{ + char * env = getenv(var_name1); + + // Default is set when variable is initialized (at top of this file), so only override if we find + // an environment variable. + if (env) { + long int v = strtol(env, NULL, 0); + *var_ptr = (int) (v); + } + if (HIP_PRINT_ENV) { + printf ("%-30s = %2d : %s\n", var_name1, *var_ptr, description); + } +} hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { @@ -173,44 +187,50 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) unsigned trueFlags = flags; if (flags == hipHostMallocDefault) { trueFlags = hipHostMallocMapped | hipHostMallocWriteCombined; - } + } const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | hipHostMallocWriteCombined; + // Read from environment variable of HIP_COHERENT_HOST_ALLOC + int coherent_alloc=0; + ihipReadSingleEnv(&coherent_alloc, "HIP_COHERENT_HOST_ALLOC", "Flag to force allocate finegrained system memory"); + if (flags & ~supportedFlags) { hip_status = hipErrorInvalidValue; - } else { -#if HIP_COHERENT_HOST_ALLOC - // TODO - let's make this an environment variable - *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned); - if(sizeBytes < 1 && (*ptr == NULL)){ - hip_status = hipErrorMemoryAllocation; - } else { - hc::am_memtracker_update(*ptr, device->_deviceId, amHostCoherent); - } - tprintf(DB_MEM, " %s: pinned ptr=%p\n", __func__, *ptr); -#else - // TODO - am_alloc requires writeable __acc, perhaps could be refactored? - // TODO - hipHostMallocMapped is be ignored on ROCM - all memory is mapped to host address space as WC. + } + else { auto device = ctx->getWriteableDevice(); - *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned); - if (*ptr == NULL) { - hip_status = hipErrorMemoryAllocation; - } else { - hc::am_memtracker_update(*ptr, device->_deviceId, flags); - // TODO-hipHostMallocPortable should map the host memory into all contexts, regardless of peer status. - int peerCnt=0; - { - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - peerCnt = crit->peerCnt(); - if (peerCnt > 1) { - hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr); + if(coherent_alloc){ + // Force to allocate finedgrained system memory + *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned); + if(sizeBytes < 1 && (*ptr == NULL)){ + hip_status = hipErrorMemoryAllocation; + } else { + hc::am_memtracker_update(*ptr, device->_deviceId, amHostCoherent); + } + tprintf(DB_MEM, " %s: finegrained system memory ptr=%p\n", __func__, *ptr); + } + else{ + // TODO - am_alloc requires writeable __acc, perhaps could be refactored? + // TODO - hipHostMallocMapped is be ignored on ROCM - all memory is mapped to host address space as WC. + *ptr = hc::am_alloc(sizeBytes, device->_acc, amHostPinned); + if (*ptr == NULL) { + hip_status = hipErrorMemoryAllocation; + } else { + hc::am_memtracker_update(*ptr, device->_deviceId, flags); + // TODO-hipHostMallocPortable should map the host memory into all contexts, regardless of peer status. + int peerCnt=0; + { + LockedAccessor_CtxCrit_t crit(ctx->criticalData()); + peerCnt = crit->peerCnt(); + if (peerCnt > 1) { + hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, *ptr); + } } + tprintf(DB_MEM, "allocated pinned_host ptr:%p size:%zu on dev:%d and allow access to %d other peer(s)\n", *ptr, sizeBytes, device->_deviceId, peerCnt-1); } - tprintf(DB_MEM, "allocated pinned_host ptr:%p size:%zu on dev:%d and allow access to %d other peer(s)\n", *ptr, sizeBytes, device->_deviceId, peerCnt-1); } } -#endif //HIP_COHERENT_HOST_ALLOC } return ihipLogStatus(hip_status); } diff --git a/tests/src/runtimeApi/memory/hipMemoryAllocateCoherent.cpp b/tests/src/runtimeApi/memory/hipMemoryAllocateCoherent.cpp new file mode 100644 index 0000000000..6042f538b3 --- /dev/null +++ b/tests/src/runtimeApi/memory/hipMemoryAllocateCoherent.cpp @@ -0,0 +1,61 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * HIT_END + */ + + +#include +#include "hip/hip_runtime.h" + +__global__ void Kernel(hipLaunchParm lp,volatile float* hostRes) +{ + int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + hostRes[tid] = tid + 1; + __threadfence_system(); + // expecting that the data is getting flushed to host here! + // time waster for-loop (sleep) + for (int timeWater = 0; timeWater < 100000000; timeWater++); +} + +int main() +{ + size_t blocks = 2; + volatile float* hostRes; + hipHostMalloc((void**)&hostRes,blocks*sizeof(float),hipHostMallocMapped); + hostRes[0]=0; + hostRes[1]=0; + hipLaunchKernel(HIP_KERNEL_NAME(Kernel), dim3(1), dim3(blocks), 0, 0, hostRes); + int eleCounter = 0; + while (eleCounter < blocks) + { + // blocks until the value changes + while(hostRes[eleCounter] == 0); + printf("%f\n", hostRes[eleCounter]);; + eleCounter++; + } + hipHostFree((void *)hostRes); + return 0; +} + diff --git a/tests/src/runtimeApi/memory/hipMemoryAllocateCoherentDriver.cpp b/tests/src/runtimeApi/memory/hipMemoryAllocateCoherentDriver.cpp new file mode 100644 index 0000000000..dc512b41f8 --- /dev/null +++ b/tests/src/runtimeApi/memory/hipMemoryAllocateCoherentDriver.cpp @@ -0,0 +1,60 @@ +/* Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +associated documentation files (the "Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the +following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial +portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT +LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO +EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 + * RUN: %t + * HIT_END + */ + +#include +#include +#include +#include +#include +#include +#include +#include "hip/hip_runtime.h" +using namespace std; + +string getRes(){ + FILE *in; + char buff[512], buff_2[512]; + string str = "./hipMemoryAllocateCoherent"; + if(!(in = popen(str.c_str(), "r"))){ + exit(1); + } + fgets(buff, sizeof(buff), in); + fgets(buff_2, sizeof(buff_2), in); + string str_buff = buff; + str_buff += buff_2; + pclose(in); + return str_buff; +} + +int main() { + setenv("HIP_COHERENT_HOST_ALLOC","1000,0,1",1); + string output = getRes(); + istringstream buffer(output); + double res1, res2; + buffer >> res1; + buffer >> res2; + if((res2-res1*2)>0.000001) + exit(1); + std::cout << "PASSED" << std::endl; + return 0; +} From c9401cb95ffe477dc1d23097f639a674b40f3907 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 11 Nov 2016 16:49:23 -0600 Subject: [PATCH 62/65] Add   to demangler Change-Id: I89586c7c17f5152b7a6850d0d6c2aa1d3ebc8190 --- bin/hipdemangleatp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/hipdemangleatp b/bin/hipdemangleatp index f979f59434..456ea9ae15 100755 --- a/bin/hipdemangleatp +++ b/bin/hipdemangleatp @@ -5,7 +5,7 @@ # HIP kernels kernels=$(grep grid_launch_parm $1 | cut -d" " -f1 | sort | uniq) for mangled_sym in $kernels; do - real_sym=$(c++filt -p $(c++filt _$mangled_sym | cut -d: -f3 | sed 's/_functor//g')) + real_sym=$(c++filt -p $(c++filt _$mangled_sym | cut -d: -f3 | sed 's/_functor//g' | sed 's/ /\\\ /g')) #echo "$mangled_sym => $real_sym" >> $1.log sed -i "s/$mangled_sym/$real_sym/g" $1 done @@ -13,7 +13,7 @@ done # HC kernels kernels=$(grep cxxamp_trampoline $1 | cut -d" " -f1 | sort | uniq) for mangled_sym in $kernels; do - real_sym=$(echo $mangled_sym | sed "s/^/_/g; s/_EC_/_$/g" | c++filt -p | cut -d\( -f1 | cut -d" " -f1 --complement) + real_sym=$(echo $mangled_sym | sed "s/^/_/g; s/_EC_/_$/g" | c++filt -p | cut -d\( -f1 | cut -d" " -f1 --complement | sed 's/ /\\\ /g') #echo "$mangled_sym => $real_sym" >> $1.log sed -i "s/$mangled_sym/$real_sym/g" $1 done From faf2a1e01abd63605ff2fd1d881a24162a12398e Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sun, 13 Nov 2016 09:14:50 -0600 Subject: [PATCH 63/65] Add draft doc on profiling with hip. Change-Id: I79727dd2500333b3f16acb381dd5852a15ed408a --- docs/markdown/hip_profiling.md | 95 ++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 docs/markdown/hip_profiling.md diff --git a/docs/markdown/hip_profiling.md b/docs/markdown/hip_profiling.md new file mode 100644 index 0000000000..e4b88945e5 --- /dev/null +++ b/docs/markdown/hip_profiling.md @@ -0,0 +1,95 @@ +# Profiling HIP Code + +HIP provides several capabilities to support debugging and profiling. Profiling information can be displayed to stderr or viewed in the CodeXl visualization tool. + +### Usign CodeXL to profile a HIP Application +By defauly, CodeXL can trace all kernel commands, data transfer commands, and HSA Runtime (ROCr) API calls. +/opt/rocm/bin/rocm-profiler -o -A + +### Using CodeXL markers for HIP Functions +HIP can generate markers at function being/end which are displayed on the CodeXL timeline view. +HIP 1.0 compiles marker support by default, and you can enable it by setting the HIP_PROFILE_API environment variable and then running the rocm-profiler: + +```shell + +# Use profile to generate timeline view: +export HIP_PROFILE_API=1 +/opt/rocm/bin/rocm-profiler -o -A + +Or +/opt/rocm/bin/rocm-profiler -e HIP_PROFILE_API=1 -o -A +``` + +#### Developer Builds +For developer builds, you must enable marker support manually when compiling HIP. + +1. Build HIP with ATP markers enabled +HIP pre-built packages are enabled with ATP marker support by default. +To enable ATP marker support when building HIP from source, use the option ```-DCOMPILE_HIP_ATP_MARKER=1``` during the cmake configure step. + +2. Install ROCm-Profiler +Installing HIP from the [rocm](http://gpuopen.com/getting-started-with-boltzmann-components-platforms-installation/) pre-built packages, installs the ROCm-Profiler as well. +Alternatively, you can build ROCm-Profiler using the instructions [here](https://github.com/RadeonOpenCompute/ROCm-Profiler#building-the-rocm-profiler). + +3. Recompile the target application + +Then follow the steps above to collect a marker-enabled trace. + + +### Using HIP_TRACE_API +You can also print the HIP function strings to stderr using HIP_TRACE_API environment variable. This can also be combined with the more detailed debug information provided +by the HIP_DB switch. For example: +```shell +# Trace to stderr showing being/end of each function (with arguments) + intermediate debug trace during the execution of each function. +HIP_TRACE_API=1 HIP_DB=0x2 ./myHipApp +``` + +#### Color +Note this trace mode uses colors. "less -r" can handle raw control characters and will display the debug output in proper colors. +You can change the color used for the trace mode with the HIP_TRACE_API_COLOR environment variable. Possible values are None/Red/Green/Yellow/Blue/Magenta/Cyan/White. +None will disable use of color control codes and may be useful when saving the trace file or when a pure text trace is desired. + +#### + + +### Using HIP_DB + +This flag is primarily targeted to assist HIP development team in the development of the HIP runtime, but in some situations may be useful to HIP application developers as well. +The HIP debug information is designed to print important information during the execution of a HIP API. HIP provides +different color-coded levels of debug informaton: + - api : Print the beginning and end of each HIP API, including the arguments and return codes. + - sync : Print multi-thread and other synchronization debug information. + - copy : Print which engine is doing the copy, which copy flavor is selected, information on source and destination memory. + - mem : Print information about memory allocation - which pointers are allocated, where they are allocated, peer mappings, and more. + +DB_MEM format is flags separated by '+' sign, or a hex code for the bitmask. Generally the + format is preferred. +For example: +```shell +HIP_DB=api+copy+mem my-application +HIP_DB=0xF my-application +``` +HIP_DB=1 same as HIP_TRACE_API=1 + + + + +Trace provides quick look at API. +Explain output of +Reference the cookbook example. +Command-line profile. +/// disable profiling at the start of the application you can start CodeXLGpuProfiler with the --startdisabled flag. + +Can use strace interleaved with HSA Debug calls . + +HIP_PROFILE_API=1 +HIP_PROFILE_API=2 : Will show the full API in the trace. This can be useful for lower-level debugging when you want to see all the parameters that are passed to a specific API. + +demangle atp + +Write how to collect performance counters. +- include how to compute bandwidth for copy and kernel activity. + +- How to disable HSA APIs. +- Do I need to use profiler with HSA enabled? Do I need to enable HSA profiling on the command line? + +Offline compile, how to visualize. From fd1483ce35265b453a355601c17c7b4ad265b9f0 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 14 Nov 2016 06:05:31 +0530 Subject: [PATCH 64/65] Revert "hipcc: Turn back linking hip_ir.ll by default" This reverts commit 528b25700431880c116dd0f7b0af71a2b7855784. --- bin/hipcc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipcc b/bin/hipcc index 1504d68141..dee0894869 100755 --- a/bin/hipcc +++ b/bin/hipcc @@ -218,7 +218,7 @@ if($HIP_PLATFORM eq "hcc"){ } } -if($HIP_PLATFORM eq "hcc"){ +if(($HIP_PLATFORM eq "hcc") and defined $ENV{HIP_EXPERIMENTAL}){ $EXPORT_LL=" "; $ENV{HCC_EXTRA_LIBRARIES}="$HIP_PATH/lib/hip_ir.ll\n"; } From 09b157ca8c06ca48980284e2ce83dcc57c485af5 Mon Sep 17 00:00:00 2001 From: Sandeep Kumar Date: Wed, 9 Nov 2016 12:06:45 +0530 Subject: [PATCH 65/65] Add p2p for cookbook Change-Id: Id2e77ab31123ef95885d665efe34bc0d4596733a (cherry picked from commit 6fbd0352713ca36e399b1ed4f17c486207a53875) --- samples/2_Cookbook/8_peer2peer/Makefile | 36 +++ samples/2_Cookbook/8_peer2peer/peer2peer.cpp | 241 +++++++++++++++++++ 2 files changed, 277 insertions(+) create mode 100644 samples/2_Cookbook/8_peer2peer/Makefile create mode 100644 samples/2_Cookbook/8_peer2peer/peer2peer.cpp diff --git a/samples/2_Cookbook/8_peer2peer/Makefile b/samples/2_Cookbook/8_peer2peer/Makefile new file mode 100644 index 0000000000..a1dad7d1da --- /dev/null +++ b/samples/2_Cookbook/8_peer2peer/Makefile @@ -0,0 +1,36 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = peer2peer.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./peer2peer + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o + diff --git a/samples/2_Cookbook/8_peer2peer/peer2peer.cpp b/samples/2_Cookbook/8_peer2peer/peer2peer.cpp new file mode 100644 index 0000000000..624de56cb0 --- /dev/null +++ b/samples/2_Cookbook/8_peer2peer/peer2peer.cpp @@ -0,0 +1,241 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANUMTY OF ANY KIND, EXPRESS OR +IMPLIED, INUMCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNUMESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANUMY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INUM AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INUM CONUMECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include +#include +#include +#define WIDTH 32 + +#define NUM (WIDTH*WIDTH) + +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 +#define THREADS_PER_BLOCK_Z 1 + +using namespace std; + +#define KNRM "\x1B[0m" +#define KRED "\x1B[31m" + +#define failed(...) \ + printf ("%serror: ", KRED);\ + printf (__VA_ARGS__);\ + printf ("\n");\ + printf ("error: TEST FAILED\n%s", KNRM );\ + abort(); + +#define HIPCHECK(error) \ +{\ + hipError_t localError = error; \ + if (localError != hipSuccess) { \ + printf("%serror: '%s'(%d) from %s at %s:%d%s\n", \ + KRED, hipGetErrorString(localError), localError,\ + #error,__FILE__, __LINE__, KNRM); \ + failed("API returned error code.");\ + }\ +} + +void checkPeer2PeerSupport() +{ + int gpuCount; + int canAccessPeer; + int p2pCapableDeviceCount=0; + + HIPCHECK(hipGetDeviceCount(&gpuCount)); + + if (gpuCount < 2) + printf("Peer2Peer application requires atleast 2 gpu devices"); + + for (int currentGpu=0; currentGpu eps ) { + printf("%d cpu: %f gpu peered data %f\n",i,randArray[i],TransposeMatrix[1][i]); + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("Peer2Peer PASSED!\n"); + } + + free(randArray); + for(int i=0;i<2;i++){ + hipFree(data[i]); + hipFree(gpuTransposeMatrix[i]); + free(TransposeMatrix[i]); + } + + HIPCHECK(hipSetDevice(peerGpu)); + HIPCHECK(hipDeviceReset()); + + HIPCHECK(hipSetDevice(currentGpu)); + HIPCHECK(hipDeviceReset()); + + return 0; +}