diff --git a/CMakeLists.txt b/CMakeLists.txt index 69f2fb2f1..bf815472c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -179,7 +179,7 @@ endif() # GPU profilers if(Kokkos_ENABLE_CUDA) - add_subdirectory(profiling/nvprof-connector) + add_subdirectory(profiling/nvtx-connector) add_subdirectory(profiling/nvprof-focused-connector) endif() if(Kokkos_ENABLE_HIP) diff --git a/README.md b/README.md index fae07324c..73f1a902a 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Note: `Kokkos` must be configured with `Kokkos_ENABLE_LIBDL=ON` to load profilin ## General Usage - To use one of the tools you have to compile it, which will generate a dynamic library. Before executing the Kokkos application you then have to set the environment variable `KOKKOS_TOOLS_LIBS` to point to the dynamic library e.g. in the `bash` shell: +To use one of the tools you have to compile it, which will generate a dynamic library. Before executing the Kokkos application you then have to set the environment variable `KOKKOS_TOOLS_LIBS` to point to the dynamic library e.g. in the `bash` shell: ``` export KOKKOS_TOOLS_LIBS=${HOME}/kokkos-tools/src/tools/memory-events/kp_memory_event.so ``` @@ -69,18 +69,21 @@ The following provides an overview of the tools available in the set of Kokkos T Like VTuneConnector but turns profiling off outside of kernels. Should be used in conjunction with the KernelFilter tool. ++ [**NVTXConnector:**](https://github.com/kokkos/kokkos-tools/wiki/NVTXConnector) + + Provides Kokkos Kernel Names to NVTX, so that analysis can be performed on a per-kernel basis. + + [**Timemory:**](https://github.com/kokkos/kokkos-tools/wiki/Timemory) Modular connector for accumulating timing, memory usage, hardware counters, and other various metrics. Supports controlling VTune, CUDA profilers, and TAU + kernel name forwarding to VTune, NVTX, TAU, Caliper, and LIKWID. - ##### If you need to write your own plug-in, this provides a straight-forward API to writing the plug-in. 
+ ##### If you need to write your own plug-in, this provides a straightforward API for writing the plug-in. Defining a timemory component will enable your plug-in to output to stdout, text, and JSON, accumulate statistics, and utilize various portable function calls for common needs w.r.t. timers, resource usage, etc. - # Building Kokkos Tools @@ -91,18 +94,18 @@ Use either CMake or Makefile to build Kokkos Tools. 1. create a build directory in Kokkos Tools, e.g., type `mkdir myBuild; cd myBuild` 2. To configure the Type `ccmake ..` for any options you would like to enable/disable. 3. To compile, type `make` -4. To install, type `make install` +4. To install, type `make install` ## Using make To build with make, simply type `make` within each subdirectory of Kokkos Tools. -Building with Makefiles is currently recommended. +Building using `make` is currently recommended. Eventually, the preferred method of building will be `cmake`. # Running a Kokkos-based Application with a tool -Given your tool shared library .so (which contains kokkos profiling callback functions) and an application executable called yourApplication.exe, type: +Given your tool shared library `.so` (which contains Kokkos profiling callback functions) and an application executable called yourApplication.exe, type: `export KOKKOS_TOOLS_LIBS=${YOUR_KOKKOS_TOOLS_DIR}/; ./yourApplication.exe` diff --git a/profiling/all/kp_all.cpp b/profiling/all/kp_all.cpp index 6199cefcb..a1c9683a6 100644 --- a/profiling/all/kp_all.cpp +++ b/profiling/all/kp_all.cpp @@ -49,7 +49,7 @@ KOKKOSTOOLS_EXTERN_EVENT_SET(VTuneFocusedConnector) KOKKOSTOOLS_EXTERN_EVENT_SET(VariorumConnector) #endif #ifdef KOKKOSTOOLS_HAS_NVPROF -KOKKOSTOOLS_EXTERN_EVENT_SET(NVProfConnector) +KOKKOSTOOLS_EXTERN_EVENT_SET(NVTXConnector) KOKKOSTOOLS_EXTERN_EVENT_SET(NVProfFocusedConnector) #endif #ifdef KOKKOSTOOLS_HAS_CALIPER @@ -91,7 +91,7 @@ EventSet get_event_set(const char* profiler, const char* config_str) { handlers["caliper"] = 
cali::get_kokkos_event_set(config_str); #endif #ifdef KOKKOSTOOLS_HAS_NVPROF - handlers["nvprof-connector"] = NVProfConnector::get_event_set(); + handlers["nvtx-connector"] = NVTXConnector::get_event_set(); handlers["nvprof-focused-connector"] = NVProfFocusedConnector::get_event_set(); #endif diff --git a/profiling/all/kp_core.hpp b/profiling/all/kp_core.hpp index a1834159b..5cb5ed391 100644 --- a/profiling/all/kp_core.hpp +++ b/profiling/all/kp_core.hpp @@ -46,6 +46,8 @@ using Kokkos::Tools::SpaceHandle; #define EXPOSE_STOP_PROFILE_SECTION(FUNC_NAME) #define EXPOSE_DESTROY_PROFILE_SECTION(FUNC_NAME) #define EXPOSE_PROFILE_EVENT(FUNC_NAME) +#define EXPOSE_BEGIN_FENCE(FUNC_NAME) +#define EXPOSE_END_FENCE(FUNC_NAME) #else @@ -165,6 +167,17 @@ using Kokkos::Tools::SpaceHandle; FUNC_NAME(name); \ } +#define EXPOSE_BEGIN_FENCE(FUNC_NAME) \ + __attribute__((weak)) void kokkosp_begin_fence( \ + const char* name, const uint32_t deviceId, uint64_t* handle) { \ + FUNC_NAME(name, deviceId, handle); \ + } + +#define EXPOSE_END_FENCE(FUNC_NAME) \ + __attribute__((weak)) void kokkosp_end_fence(uint64_t handle) { \ + FUNC_NAME(handle); \ + } + #define EXPOSE_DUAL_VIEW_SYNC(FUNC_NAME) \ __attribute__((weak)) void kokkosp_dual_view_sync( \ const char* name, const void* const ptr, bool is_device) { \ diff --git a/profiling/nvprof-connector/CMakeLists.txt b/profiling/nvprof-connector/CMakeLists.txt deleted file mode 100644 index eae33dc2d..000000000 --- a/profiling/nvprof-connector/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -find_package(CUDAToolkit REQUIRED) -kp_add_library(kp_nvprof_connector kp_nvprof_connector.cpp) - -target_link_libraries(kp_nvprof_connector CUDA::nvToolsExt) \ No newline at end of file diff --git a/profiling/nvprof-connector/kp_nvprof_connector_domain.h b/profiling/nvprof-connector/kp_nvprof_connector_domain.h deleted file mode 100644 index cbaf63ac2..000000000 --- a/profiling/nvprof-connector/kp_nvprof_connector_domain.h +++ /dev/null @@ -1,78 +0,0 @@ 
-//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef _H_KOKKOSP_KERNEL_NVPROF_CONNECTOR_INFO -#define _H_KOKKOSP_KERNEL_NVPROF_CONNECTOR_INFO - -#include -#include -#include - -#include "nvToolsExt.h" - -enum KernelExecutionType { - PARALLEL_FOR = 0, - PARALLEL_REDUCE = 1, - PARALLEL_SCAN = 2 -}; - -class KernelNVProfConnectorInfo { - public: - KernelNVProfConnectorInfo(std::string kName, KernelExecutionType kernelType) { - domainNameHandle = kName; - char* domainName = (char*)malloc(sizeof(char*) * (32 + kName.size())); - - if (kernelType == PARALLEL_FOR) { - sprintf(domainName, "ParallelFor.%s", kName.c_str()); - } else if (kernelType == PARALLEL_REDUCE) { - sprintf(domainName, "ParallelReduce.%s", kName.c_str()); - } else if (kernelType == PARALLEL_SCAN) { - sprintf(domainName, "ParallelScan.%s", kName.c_str()); - } else { - sprintf(domainName, "Kernel.%s", kName.c_str()); - } - - domain = nvtxDomainCreateA(domainName); - currentRange = 0; - } - - nvtxRangeId_t startRange() { - nvtxEventAttributes_t eventAttrib = {0}; - eventAttrib.version = NVTX_VERSION; - eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; - eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; - eventAttrib.message.ascii = "Kernel"; - currentRange = nvtxDomainRangeStartEx(domain, &eventAttrib); - return currentRange; - } - - nvtxRangeId_t getCurrentRange() { return currentRange; } - - void endRange() { nvtxDomainRangeEnd(domain, currentRange); } - - nvtxDomainHandle_t getDomain() { return domain; } - - 
std::string getDomainNameHandle() { return domainNameHandle; } - - ~KernelNVProfConnectorInfo() { nvtxDomainDestroy(domain); } - - private: - std::string domainNameHandle; - nvtxRangeId_t currentRange; - nvtxDomainHandle_t domain; -}; - -#endif diff --git a/profiling/nvtx-connector/CMakeLists.txt b/profiling/nvtx-connector/CMakeLists.txt new file mode 100644 index 000000000..5082532cb --- /dev/null +++ b/profiling/nvtx-connector/CMakeLists.txt @@ -0,0 +1,4 @@ +find_package(CUDAToolkit REQUIRED) +kp_add_library(kp_nvtx_connector kp_nvtx_connector.cpp) + +target_link_libraries(kp_nvtx_connector CUDA::nvToolsExt) diff --git a/profiling/nvprof-connector/Makefile b/profiling/nvtx-connector/Makefile similarity index 50% rename from profiling/nvprof-connector/Makefile rename to profiling/nvtx-connector/Makefile index bff0a30cf..35071bfcf 100644 --- a/profiling/nvprof-connector/Makefile +++ b/profiling/nvtx-connector/Makefile @@ -4,15 +4,15 @@ LDFLAGS=-L$(CUDA_ROOT)/lib64 LIBS=-lnvToolsExt SHARED_CXXFLAGS=-shared -fPIC -all: kp_nvprof_connector.so +all: kp_nvtx_connector.so MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) -CXXFLAGS+=-I${MAKEFILE_PATH} -I${MAKEFILE_PATH}/../../common/makefile-only -I${MAKEFILE_PATH}../all +CXXFLAGS+=-I${MAKEFILE_PATH} -I${MAKEFILE_PATH}../../common/makefile-only -I${MAKEFILE_PATH}../all -kp_nvprof_connector.so: ${MAKEFILE_PATH}kp_nvprof_connector.cpp +kp_nvtx_connector.so: ${MAKEFILE_PATH}kp_nvtx_connector.cpp $(CXX) $(SHARED_CXXFLAGS) $(CXXFLAGS) $(LDFLAGS) \ - -o $@ ${MAKEFILE_PATH}kp_nvprof_connector.cpp $(LIBS) + -o $@ ${MAKEFILE_PATH}kp_nvtx_connector.cpp $(LIBS) clean: - rm *.so + rm -f kp_nvtx_connector.so diff --git a/profiling/nvprof-connector/kp_nvprof_connector.cpp b/profiling/nvtx-connector/kp_nvtx_connector.cpp similarity index 56% rename from profiling/nvprof-connector/kp_nvprof_connector.cpp rename to profiling/nvtx-connector/kp_nvtx_connector.cpp index 34bdecff9..9ef3ce241 100644 --- 
a/profiling/nvprof-connector/kp_nvprof_connector.cpp
+++ b/profiling/nvtx-connector/kp_nvtx_connector.cpp
@@ -18,17 +18,24 @@
 #include
 #include
 #include
+#include <cstdlib>
+#include <cstring>
+#include <limits>
 
 #include "nvToolsExt.h"
 
 #include "kp_core.hpp"
 
+// Cached KOKKOS_TOOLS_GLOBALFENCES setting; written by kokkosp_init_library.
+static bool tool_globfences;
+
 namespace KokkosTools {
-namespace NVProfConnector {
+namespace NVTXConnector {
 
 void kokkosp_request_tool_settings(const uint32_t,
                                    Kokkos_Tools_ToolSettings* settings) {
-  settings->requires_global_fencing = false;
+  // Opt-in flag; NOTE(review): assumes kokkosp_init_library ran first.
+  settings->requires_global_fencing = tool_globfences;
 }
 
 void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer,
@@ -39,9 +46,15 @@ void kokkosp_init_library(const int loadSeq, const uint64_t interfaceVer,
          loadSeq, (unsigned long long)(interfaceVer));
   printf("-----------------------------------------------------------\n");
 
+  // Any non-zero integer enables global fencing; unset keeps it disabled.
+  const char* tool_global_fences = getenv("KOKKOS_TOOLS_GLOBALFENCES");
+  if (NULL != tool_global_fences) {
+    tool_globfences = (atoi(tool_global_fences) != 0);
+  }
+
   nvtxNameOsThread(pthread_self(), "Application Main Thread");
   nvtxMarkA("Kokkos::Initialization Complete");
-}
+} // end kokkosp_init_library
 
 void kokkosp_finalize_library() {
   printf("-----------------------------------------------------------\n");
@@ -87,24 +100,6 @@ struct Section {
 std::vector<Section>
kokkosp_sections;
 } // namespace
 
-Kokkos::Tools::Experimental::EventSet get_event_set() {
-  Kokkos::Tools::Experimental::EventSet my_event_set;
-  memset(&my_event_set, 0,
-         sizeof(my_event_set)); // zero any pointers not set here
-  my_event_set.request_tool_settings = kokkosp_request_tool_settings;
-  my_event_set.init = kokkosp_init_library;
-  my_event_set.finalize = kokkosp_finalize_library;
-  my_event_set.push_region = kokkosp_push_profile_region;
-  my_event_set.pop_region = kokkosp_pop_profile_region;
-  my_event_set.begin_parallel_for = kokkosp_begin_parallel_for;
-  my_event_set.begin_parallel_reduce = kokkosp_begin_parallel_reduce;
-  my_event_set.begin_parallel_scan = kokkosp_begin_parallel_scan;
-  my_event_set.end_parallel_for = kokkosp_end_parallel_for;
-  my_event_set.end_parallel_reduce = kokkosp_end_parallel_reduce;
-  my_event_set.end_parallel_scan = kokkosp_end_parallel_scan;
-  return my_event_set;
-}
-
 void kokkosp_create_profile_section(const char* name, uint32_t* sID) {
   *sID = kokkosp_sections.size();
   kokkosp_sections.push_back(
@@ -121,12 +116,61 @@ void kokkosp_stop_profile_section(const uint32_t sID) {
   nvtxRangeEnd(section.id);
 }
 
-} // namespace NVProfConnector
+// Records a one-off NVTX marker carrying the event name.
+void kokkosp_profile_event(const char* name) { nvtxMarkA(name); }
+
+// Opens an NVTX range named after the fence and stores its id in *handle.
+// Fences whose label contains "Kokkos Profile Tool Fence" are skipped (a
+// range for them would only duplicate information and add overhead); *handle
+// is then set to the maximum uint64_t, assumed never to be a real range id,
+// so that kokkosp_end_fence can recognise the skipped case.
+void kokkosp_begin_fence(const char* name, const uint32_t deviceId,
+                         uint64_t* handle) {
+  if (std::strstr(name, "Kokkos Profile Tool Fence")) {
+    *handle = std::numeric_limits<uint64_t>::max();
+  } else {
+    nvtxRangeId_t id = nvtxRangeStartA(name);
+    *handle = id; // handle will be provided back to end_fence
+  }
+}
+
+// Closes the NVTX range opened by kokkosp_begin_fence, unless handle is the
+// "no range was opened" sentinel.
+void kokkosp_end_fence(uint64_t handle) {
+  if (handle == std::numeric_limits<uint64_t>::max()) return;
+  nvtxRangeEnd(static_cast<nvtxRangeId_t>(handle));
+}
+
+Kokkos::Tools::Experimental::EventSet get_event_set() {
+  Kokkos::Tools::Experimental::EventSet my_event_set;
+  memset(&my_event_set, 0,
+         sizeof(my_event_set)); // zero any pointers not set here
+  my_event_set.request_tool_settings = kokkosp_request_tool_settings;
+  my_event_set.init = kokkosp_init_library;
+  my_event_set.finalize = kokkosp_finalize_library;
+  my_event_set.push_region = kokkosp_push_profile_region;
+  my_event_set.pop_region = kokkosp_pop_profile_region;
+  my_event_set.begin_parallel_for = kokkosp_begin_parallel_for;
+  my_event_set.begin_parallel_reduce = kokkosp_begin_parallel_reduce;
+  my_event_set.begin_parallel_scan = kokkosp_begin_parallel_scan;
+  my_event_set.end_parallel_for = kokkosp_end_parallel_for;
+  my_event_set.end_parallel_reduce = kokkosp_end_parallel_reduce;
+  my_event_set.end_parallel_scan = kokkosp_end_parallel_scan;
+  my_event_set.create_profile_section = kokkosp_create_profile_section;
+  my_event_set.start_profile_section = kokkosp_start_profile_section;
+  my_event_set.stop_profile_section = kokkosp_stop_profile_section;
+  my_event_set.profile_event = kokkosp_profile_event;
+  my_event_set.begin_fence = kokkosp_begin_fence;
+  my_event_set.end_fence = kokkosp_end_fence;
+  return my_event_set;
+}
+
+} // namespace NVTXConnector
 } // namespace KokkosTools
 
 extern "C" {
 
-namespace impl = KokkosTools::NVProfConnector;
+namespace impl = KokkosTools::NVTXConnector;
 
 EXPOSE_TOOL_SETTINGS(impl::kokkosp_request_tool_settings)
EXPOSE_INIT(impl::kokkosp_init_library)
@@ -139,5 +183,11 @@ EXPOSE_BEGIN_PARALLEL_SCAN(impl::kokkosp_begin_parallel_scan)
 EXPOSE_END_PARALLEL_SCAN(impl::kokkosp_end_parallel_scan)
 EXPOSE_BEGIN_PARALLEL_REDUCE(impl::kokkosp_begin_parallel_reduce)
 EXPOSE_END_PARALLEL_REDUCE(impl::kokkosp_end_parallel_reduce)
-// TODO: expose section stuff
+// Profile sections, profile events and fences are exported as well.
+EXPOSE_CREATE_PROFILE_SECTION(impl::kokkosp_create_profile_section)
+EXPOSE_START_PROFILE_SECTION(impl::kokkosp_start_profile_section)
+EXPOSE_STOP_PROFILE_SECTION(impl::kokkosp_stop_profile_section)
+EXPOSE_PROFILE_EVENT(impl::kokkosp_profile_event)
+EXPOSE_BEGIN_FENCE(impl::kokkosp_begin_fence)
+EXPOSE_END_FENCE(impl::kokkosp_end_fence)
 } // extern "C"