diff --git a/source/adapters/level_zero/context.cpp b/source/adapters/level_zero/context.cpp index faa16d48dd..169c8ec097 100644 --- a/source/adapters/level_zero/context.cpp +++ b/source/adapters/level_zero/context.cpp @@ -422,6 +422,7 @@ ur_result_t ur_context_handle_t_::finalize() { for (auto &EventCache : EventCaches) { for (auto &Event : EventCache) { auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent)); + Event->ZeEvent = nullptr; // Gracefully handle the case that L0 was already unloaded. if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) return ze2urResult(ZeResult); diff --git a/source/adapters/level_zero/event.cpp b/source/adapters/level_zero/event.cpp index eae16f0c57..a854c50fd9 100644 --- a/source/adapters/level_zero/event.cpp +++ b/source/adapters/level_zero/event.cpp @@ -1051,6 +1051,26 @@ ur_result_t ur_event_handle_t_::getOrCreateHostVisibleEvent( return UR_RESULT_SUCCESS; } +/** + * @brief Destructor for the ur_event_handle_t_ class. + * + * This destructor is responsible for cleaning up the event handle when the + * object is destroyed. It checks if the event (`ZeEvent`) is valid and if the + * event has been completed (`Completed`). If both conditions are met, it + * further checks if the associated queue (`UrQueue`) is valid and if it is not + * set to discard events. If all conditions are satisfied, it calls + * `zeEventDestroy` to destroy the event. + * + * This ensures that resources are properly released and avoids potential memory + * leaks or resource mismanagement. 
+ */ +ur_event_handle_t_::~ur_event_handle_t_() { + if (this->ZeEvent && this->Completed) { + if (this->UrQueue && !this->UrQueue->isDiscardEvents()) + ZE_CALL_NOCHECK(zeEventDestroy, (this->ZeEvent)); + } +} + ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { if (!Event->RefCount.decrementAndTest()) return UR_RESULT_SUCCESS; @@ -1073,6 +1093,7 @@ ur_result_t urEventReleaseInternal(ur_event_handle_t Event) { if (Event->OwnNativeHandle) { if (DisableEventsCaching) { auto ZeResult = ZE_CALL_NOCHECK(zeEventDestroy, (Event->ZeEvent)); + Event->ZeEvent = nullptr; // Gracefully handle the case that L0 was already unloaded. if (ZeResult && ZeResult != ZE_RESULT_ERROR_UNINITIALIZED) return ze2urResult(ZeResult); diff --git a/source/adapters/level_zero/event.hpp b/source/adapters/level_zero/event.hpp index de018e7060..efae32f361 100644 --- a/source/adapters/level_zero/event.hpp +++ b/source/adapters/level_zero/event.hpp @@ -156,6 +156,8 @@ struct ur_event_handle_t_ : _ur_object { reinterpret_cast(HostVisibleEvent)); } + ~ur_event_handle_t_(); + // Provide direct access to Context, instead of going via queue. // Not every PI event has a queue, and we need a handle to Context // to get to event pool related information. diff --git a/source/adapters/level_zero/image.cpp b/source/adapters/level_zero/image.cpp index 09bdc16a64..8c205f54c5 100644 --- a/source/adapters/level_zero/image.cpp +++ b/source/adapters/level_zero/image.cpp @@ -265,6 +265,16 @@ ur_result_t ze2urImageFormat(const ze_image_desc_t *ZeImageDesc, return UR_RESULT_SUCCESS; } +static bool Is3ChannelOrder(ur_image_channel_order_t ChannelOrder) { + switch (ChannelOrder) { + case UR_IMAGE_CHANNEL_ORDER_RGB: + case UR_IMAGE_CHANNEL_ORDER_RGX: + return true; + default: + return false; + } +} + /// Construct ZE image desc from UR image format and desc. 
ur_result_t ur2zeImageDesc(const ur_image_format_t *ImageFormat, const ur_image_desc_t *ImageDesc, @@ -843,6 +853,14 @@ ur_result_t urBindlessImagesImageCopyExp( UR_CALL(ur2zeImageDesc(pSrcImageFormat, pSrcImageDesc, ZeImageDesc)); bool UseCopyEngine = hQueue->useCopyEngine(/*PreferCopyEngine*/ true); + // Due to the limitation of the copy engine, disable usage of Copy Engine + // Given 3 channel image + if (Is3ChannelOrder( + ur_cast(pSrcImageFormat->channelOrder)) || + Is3ChannelOrder( + ur_cast(pDstImageFormat->channelOrder))) { + UseCopyEngine = false; + } _ur_ze_event_list_t TmpWaitList; UR_CALL(TmpWaitList.createAndRetainUrZeEventList( diff --git a/source/adapters/level_zero/program.cpp b/source/adapters/level_zero/program.cpp index be8c366d6b..b5a64c3eda 100644 --- a/source/adapters/level_zero/program.cpp +++ b/source/adapters/level_zero/program.cpp @@ -452,11 +452,9 @@ ur_result_t urProgramLinkExp( // Build flags may be different for different devices, so handle them // here. Clear values of the previous device first. 
BuildFlagPtrs.clear(); - std::vector TemporaryOptionsStrings; for (uint32_t I = 0; I < count; I++) { - TemporaryOptionsStrings.push_back( - phPrograms[I]->getBuildOptions(ZeDevice)); - BuildFlagPtrs.push_back(TemporaryOptionsStrings.back().c_str()); + BuildFlagPtrs.push_back( + phPrograms[I]->getBuildOptions(ZeDevice).c_str()); } ZeExtModuleDesc.pBuildFlags = BuildFlagPtrs.data(); if (count == 1) diff --git a/source/adapters/level_zero/program.hpp b/source/adapters/level_zero/program.hpp index 4fe8c24acd..90b297fa40 100644 --- a/source/adapters/level_zero/program.hpp +++ b/source/adapters/level_zero/program.hpp @@ -169,7 +169,7 @@ struct ur_program_handle_t_ : _ur_object { DeviceDataMap[ZeDevice].BuildFlags += Options; } - std::string getBuildOptions(ze_device_handle_t ZeDevice) { + std::string &getBuildOptions(ze_device_handle_t ZeDevice) { return DeviceDataMap[ZeDevice].BuildFlags; } diff --git a/source/loader/CMakeLists.txt b/source/loader/CMakeLists.txt index d8f6056ae9..a10e99f422 100644 --- a/source/loader/CMakeLists.txt +++ b/source/loader/CMakeLists.txt @@ -136,6 +136,7 @@ if(UR_ENABLE_SANITIZER) ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_buffer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_buffer.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_ddi.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_ddi.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_interceptor.cpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_interceptor.hpp ${CMAKE_CURRENT_SOURCE_DIR}/layers/sanitizer/asan/asan_libdevice.hpp diff --git a/source/loader/layers/sanitizer/asan/asan_ddi.cpp b/source/loader/layers/sanitizer/asan/asan_ddi.cpp index f8ded3ec7a..bf4dff157a 100644 --- a/source/loader/layers/sanitizer/asan/asan_ddi.cpp +++ b/source/loader/layers/sanitizer/asan/asan_ddi.cpp @@ -52,15 +52,6 @@ ur_result_t setupContext(ur_context_handle_t Context, uint32_t numDevices, return UR_RESULT_SUCCESS; } -bool 
isInstrumentedKernel(ur_kernel_handle_t hKernel) { - auto hProgram = GetProgram(hKernel); - auto PI = getAsanInterceptor()->getProgramInfo(hProgram); - if (PI == nullptr) { - return false; - } - return PI->isKernelInstrumented(hKernel); -} - } // namespace /////////////////////////////////////////////////////////////////////////////// @@ -470,15 +461,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunch( getContext()->logger.debug("==== urEnqueueKernelLaunch"); - if (!isInstrumentedKernel(hKernel)) { - return pfnKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numEventsInWaitList, phEventWaitList, phEvent); - } - LaunchInfo LaunchInfo(GetContext(hQueue), GetDevice(hQueue), pGlobalWorkSize, pLocalWorkSize, pGlobalWorkOffset, workDim); + UR_CALL(LaunchInfo.Data.syncToDevice(hQueue)); UR_CALL(getAsanInterceptor()->preLaunchKernel(hKernel, hQueue, LaunchInfo)); @@ -1366,9 +1352,7 @@ __urdlllocal ur_result_t UR_APICALL urKernelCreate( getContext()->logger.debug("==== urKernelCreate"); UR_CALL(pfnCreate(hProgram, pKernelName, phKernel)); - if (isInstrumentedKernel(*phKernel)) { - UR_CALL(getAsanInterceptor()->insertKernel(*phKernel)); - } + UR_CALL(getAsanInterceptor()->insertKernel(*phKernel)); return UR_RESULT_SUCCESS; } @@ -1389,9 +1373,7 @@ __urdlllocal ur_result_t UR_APICALL urKernelRetain( UR_CALL(pfnRetain(hKernel)); auto KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel); - if (KernelInfo) { - KernelInfo->RefCount++; - } + KernelInfo->RefCount++; return UR_RESULT_SUCCESS; } @@ -1411,10 +1393,8 @@ __urdlllocal ur_result_t urKernelRelease( UR_CALL(pfnRelease(hKernel)); auto KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel); - if (KernelInfo) { - if (--KernelInfo->RefCount == 0) { - UR_CALL(getAsanInterceptor()->eraseKernel(hKernel)); - } + if (--KernelInfo->RefCount == 0) { + UR_CALL(getAsanInterceptor()->eraseKernel(hKernel)); } return UR_RESULT_SUCCESS; @@ -1440,11 +1420,10 @@ __urdlllocal 
ur_result_t UR_APICALL urKernelSetArgValue( getContext()->logger.debug("==== urKernelSetArgValue"); std::shared_ptr MemBuffer; - std::shared_ptr KernelInfo; if (argSize == sizeof(ur_mem_handle_t) && (MemBuffer = getAsanInterceptor()->getMemBuffer( - *ur_cast(pArgValue))) && - (KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel))) { + *ur_cast(pArgValue)))) { + auto KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel); std::scoped_lock Guard(KernelInfo->Mutex); KernelInfo->BufferArgs[argIndex] = std::move(MemBuffer); } else { @@ -1473,9 +1452,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgMemObj( getContext()->logger.debug("==== urKernelSetArgMemObj"); std::shared_ptr MemBuffer; - std::shared_ptr KernelInfo; - if ((MemBuffer = getAsanInterceptor()->getMemBuffer(hArgValue)) && - (KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel))) { + if ((MemBuffer = getAsanInterceptor()->getMemBuffer(hArgValue))) { + auto KernelInfo = getAsanInterceptor()->getKernelInfo(hKernel); std::scoped_lock Guard(KernelInfo->Mutex); KernelInfo->BufferArgs[argIndex] = std::move(MemBuffer); } else { @@ -1505,7 +1483,8 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgLocal( "==== urKernelSetArgLocal (argIndex={}, argSize={})", argIndex, argSize); - if (auto KI = getAsanInterceptor()->getKernelInfo(hKernel)) { + { + auto KI = getAsanInterceptor()->getKernelInfo(hKernel); std::scoped_lock Guard(KI->Mutex); // TODO: get local variable alignment auto argSizeWithRZ = GetSizeAndRedzoneSizeForLocal( @@ -1542,8 +1521,9 @@ __urdlllocal ur_result_t UR_APICALL urKernelSetArgPointer( pArgValue); std::shared_ptr KI; - if (getAsanInterceptor()->getOptions().DetectKernelArguments && - (KI = getAsanInterceptor()->getKernelInfo(hKernel))) { + if (getAsanInterceptor()->getOptions().DetectKernelArguments) { + // Fill the KI declared above instead of shadowing it with a new local. + KI = getAsanInterceptor()->getKernelInfo(hKernel); std::scoped_lock Guard(KI->Mutex); KI->PointerArgs[argIndex] = {pArgValue, GetCurrentBacktrace()}; } diff --git 
a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp index 02dcb3d0d3..31c35201de 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.cpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.cpp @@ -644,7 +644,13 @@ ur_result_t AsanInterceptor::insertKernel(ur_kernel_handle_t Kernel) { if (m_KernelMap.find(Kernel) != m_KernelMap.end()) { return UR_RESULT_SUCCESS; } - m_KernelMap.emplace(Kernel, std::make_shared(Kernel)); + // getProgramInfo() can return null; such kernels count as not instrumented. + auto hProgram = GetProgram(Kernel); + auto PI = getAsanInterceptor()->getProgramInfo(hProgram); + bool IsInstrumented = PI && PI->isKernelInstrumented(Kernel); + + m_KernelMap.emplace(Kernel, + std::make_shared(Kernel, IsInstrumented)); return UR_RESULT_SUCCESS; } @@ -685,9 +691,19 @@ ur_result_t AsanInterceptor::prepareLaunch( std::shared_ptr &ContextInfo, std::shared_ptr &DeviceInfo, ur_queue_handle_t Queue, ur_kernel_handle_t Kernel, LaunchInfo &LaunchInfo) { - auto KernelInfo = getKernelInfo(Kernel); - assert(KernelInfo && "Kernel should be instrumented"); + + auto ArgNums = GetKernelNumArgs(Kernel); + auto LocalMemoryUsage = + GetKernelLocalMemorySize(Kernel, DeviceInfo->Handle); + auto PrivateMemoryUsage = + GetKernelPrivateMemorySize(Kernel, DeviceInfo->Handle); + + getContext()->logger.info( + "KernelInfo {} (Name={}, ArgNums={}, IsInstrumented={}, " + "LocalMemory={}, PrivateMemory={})", + (void *)Kernel, GetKernelName(Kernel), ArgNums, + KernelInfo->IsInstrumented, LocalMemoryUsage, PrivateMemoryUsage); // Validate pointer arguments if (getOptions().DetectKernelArguments) { @@ -719,11 +735,17 @@ ur_result_t AsanInterceptor::prepareLaunch( } } - auto ArgNums = GetKernelNumArgs(Kernel); + if (!KernelInfo->IsInstrumented) { + return UR_RESULT_SUCCESS; + } + // We must prepare all kernel args before call // urKernelGetSuggestedLocalWorkSize, otherwise the call will fail on // CPU device. 
- if (ArgNums) { + { + assert(ArgNums >= 1 && + "Sanitized Kernel should have at least one argument"); + ur_result_t URes = getContext()->urDdiTable.Kernel.pfnSetArgPointer( Kernel, ArgNums - 1, nullptr, LaunchInfo.Data.getDevicePtr()); if (URes != UR_RESULT_SUCCESS) { @@ -763,15 +785,6 @@ ur_result_t AsanInterceptor::prepareLaunch( LaunchInfo.Data.Host.DeviceTy = DeviceInfo->Type; LaunchInfo.Data.Host.Debug = getOptions().Debug ? 1 : 0; - auto LocalMemoryUsage = - GetKernelLocalMemorySize(Kernel, DeviceInfo->Handle); - auto PrivateMemoryUsage = - GetKernelPrivateMemorySize(Kernel, DeviceInfo->Handle); - - getContext()->logger.info( - "KernelInfo {} (LocalMemory={}, PrivateMemory={})", (void *)Kernel, - LocalMemoryUsage, PrivateMemoryUsage); - // Write shadow memory offset for local memory if (getOptions().DetectLocals) { if (DeviceInfo->Shadow->AllocLocalShadow( @@ -831,10 +844,12 @@ ur_result_t AsanInterceptor::prepareLaunch( // sync asan runtime data to device side UR_CALL(LaunchInfo.Data.syncToDevice(Queue)); - getContext()->logger.debug("launch_info {} (numLocalArgs={}, localArgs={})", - (void *)LaunchInfo.Data.getDevicePtr(), - LaunchInfo.Data.Host.NumLocalArgs, - (void *)LaunchInfo.Data.Host.LocalArgs); + getContext()->logger.info( + "LaunchInfo {} (device={}, debug={}, numLocalArgs={}, localArgs={})", + (void *)LaunchInfo.Data.getDevicePtr(), + ToString(LaunchInfo.Data.Host.DeviceTy), LaunchInfo.Data.Host.Debug, + LaunchInfo.Data.Host.NumLocalArgs, + (void *)LaunchInfo.Data.Host.LocalArgs); return UR_RESULT_SUCCESS; } diff --git a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp index 2270795969..27d5e37532 100644 --- a/source/loader/layers/sanitizer/asan/asan_interceptor.hpp +++ b/source/loader/layers/sanitizer/asan/asan_interceptor.hpp @@ -85,6 +85,9 @@ struct KernelInfo { ur_kernel_handle_t Handle; std::atomic RefCount = 1; + // sanitized kernel + bool IsInstrumented = false; + // lock 
this mutex if following fields are accessed ur_shared_mutex Mutex; std::unordered_map> BufferArgs; @@ -94,7 +97,8 @@ struct KernelInfo { // Need preserve the order of local arguments std::map LocalArgs; - explicit KernelInfo(ur_kernel_handle_t Kernel) : Handle(Kernel) { + explicit KernelInfo(ur_kernel_handle_t Kernel, bool IsInstrumented) + : Handle(Kernel), IsInstrumented(IsInstrumented) { [[maybe_unused]] auto Result = getContext()->urDdiTable.Kernel.pfnRetain(Kernel); assert(Result == UR_RESULT_SUCCESS); @@ -348,10 +352,11 @@ class AsanInterceptor { std::shared_ptr getKernelInfo(ur_kernel_handle_t Kernel) { std::shared_lock Guard(m_KernelMapMutex); - if (m_KernelMap.find(Kernel) != m_KernelMap.end()) { - return m_KernelMap[Kernel]; - } - return nullptr; + // find() never inserts; operator[] would add a null entry for an unknown + // kernel while only the shared lock is held. + auto It = m_KernelMap.find(Kernel); + assert(It != m_KernelMap.end()); + return It != m_KernelMap.end() ? It->second : nullptr; } const AsanOptions &getOptions() { return m_Options; } diff --git a/source/loader/layers/sanitizer/asan/asan_libdevice.hpp b/source/loader/layers/sanitizer/asan/asan_libdevice.hpp index a2d5ecd6be..4c6aaaeac8 100644 --- a/source/loader/layers/sanitizer/asan/asan_libdevice.hpp +++ b/source/loader/layers/sanitizer/asan/asan_libdevice.hpp @@ -66,7 +66,7 @@ struct AsanRuntimeData { uint32_t Debug = 0; int ReportFlag = 0; - AsanErrorReport Report[ASAN_MAX_NUM_REPORTS]; + AsanErrorReport Report[ASAN_MAX_NUM_REPORTS] = {}; }; constexpr unsigned ASAN_SHADOW_SCALE = 4; diff --git a/source/loader/layers/sanitizer/msan/msan_interceptor.cpp b/source/loader/layers/sanitizer/msan/msan_interceptor.cpp index 30a2e07359..b9fd9d1ed6 100644 --- a/source/loader/layers/sanitizer/msan/msan_interceptor.cpp +++ b/source/loader/layers/sanitizer/msan/msan_interceptor.cpp @@ -175,10 +175,7 @@ ur_result_t MsanInterceptor::registerSpirKernels(ur_program_handle_t Program) { Device, Program, kSPIR_MsanSpirKernelMetadata, &MetadataSize, &MetadataPtr); if (Result != UR_RESULT_SUCCESS) { - getContext()->logger.error( - "Can't get the pointer of <{}> under 
device {}: {}", - kSPIR_MsanSpirKernelMetadata, (void *)Device, Result); - return Result; + continue; } const uint64_t NumOfSpirKernel = MetadataSize / sizeof(SpirKernelInfo);