Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[L0 v2] implement deferred kernel deallocation #2451

Merged
merged 3 commits into from
Jan 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build-hw-reusable.yml
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ jobs:

- name: Test adapter specific
working-directory: ${{github.workspace}}/build
run: ctest -C ${{matrix.build_type}} --output-on-failure -L "adapter-specific" --timeout 180
run: ctest -C ${{matrix.build_type}} --output-on-failure -L "adapter-specific" -E "memcheck" --timeout 180
# Don't run adapter specific tests when building multiple adapters
if: ${{ matrix.adapter.other_name == '' }}

Expand Down
12 changes: 7 additions & 5 deletions scripts/benchmarks/benches/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,9 @@ def benchmarks(self) -> list[Benchmark]:

if options.ur is not None:
benches += [
SubmitKernelUR(self, 0),
SubmitKernelUR(self, 1),
SubmitKernelUR(self, 0, 0),
SubmitKernelUR(self, 1, 0),
SubmitKernelUR(self, 1, 1),
]

return benches
Expand Down Expand Up @@ -180,13 +181,14 @@ def bin_args(self) -> list[str]:
]

class SubmitKernelUR(ComputeBenchmark):
def __init__(self, bench, ioq):
def __init__(self, bench, ioq, measureCompletion):
self.ioq = ioq
self.measureCompletion = measureCompletion
super().__init__(bench, "api_overhead_benchmark_ur", "SubmitKernel")

def name(self):
order = "in order" if self.ioq else "out of order"
return f"api_overhead_benchmark_ur SubmitKernel {order}"
return f"api_overhead_benchmark_ur SubmitKernel {order}" + (" with measure completion" if self.measureCompletion else "")

def explicit_group(self):
return "SubmitKernel"
Expand All @@ -195,7 +197,7 @@ def bin_args(self) -> list[str]:
return [
f"--Ioq={self.ioq}",
"--DiscardEvents=0",
"--MeasureCompletion=0",
f"--MeasureCompletion={self.measureCompletion}",
"--iterations=100000",
"--Profiling=0",
"--NumKernels=10",
Expand Down
13 changes: 6 additions & 7 deletions source/adapters/level_zero/v2/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ ur_kernel_handle_t_::ur_kernel_handle_t_(
}

ur_result_t ur_kernel_handle_t_::release() {
if (!RefCount.decrementAndTest())
return UR_RESULT_SUCCESS;

// manually release kernels to allow errors to be propagated
for (auto &singleDeviceKernelOpt : deviceKernels) {
if (singleDeviceKernelOpt.has_value()) {
Expand All @@ -104,6 +107,8 @@ ur_result_t ur_kernel_handle_t_::release() {

UR_CALL_THROWS(ur::level_zero::urProgramRelease(hProgram));

delete this;

return UR_RESULT_SUCCESS;
}

Expand Down Expand Up @@ -362,13 +367,7 @@ ur_result_t urKernelRetain(
ur_result_t urKernelRelease(
ur_kernel_handle_t hKernel ///< [in] handle for the Kernel to release
) try {
if (!hKernel->RefCount.decrementAndTest())
return UR_RESULT_SUCCESS;

hKernel->release();
delete hKernel;

return UR_RESULT_SUCCESS;
return hKernel->release();
} catch (...) {
return exceptionToResult(std::current_exception());
}
Expand Down
18 changes: 17 additions & 1 deletion source/adapters/level_zero/v2/queue_immediate_in_order.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,13 +186,25 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() {

// Free deferred events
for (auto &hEvent : deferredEvents) {
hEvent->releaseDeferred();
UR_CALL(hEvent->releaseDeferred());
}
deferredEvents.clear();

// Free deferred kernels
for (auto &hKernel : submittedKernels) {
UR_CALL(hKernel->release());
}
submittedKernels.clear();

return UR_RESULT_SUCCESS;
}

// Remembers a kernel that has just been submitted to this queue and takes an
// extra reference on it. The matching release() calls are issued for every
// entry of submittedKernels in queueFinish().
void ur_queue_immediate_in_order_t::recordSubmittedKernel(
    ur_kernel_handle_t hKernel) {
    // Track the handle first and only then bump the reference count, so a
    // failed insertion cannot leave a reference that is never dropped.
    submittedKernels.emplace_back(hKernel);
    hKernel->RefCount.increment();
}

// Flush entry point of the immediate in-order queue: performs no work and
// always reports UR_RESULT_SUCCESS.
ur_result_t ur_queue_immediate_in_order_t::queueFlush() {
return UR_RESULT_SUCCESS;
}
Expand Down Expand Up @@ -251,6 +263,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunch(
(handler.commandList.get(), hZeKernel, &zeThreadGroupDimensions,
zeSignalEvent, waitList.second, waitList.first));

recordSubmittedKernel(hKernel);

return UR_RESULT_SUCCESS;
}

Expand Down Expand Up @@ -1063,6 +1077,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp(
(handler.commandList.get(), hZeKernel, &zeThreadGroupDimensions,
zeSignalEvent, waitList.second, waitList.first));

recordSubmittedKernel(hKernel);

return UR_RESULT_SUCCESS;
}

Expand Down
3 changes: 3 additions & 0 deletions source/adapters/level_zero/v2/queue_immediate_in_order.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ {
std::vector<ze_event_handle_t> waitList;

std::vector<ur_event_handle_t> deferredEvents;
std::vector<ur_kernel_handle_t> submittedKernels;

std::pair<ze_event_handle_t *, uint32_t>
getWaitListView(const ur_event_handle_t *phWaitEvents,
Expand Down Expand Up @@ -82,6 +83,8 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_handle_t_ {
const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent);

void recordSubmittedKernel(ur_kernel_handle_t hKernel);

public:
ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t,
const ur_queue_properties_t *);
Expand Down
24 changes: 24 additions & 0 deletions test/adapters/level_zero/v2/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,27 @@ add_adapter_test(level_zero_memory_residency
"UR_ADAPTERS_FORCE_LOAD=\"$<TARGET_FILE:ur_adapter_level_zero_v2>\""
"ZES_ENABLE_SYSMAN=1"
)

# The deferred-kernel test targets below are only registered on non-Windows
# builds.
if(NOT WIN32)
# Regular adapter test built from deferred_kernel.cpp; the environment points
# UR_ADAPTERS_FORCE_LOAD at the ur_adapter_level_zero_v2 library.
add_adapter_test(level_zero_deferred_kernel
FIXTURE KERNELS
SOURCES
deferred_kernel.cpp
ENVIRONMENT
"UR_ADAPTERS_FORCE_LOAD=\"$<TARGET_FILE:ur_adapter_level_zero_v2>\""
)

# NOTE(review): `backend` is presumably read by add_adapter_memcheck_test --
# confirm against that helper's definition before moving or removing this line.
set(backend level_zero)
# Memcheck variant of the same test: same fixture, sources and environment.
add_adapter_memcheck_test(level_zero_deferred_kernel
FIXTURE KERNELS
SOURCES
deferred_kernel.cpp
ENVIRONMENT
"UR_ADAPTERS_FORCE_LOAD=\"$<TARGET_FILE:ur_adapter_level_zero_v2>\""
)

# deferred_kernel.cpp includes <ze_api.h> and calls zeEventHostSignal directly,
# so the test target links against the Level Zero loader and its headers.
target_link_libraries(test-adapter-level_zero_deferred_kernel PRIVATE
LevelZeroLoader
LevelZeroLoader-Headers
)
endif()
166 changes: 166 additions & 0 deletions test/adapters/level_zero/v2/deferred_kernel.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
// Copyright (C) 2024 Intel Corporation
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
// See LICENSE.TXT
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <ze_api.h>

#include "../../../conformance/enqueue/helpers.h"
#include "../ze_helpers.hpp"
#include "uur/fixtures.h"
#include "uur/raii.h"

struct urEnqueueKernelLaunchTest : uur::urKernelExecutionTest {
void SetUp() override {
program_name = "fill";
UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp());
}

uint32_t val = 42;
size_t global_size = 32;
size_t global_offset = 0;
size_t n_dimensions = 1;
};
UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueKernelLaunchTest);

// Checks that a kernel released by the user while one of its launches is still
// pending stays valid until the queue has been finished.
TEST_P(urEnqueueKernelLaunchTest, DeferredKernelRelease) {
    ur_mem_handle_t buffer = nullptr;
    AddBuffer1DArg(sizeof(val) * global_size, &buffer);
    AddPodArg(val);

    // Native event that is only signaled from the host further down.
    auto zeEvent = createZeEvent(context, device);

    ur_event_handle_t event;
    ASSERT_SUCCESS(urEventCreateWithNativeHandle(
        reinterpret_cast<ur_native_handle_t>(zeEvent.get()), context, nullptr,
        &event));

    // Make the queue wait on the not-yet-signaled event so that the launch
    // enqueued behind it is expected to remain pending.
    ASSERT_SUCCESS(urEnqueueEventsWait(queue, 1, &event, nullptr));
    ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions,
                                         &global_offset, &global_size, nullptr,
                                         0, nullptr, nullptr));
    ASSERT_SUCCESS(urKernelRelease(kernel));

    // The user reference is gone. Move the handle into a local right away so
    // that a fatal assertion below cannot leave the fixture holding (and
    // presumably releasing again in TearDown) a handle this test already
    // released.
    ur_kernel_handle_t releasedKernel = kernel;
    kernel = nullptr;

    // Kernel should still be alive since kernel launch is pending
    ur_context_handle_t contextFromKernel;
    ASSERT_SUCCESS(urKernelGetInfo(releasedKernel, UR_KERNEL_INFO_CONTEXT,
                                   sizeof(ur_context_handle_t),
                                   &contextFromKernel, nullptr));

    ASSERT_EQ(context, contextFromKernel);

    // Unblock the queue by signaling the native event from the host.
    ze_event_handle_t ze_event = nullptr;
    ASSERT_SUCCESS(urEventGetNativeHandle(
        event, reinterpret_cast<ur_native_handle_t *>(&ze_event)));
    ASSERT_EQ(zeEventHostSignal(ze_event), ZE_RESULT_SUCCESS);

    ASSERT_SUCCESS(urQueueFinish(queue));

    ASSERT_SUCCESS(urEventRelease(event));
}

/// Fixture for the multi-queue deferred kernel release test. Builds the "foo"
/// program and creates one kernel from its first entry point; the kernel is
/// released by the test body, the program by TearDown().
struct urMultiQueueLaunchKernelDeferFreeTest
    : uur::urMultiQueueMultiDeviceTest<2> {
    std::string KernelName;

    static constexpr char ProgramName[] = "foo";
    static constexpr size_t ArraySize = 100;
    static constexpr uint32_t InitialValue = 1;

    ur_program_handle_t program = nullptr;
    ur_kernel_handle_t kernel = nullptr;

    void SetUp() override {
        // NOTE(review): this check runs before the base SetUp(). If `devices`
        // is only populated by the base fixture's SetUp(), it is still empty
        // here and the test is always skipped -- confirm against the fixture.
        if (devices.size() < 2) {
            GTEST_SKIP() << "This test requires at least 2 devices";
        }

        UUR_RETURN_ON_FATAL_FAILURE(
            uur::urMultiQueueMultiDeviceTest<2>::SetUp());

        KernelName = uur::KernelsEnvironment::instance->GetEntryPointNames(
            ProgramName)[0];

        std::shared_ptr<std::vector<char>> il_binary;
        std::vector<ur_program_metadata_t> metadatas{};

        uur::KernelsEnvironment::instance->LoadSource(ProgramName, platform,
                                                      il_binary);
        // il_binary is dereferenced below; fail cleanly instead of crashing
        // if the source could not be loaded.
        ASSERT_TRUE(il_binary != nullptr);

        const ur_program_properties_t properties = {
            UR_STRUCTURE_TYPE_PROGRAM_PROPERTIES, nullptr,
            static_cast<uint32_t>(metadatas.size()),
            metadatas.empty() ? nullptr : metadatas.data()};

        ASSERT_SUCCESS(urProgramCreateWithIL(context, il_binary->data(),
                                             il_binary->size(), &properties,
                                             &program));

        UUR_ASSERT_SUCCESS_OR_UNSUPPORTED(
            urProgramBuild(context, program, nullptr));
        ASSERT_SUCCESS(urKernelCreate(program, KernelName.data(), &kernel));
    }

    void TearDown() override {
        // kernel will be released in the actual test

        // SetUp() can return before the program exists (skip or early
        // failure), so only release a program that was actually created, and
        // report a failed release instead of silently dropping it.
        if (program != nullptr) {
            EXPECT_EQ(urProgramRelease(program), UR_RESULT_SUCCESS);
            program = nullptr;
        }
        UUR_RETURN_ON_FATAL_FAILURE(
            uur::urMultiQueueMultiDeviceTest<2>::TearDown());
    }
};

UUR_INSTANTIATE_PLATFORM_TEST_SUITE_P(urMultiQueueLaunchKernelDeferFreeTest);

// Launches the same kernel on two queues, each gated by its own host-signaled
// native event, releases the user reference, and checks that the kernel stays
// queryable until both queues have been finished.
TEST_P(urMultiQueueLaunchKernelDeferFreeTest, Success) {
    // One gating native event per device / queue.
    auto zeEvent1 = createZeEvent(context, devices[0]);
    auto zeEvent2 = createZeEvent(context, devices[1]);

    ur_event_handle_t event1;
    ASSERT_SUCCESS(urEventCreateWithNativeHandle(
        reinterpret_cast<ur_native_handle_t>(zeEvent1.get()), context, nullptr,
        &event1));
    ur_event_handle_t event2;
    ASSERT_SUCCESS(urEventCreateWithNativeHandle(
        reinterpret_cast<ur_native_handle_t>(zeEvent2.get()), context, nullptr,
        &event2));

    // Single work item, no offset.
    size_t launchOffset = 0;
    size_t launchSize = 1;

    // Queue 0: wait on event1, then launch.
    ASSERT_SUCCESS(urEnqueueEventsWait(queues[0], 1, &event1, nullptr));
    ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[0], kernel, 1, &launchOffset,
                                         &launchSize, nullptr, 0, nullptr,
                                         nullptr));

    // Queue 1: wait on event2, then launch.
    ASSERT_SUCCESS(urEnqueueEventsWait(queues[1], 1, &event2, nullptr));
    ASSERT_SUCCESS(urEnqueueKernelLaunch(queues[1], kernel, 1, &launchOffset,
                                         &launchSize, nullptr, 0, nullptr,
                                         nullptr));

    // Drop the user's reference while both launches are still gated.
    ASSERT_SUCCESS(urKernelRelease(kernel));

    // Kernel should still be alive since both kernels are pending
    ur_context_handle_t kernelContext;
    ASSERT_SUCCESS(urKernelGetInfo(kernel, UR_KERNEL_INFO_CONTEXT,
                                   sizeof(ur_context_handle_t), &kernelContext,
                                   nullptr));
    ASSERT_EQ(context, kernelContext);

    // Let queue 1 complete first.
    ASSERT_EQ(zeEventHostSignal(zeEvent2.get()), ZE_RESULT_SUCCESS);
    ASSERT_SUCCESS(urQueueFinish(queues[1]));

    // Kernel should still be alive since kernel launch is pending
    ASSERT_SUCCESS(urKernelGetInfo(kernel, UR_KERNEL_INFO_CONTEXT,
                                   sizeof(ur_context_handle_t), &kernelContext,
                                   nullptr));
    ASSERT_EQ(context, kernelContext);

    // Now let queue 0 complete as well.
    ASSERT_EQ(zeEventHostSignal(zeEvent1.get()), ZE_RESULT_SUCCESS);
    ASSERT_SUCCESS(urQueueFinish(queues[0]));

    ASSERT_SUCCESS(urEventRelease(event1));
    ASSERT_SUCCESS(urEventRelease(event2));
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{{IGNORE}}
pbalcer marked this conversation as resolved.
Show resolved Hide resolved
{{.*}} ERROR SUMMARY: 0 errors from 0 contexts {{.*}}
Loading