Skip to content

Commit

Permalink
Unify loading of precompiled GPU kernels in tests
Browse files Browse the repository at this point in the history
We have quite a few tests that load kernels via `MultiKernelLoaderSpec`. Most of these tests load a kernel using `AddCudaPtxInMemory` from an embedded PTX string.

The same is not possible for ROCm because there is no equivalent abstraction layer to PTX. So for ROCm we load an embedded precompiled HSACO (fat) binary using `AddCudaCubinInMemory` and litter the tests with preprocessor conditionals that call either the former or the latter depending on the enabled GPU backend.

This change unifies both approaches by introducing a genrule that can extract a CUDA or ROCm fat binary from a `gpu_kernel_library` target.

Concretely it does the following:

1. Create new target `:gpu_test_kernels_fatbin` which complements `:gpu_test_kernels` and provides the same kernels as a fat binary (CUBIN/HSACO) in memory.
2. Extend `gpu_kernel_test` so that it tests all kernel loading APIs — `AddCudaPtxInMemory`, `AddCudaCubinInMemory`, and `AddInProcessSymbol` — in the same way.
3. Use `AddCudaCubinInMemory` for all the runtime tests which now can share the same code between CUDA and ROCm.
4. Use the correct kernel names `AddI32` and `MulI32` in all tests (this matters since we no longer load from PTX and kernels are looked up by name).
5. Remove the old kernel embedding approach, which only worked for ROCm.

PiperOrigin-RevId: 664681229
  • Loading branch information
beckerhe authored and tensorflower-gardener committed Aug 19, 2024
1 parent a6b5bdb commit 5046013
Show file tree
Hide file tree
Showing 12 changed files with 286 additions and 220 deletions.
8 changes: 5 additions & 3 deletions third_party/xla/xla/service/gpu/runtime/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -161,11 +161,12 @@ xla_test(
"//xla/stream_executor:platform",
"//xla/stream_executor:platform_manager",
"//xla/stream_executor:stream_executor_memory_allocator",
"//xla/stream_executor/gpu:gpu_test_kernels",
"//xla/stream_executor/gpu:gpu_test_kernels_fatbin",
"//xla/tsl/lib/core:status_test_util",
"@com_google_absl//absl/functional:function_ref",
"@com_google_absl//absl/status",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/types:span",
"@local_tsl//tsl/platform:status",
"@local_tsl//tsl/platform:statusor",
"@local_tsl//tsl/platform:test",
Expand Down Expand Up @@ -423,7 +424,6 @@ cc_library(
name = "command_buffer_thunk",
srcs = ["command_buffer_thunk.cc"],
hdrs = ["command_buffer_thunk.h"],
local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]),
deps = [
":annotation",
":command_buffer_cmd",
Expand All @@ -450,7 +450,7 @@ cc_library(

xla_test(
name = "command_buffer_thunk_test",
srcs = if_gpu_is_configured(["command_buffer_thunk_test.cc"]),
srcs = ["command_buffer_thunk_test.cc"],
backend_tags = {
"gpu_a100": if_google(["config-cuda-only"]),
"gpu_v100": if_google(["config-cuda-only"]),
Expand Down Expand Up @@ -484,10 +484,12 @@ xla_test(
"//xla/stream_executor:platform_manager",
"//xla/stream_executor:stream_executor_memory_allocator",
"//xla/stream_executor/gpu:gpu_test_kernels",
"//xla/stream_executor/gpu:gpu_test_kernels_fatbin",
"//xla/stream_executor/gpu:gpu_types_header",
"//xla/tsl/lib/core:status_test_util",
"@com_google_absl//absl/status:statusor",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/types:span",
"@local_tsl//tsl/platform:statusor",
"@local_tsl//tsl/platform:test",
"@local_tsl//tsl/platform:test_main",
Expand Down
18 changes: 7 additions & 11 deletions third_party/xla/xla/service/gpu/runtime/command_buffer_cmd_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ limitations under the License.
#include "absl/functional/function_ref.h"
#include "absl/status/status.h"
#include "absl/strings/ascii.h"
#include "absl/types/span.h"
#include "xla/service/buffer_assignment.h"
#include "xla/service/gpu/buffer_allocations.h"
#include "xla/service/gpu/launch_dimensions.h"
Expand All @@ -30,7 +31,7 @@ limitations under the License.
#include "xla/service/service_executable_run_options.h"
#include "xla/stream_executor/command_buffer.h"
#include "xla/stream_executor/device_memory.h"
#include "xla/stream_executor/gpu/gpu_test_kernels.h"
#include "xla/stream_executor/gpu/gpu_test_kernels_fatbin.h"
#include "xla/stream_executor/platform.h"
#include "xla/stream_executor/platform_manager.h"
#include "xla/stream_executor/stream_executor.h"
Expand Down Expand Up @@ -352,20 +353,15 @@ TEST(CommandBufferCmdTest, LaunchCmd) {

// Prepare commands sequence for constructing command buffer.
CommandBufferCmdSequence commands;
commands.Emplace<LaunchCmd>(s0, "add", args, args_access,
commands.Emplace<LaunchCmd>(s0, "AddI32", args, args_access,
LaunchDimensions(1, 4),
/*shmem_bytes=*/0);

// Initialize command sequence and load device kernels.
Thunk::ExecutableSource source = {
#if defined(GOOGLE_CUDA)
/*text=*/se::gpu::internal::kAddI32Kernel,
/*binary=*/{}
#elif defined(TENSORFLOW_USE_ROCM)
/*text=*/{},
/*binary=*/se::gpu::internal::kAddI32KernelModule
#endif
};
TF_ASSERT_OK_AND_ASSIGN(std::vector<uint8_t> fatbin,
se::gpu::GetGpuTestKernelsFatbin());
Thunk::ExecutableSource source = {/*text=*/{},
/*binary=*/fatbin};

CommandBufferCmd::StateManager state;
TF_ASSERT_OK(commands.Initialize({executor, source}, state));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,14 @@ limitations under the License.
#include <cstdint>
#include <memory>
#include <optional>
#include <string>
#include <thread> // NOLINT
#include <utility>
#include <vector>

#include "absl/status/statusor.h"
#include "absl/strings/ascii.h"
#include "absl/types/span.h"
#include "xla/service/buffer_assignment.h"
#include "xla/service/gpu/buffer_allocations.h"
#include "xla/service/gpu/launch_dimensions.h"
Expand All @@ -41,6 +43,7 @@ limitations under the License.
#include "xla/stream_executor/device_memory.h"
#include "xla/stream_executor/device_memory_allocator.h"
#include "xla/stream_executor/gpu/gpu_test_kernels.h"
#include "xla/stream_executor/gpu/gpu_test_kernels_fatbin.h"
#include "xla/stream_executor/gpu/gpu_types.h" // IWYU pragma: keep
#include "xla/stream_executor/kernel.h"
#include "xla/stream_executor/kernel_spec.h"
Expand All @@ -64,27 +67,29 @@ namespace xla::gpu {
using MemoryAccess = CommandBufferCmd::MemoryAccess;
using KernelArgsPacking = se::MultiKernelLoaderSpec::KernelArgsPacking;

static se::StreamExecutor* GpuExecutor() {
namespace {
se::StreamExecutor* GpuExecutor() {
auto name =
absl::AsciiStrToUpper(PlatformUtil::CanonicalPlatformName("gpu").value());
auto* platform = se::PlatformManager::PlatformWithName(name).value();
return platform->ExecutorForDevice(0).value();
}

static Thunk::ExecutableSource ExecutableSource() {
Thunk::ExecutableSource source = {
#if defined(GOOGLE_CUDA)
/*text=*/se::gpu::internal::kAddI32Kernel,
/*binary=*/{}
#elif defined(TENSORFLOW_USE_ROCM)
/*text=*/{},
/*binary=*/se::gpu::internal::kAddI32KernelModule
#endif
};
return source;
struct OwningExecutableSource {
std::string text;
std::vector<uint8_t> binary;

explicit operator Thunk::ExecutableSource() const { return {text, binary}; }
};

absl::StatusOr<OwningExecutableSource> ExecutableSource() {
TF_ASSIGN_OR_RETURN(std::vector<uint8_t> fatbin,
se::gpu::GetGpuTestKernelsFatbin());
return OwningExecutableSource{/*text=*/{},
/*binary=*/fatbin};
}

static KernelArgsPacking CreateDefaultArgsPacking() {
KernelArgsPacking CreateDefaultArgsPacking() {
using Packed = absl::StatusOr<std::unique_ptr<se::KernelArgsPackedArrayBase>>;

return [=](const se::Kernel& kernel, const se::KernelArgs& args) -> Packed {
Expand All @@ -96,7 +101,7 @@ static KernelArgsPacking CreateDefaultArgsPacking() {
}

// Some of the tests rely on CUDA 12.3+ features.
static bool IsAtLeastCuda12300() {
bool IsAtLeastCuda12300() {
#if defined(TENSORFLOW_USE_ROCM)
return false;
#endif
Expand All @@ -107,8 +112,9 @@ static bool IsAtLeastCuda12300() {
}

// Give a short aliases to execution threads.
static constexpr auto s0 = ExecutionStreamId(0);
static constexpr auto s1 = ExecutionStreamId(1);
constexpr auto s0 = ExecutionStreamId(0);
constexpr auto s1 = ExecutionStreamId(1);
} // namespace

TEST(CommandBufferThunkTest, MemcpyCmd) {
se::StreamExecutor* executor = GpuExecutor();
Expand Down Expand Up @@ -428,7 +434,7 @@ TEST(CommandBufferThunkTest, LaunchCmd) {

// Prepare commands sequence for constructing command buffer.
CommandBufferCmdSequence commands;
commands.Emplace<LaunchCmd>(s0, "add", args, args_access,
commands.Emplace<LaunchCmd>(s0, "AddI32", args, args_access,
LaunchDimensions(1, 4),
/*shmem_bytes=*/0);

Expand All @@ -442,9 +448,10 @@ TEST(CommandBufferThunkTest, LaunchCmd) {
Thunk::ExecuteParams params = Thunk::ExecuteParams::Create(
run_options, allocations, stream.get(), stream.get(), nullptr, nullptr);

Thunk::ExecutableSource source = ExecutableSource();
TF_ASSERT_OK_AND_ASSIGN(OwningExecutableSource source, ExecutableSource());
TF_ASSERT_OK(
thunk.Initialize({executor, source, &allocations, stream.get()}));
thunk.Initialize({executor, static_cast<Thunk::ExecutableSource>(source),
&allocations, stream.get()}));

// Execute command buffer thunk and verify that it added the value.
TF_ASSERT_OK(thunk.ExecuteOnStream(params));
Expand Down Expand Up @@ -498,7 +505,7 @@ TEST(CommandBufferThunkTest, CustomAddKernelLaunchCmd) {
spec.AddInProcessSymbol(se::gpu::internal::GetAddI32Kernel(), "add");

auto custom_kernel =
CustomKernel("add", std::move(spec), se::BlockDim(),
CustomKernel("AddI32", std::move(spec), se::BlockDim(),
se::ThreadDim(4, 1, 1), /*shared_memory_bytes=*/0);

int64_t length = 4;
Expand All @@ -524,7 +531,7 @@ TEST(CommandBufferThunkTest, CustomAddKernelLaunchCmd) {

// Prepare commands sequence for constructing command buffer.
CommandBufferCmdSequence commands;
commands.Emplace<LaunchCmd>(s0, "add", args, args_access,
commands.Emplace<LaunchCmd>(s0, "AddI32", args, args_access,
LaunchDimensions(1, 4),
/*shmem_bytes=*/0);

Expand All @@ -538,9 +545,10 @@ TEST(CommandBufferThunkTest, CustomAddKernelLaunchCmd) {
Thunk::ExecuteParams params = Thunk::ExecuteParams::Create(
run_options, allocations, stream.get(), stream.get(), nullptr, nullptr);

Thunk::ExecutableSource source = ExecutableSource();
TF_ASSERT_OK_AND_ASSIGN(OwningExecutableSource source, ExecutableSource());
TF_ASSERT_OK(
thunk.Initialize({executor, source, &allocations, stream.get()}));
thunk.Initialize({executor, static_cast<Thunk::ExecutableSource>(source),
&allocations, stream.get()}));

// Execute command buffer thunk and verify that it added the value.
TF_ASSERT_OK(thunk.ExecuteOnStream(params));
Expand Down Expand Up @@ -880,10 +888,10 @@ TEST(CommandBufferThunkTest, MultipleLaunchCmd) {

// Prepare commands sequence for constructing command buffer.
CommandBufferCmdSequence commands;
commands.Emplace<LaunchCmd>(s0, "add", args, args_access,
commands.Emplace<LaunchCmd>(s0, "AddI32", args, args_access,
LaunchDimensions(1, 4),
/*shmem_bytes=*/0);
commands.Emplace<LaunchCmd>(s0, "add", args_1, args_access,
commands.Emplace<LaunchCmd>(s0, "AddI32", args_1, args_access,
LaunchDimensions(1, 4),
/*shmem_bytes=*/0);

Expand All @@ -897,9 +905,10 @@ TEST(CommandBufferThunkTest, MultipleLaunchCmd) {
Thunk::ExecuteParams params = Thunk::ExecuteParams::Create(
run_options, allocations, stream.get(), stream.get(), nullptr, nullptr);

Thunk::ExecutableSource source = ExecutableSource();
TF_ASSERT_OK_AND_ASSIGN(OwningExecutableSource source, ExecutableSource());
TF_ASSERT_OK(
thunk.Initialize({executor, source, &allocations, stream.get()}));
thunk.Initialize({executor, static_cast<Thunk::ExecutableSource>(source),
&allocations, stream.get()}));

// Execute command buffer thunk and verify that it added the value.
TF_ASSERT_OK(thunk.ExecuteOnStream(params));
Expand Down Expand Up @@ -994,7 +1003,7 @@ TEST(CommandBufferThunkTest, IfCmd) {

// Prepare commands sequence for `then` branch.
CommandBufferCmdSequence then_commands;
then_commands.Emplace<LaunchCmd>(s0, "add", args, args_access,
then_commands.Emplace<LaunchCmd>(s0, "AddI32", args, args_access,
LaunchDimensions(1, 4),
/*shmem_bytes=*/0);

Expand All @@ -1012,9 +1021,10 @@ TEST(CommandBufferThunkTest, IfCmd) {
Thunk::ExecuteParams params = Thunk::ExecuteParams::Create(
run_options, allocations, stream.get(), stream.get(), nullptr, nullptr);

Thunk::ExecutableSource source = ExecutableSource();
TF_ASSERT_OK_AND_ASSIGN(OwningExecutableSource source, ExecutableSource());
TF_ASSERT_OK(
thunk.Initialize({executor, source, &allocations, stream.get()}));
thunk.Initialize({executor, static_cast<Thunk::ExecutableSource>(source),
&allocations, stream.get()}));

// Execute command buffer thunk and verify that it added the value.
TF_ASSERT_OK(thunk.ExecuteOnStream(params));
Expand Down Expand Up @@ -1084,14 +1094,14 @@ TEST(CommandBufferThunkTest, IfElseCmd) {

{ // Then: b = a + a
auto args = {slice_a, slice_a, slice_b};
then_commands.Emplace<LaunchCmd>(s0, "add", args, args_access,
then_commands.Emplace<LaunchCmd>(s0, "AddI32", args, args_access,
LaunchDimensions(1, 4),
/*shmem_bytes=*/0);
}

{ // Else: b = b + b
auto args = {slice_b, slice_b, slice_b};
else_commands.Emplace<LaunchCmd>(s0, "add", args, args_access,
else_commands.Emplace<LaunchCmd>(s0, "AddI32", args, args_access,
LaunchDimensions(1, 4),
/*shmem_bytes=*/0);
}
Expand All @@ -1111,9 +1121,10 @@ TEST(CommandBufferThunkTest, IfElseCmd) {
Thunk::ExecuteParams params = Thunk::ExecuteParams::Create(
run_options, allocations, stream.get(), stream.get(), nullptr, nullptr);

Thunk::ExecutableSource source = ExecutableSource();
TF_ASSERT_OK_AND_ASSIGN(OwningExecutableSource source, ExecutableSource());
TF_ASSERT_OK(
thunk.Initialize({executor, source, &allocations, stream.get()}));
thunk.Initialize({executor, static_cast<Thunk::ExecutableSource>(source),
&allocations, stream.get()}));

// Execute command buffer thunk and verify that it added the value.
TF_ASSERT_OK(thunk.ExecuteOnStream(params));
Expand Down Expand Up @@ -1174,14 +1185,14 @@ TEST(CommandBufferThunkTest, CaseCmd) {

{ // Case 0: b = a + a
auto args = {slice_a, slice_a, slice_b};
branches[0].Emplace<LaunchCmd>(s0, "add", args, args_access,
branches[0].Emplace<LaunchCmd>(s0, "AddI32", args, args_access,
LaunchDimensions(1, 4),
/*shmem_bytes=*/0);
}

{ // Case 1: b = b + b
auto args = {slice_b, slice_b, slice_b};
branches[1].Emplace<LaunchCmd>(s0, "add", args, args_access,
branches[1].Emplace<LaunchCmd>(s0, "AddI32", args, args_access,
LaunchDimensions(1, 4),
/*shmem_bytes=*/0);
}
Expand All @@ -1200,9 +1211,10 @@ TEST(CommandBufferThunkTest, CaseCmd) {
Thunk::ExecuteParams params = Thunk::ExecuteParams::Create(
run_options, allocations, stream.get(), stream.get(), nullptr, nullptr);

Thunk::ExecutableSource source = ExecutableSource();
TF_ASSERT_OK_AND_ASSIGN(OwningExecutableSource source, ExecutableSource());
TF_ASSERT_OK(
thunk.Initialize({executor, source, &allocations, stream.get()}));
thunk.Initialize({executor, static_cast<Thunk::ExecutableSource>(source),
&allocations, stream.get()}));

// Execute command buffer thunk and verify that it added the value.
TF_ASSERT_OK(thunk.ExecuteOnStream(params));
Expand Down Expand Up @@ -1260,7 +1272,7 @@ TEST(CommandBufferThunkTest, ForCmd) {

// Prepare commands sequence for loop `body`.
CommandBufferCmdSequence body_commands;
body_commands.Emplace<LaunchCmd>(s0, "add", args, args_access,
body_commands.Emplace<LaunchCmd>(s0, "AddI32", args, args_access,
LaunchDimensions(1, 4),
/*shmem_bytes=*/0);

Expand All @@ -1279,9 +1291,10 @@ TEST(CommandBufferThunkTest, ForCmd) {
Thunk::ExecuteParams params = Thunk::ExecuteParams::Create(
run_options, allocations, stream.get(), stream.get(), nullptr, nullptr);

Thunk::ExecutableSource source = ExecutableSource();
TF_ASSERT_OK_AND_ASSIGN(OwningExecutableSource source, ExecutableSource());
TF_ASSERT_OK(
thunk.Initialize({executor, source, &allocations, stream.get()}));
thunk.Initialize({executor, static_cast<Thunk::ExecutableSource>(source),
&allocations, stream.get()}));

// Execute command buffer thunk and verify that it added the value 10 times.
TF_ASSERT_OK(thunk.ExecuteOnStream(params));
Expand Down
Loading

0 comments on commit 5046013

Please sign in to comment.