Skip to content

Commit

Permalink
PR tensorflow#7029: [ROCm] add fpus for rocm and enable xla_compile_l…
Browse files Browse the repository at this point in the history
…ib_test

Imported from GitHub PR openxla/xla#7029

1. We add `fpus_per_core` for the cost model and fusion analysis.
2. enable ROCm for new xla_compile_lib_test openxla/xla@a129c9b#diff-19845b853b7cf43ac606aca22262c15fdf2d56e5eb196a98f8d982e947123f1d

Please review, @xla-rotation — thanks in advance!
Copybara import of the project:

--
e9bcf2439d1384dc11e57f8e045b2e959b67dc62 by Chao Chen <cchen104@amd.com>:

add fpus for rocm and enable xla_compile_lib_test

Merging this change closes tensorflow#7029

PiperOrigin-RevId: 583947247
  • Loading branch information
i-chaochen authored and tensorflower-gardener committed Nov 20, 2023
1 parent 0641f01 commit 6589436
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ stream_executor::DeviceDescription TestGpuDeviceInfo::AMDMI210DeviceInfo() {
b.set_shared_memory_per_core(64 * 1024);
b.set_threads_per_core_limit(2048);
b.set_core_count(104);
b.set_fpus_per_core(0);
b.set_fpus_per_core(128);
b.set_block_dim_limit_x(2'147'483'647);
b.set_block_dim_limit_y(2'147'483'647);
b.set_block_dim_limit_z(2'147'483'647);
Expand Down
4 changes: 2 additions & 2 deletions third_party/xla/xla/service/gpu/gpu_device_info_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ TEST(DeviceInfoTest, DeviceInfoIsCorrect) {
/*shared_memory_per_block_optin=*/0,
/*shared_memory_per_core=*/64 * 1024,
/*threads_per_core_limit=*/2560, /*core_count=*/120,
/*fpus_per_core=*/0, /*block_dim_limit_x=*/2'147'483'647,
/*fpus_per_core=*/128, /*block_dim_limit_x=*/2'147'483'647,
/*block_dim_limit_y=*/2'147'483'647,
/*block_dim_limit_z=*/2'147'483'647,
/*memory_bandwidth=*/1228800000000,
Expand All @@ -136,7 +136,7 @@ TEST(DeviceInfoTest, DeviceInfoIsCorrect) {
/*shared_memory_per_block_optin=*/0,
/*shared_memory_per_core=*/64 * 1024,
/*threads_per_core_limit=*/2560, /*core_count=*/60,
/*fpus_per_core=*/0, /*block_dim_limit_x=*/2'147'483'647,
/*fpus_per_core=*/64, /*block_dim_limit_x=*/2'147'483'647,
/*block_dim_limit_y=*/2'147'483'647,
/*block_dim_limit_z=*/2'147'483'647,
/*memory_bandwidth=*/256000000000,
Expand Down
13 changes: 13 additions & 0 deletions third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,18 @@ bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
return UnloadGpuBinary(gpu_binary);
}

namespace {
// Returns the number of FP32 units per compute unit (CU) for the given AMD
// GCN architecture string (e.g. "gfx90a:sramecc+:xnack-").
//
// Source:
// https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna2-white-paper.pdf
//
// gfx906 (Vega 20 / MI50-MI60) has 64 FP32 FLOPs per clock per CU; gfx908
// (MI100) and gfx90a (MI200 series) have 128. Unknown architectures fall
// back to 128, matching the original behavior.
int fpus_per_core(const std::string& gcn_arch_name) {
  // Prefix compare without allocating a temporary (unlike substr()).
  if (gcn_arch_name.compare(0, 6, "gfx906") == 0) {
    return 64;
  }
  return 128;  // gfx90a and gfx908 -> 128
}
}  // namespace

tsl::StatusOr<std::shared_ptr<DeviceMemoryBase>>
GpuExecutor::CreateOrShareConstant(Stream* stream,
const std::vector<uint8_t>& content) {
Expand Down Expand Up @@ -893,6 +905,7 @@ GpuExecutor::CreateDeviceDescription(int device_ordinal) {
GpuDriver::GetMaxSharedMemoryPerBlock(device).value());
int core_count = GpuDriver::GetMultiprocessorCount(device).value();
builder.set_core_count(core_count);
builder.set_fpus_per_core(fpus_per_core(gcn_arch_name));
builder.set_threads_per_core_limit(
GpuDriver::GetMaxThreadsPerMultiprocessor(device).value());
builder.set_registers_per_block_limit(
Expand Down
4 changes: 3 additions & 1 deletion third_party/xla/xla/tools/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -689,7 +689,9 @@ xla_test(
":data/add.hlo",
"//xla/service:xla_aot_compile_test_gpu_target_config.prototxt",
],
local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]),
local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([
"TENSORFLOW_USE_ROCM=1",
]),
deps = [
":xla_compile_lib",
"//xla:util",
Expand Down
7 changes: 6 additions & 1 deletion third_party/xla/xla/tools/xla_compile_lib_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,12 @@ using ::tsl::testing::StatusIs;
#if XLA_TEST_BACKEND_CPU
static constexpr absl::string_view kPlatformName = "Host";
#elif XLA_TEST_BACKEND_GPU
static constexpr absl::string_view kPlatformName = "CUDA";
static constexpr absl::string_view kPlatformName =
#if TENSORFLOW_USE_ROCM
"ROCM";
#else
"CUDA";
#endif
#endif // XLA_TEST_BACKEND_CPU

class XlaCompileLibTest : public HloTestBase {
Expand Down

0 comments on commit 6589436

Please sign in to comment.