diff --git a/third_party/xla/xla/service/gpu/gpu_device_info_for_tests.cc b/third_party/xla/xla/service/gpu/gpu_device_info_for_tests.cc index 5f88847b3352a4..8eb44d45aa56eb 100644 --- a/third_party/xla/xla/service/gpu/gpu_device_info_for_tests.cc +++ b/third_party/xla/xla/service/gpu/gpu_device_info_for_tests.cc @@ -53,7 +53,7 @@ stream_executor::DeviceDescription TestGpuDeviceInfo::AMDMI210DeviceInfo() { b.set_shared_memory_per_core(64 * 1024); b.set_threads_per_core_limit(2048); b.set_core_count(104); - b.set_fpus_per_core(0); + b.set_fpus_per_core(128); b.set_block_dim_limit_x(2'147'483'647); b.set_block_dim_limit_y(2'147'483'647); b.set_block_dim_limit_z(2'147'483'647); diff --git a/third_party/xla/xla/service/gpu/gpu_device_info_test.cc b/third_party/xla/xla/service/gpu/gpu_device_info_test.cc index cb8d69f4c62592..c9e2ae245a7535 100644 --- a/third_party/xla/xla/service/gpu/gpu_device_info_test.cc +++ b/third_party/xla/xla/service/gpu/gpu_device_info_test.cc @@ -119,7 +119,7 @@ TEST(DeviceInfoTest, DeviceInfoIsCorrect) { /*shared_memory_per_block_optin=*/0, /*shared_memory_per_core=*/64 * 1024, /*threads_per_core_limit=*/2560, /*core_count=*/120, - /*fpus_per_core=*/0, /*block_dim_limit_x=*/2'147'483'647, + /*fpus_per_core=*/128, /*block_dim_limit_x=*/2'147'483'647, /*block_dim_limit_y=*/2'147'483'647, /*block_dim_limit_z=*/2'147'483'647, /*memory_bandwidth=*/1228800000000, @@ -136,7 +136,7 @@ TEST(DeviceInfoTest, DeviceInfoIsCorrect) { /*shared_memory_per_block_optin=*/0, /*shared_memory_per_core=*/64 * 1024, /*threads_per_core_limit=*/2560, /*core_count=*/60, - /*fpus_per_core=*/0, /*block_dim_limit_x=*/2'147'483'647, + /*fpus_per_core=*/64, /*block_dim_limit_x=*/2'147'483'647, /*block_dim_limit_y=*/2'147'483'647, /*block_dim_limit_z=*/2'147'483'647, /*memory_bandwidth=*/256000000000, diff --git a/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc b/third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc index 3b7e6dd0a178c6..bd596857070335 
namespace {
// Returns the value reported through set_fpus_per_core() for the device whose
// GCN architecture name is `gcn_arch_name`.
//
// Source:
// https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna2-white-paper.pdf
//
// Names beginning with "gfx906" yield 64. Every other name, including gfx90a
// and gfx908, yields 128. Only the leading characters are inspected, so any
// suffix following the architecture id is ignored.
int fpus_per_core(const std::string& gcn_arch_name) {
  constexpr char kGfx906Prefix[] = "gfx906";
  // compare() clamps the compared range to the string's length, so names
  // shorter than the prefix simply compare unequal; no temporary substring
  // is allocated.
  const bool is_gfx906 =
      gcn_arch_name.compare(0, sizeof(kGfx906Prefix) - 1, kGfx906Prefix) == 0;
  return is_gfx906 ? 64 : 128;
}
}  // namespace
using ::tsl::testing::StatusIs; #if XLA_TEST_BACKEND_CPU static constexpr absl::string_view kPlatformName = "Host"; #elif XLA_TEST_BACKEND_GPU -static constexpr absl::string_view kPlatformName = "CUDA"; +static constexpr absl::string_view kPlatformName = +#if TENSORFLOW_USE_ROCM + "ROCM"; +#else + "CUDA"; +#endif #endif // XLA_TEST_BACKEND_CPU class XlaCompileLibTest : public HloTestBase {