Skip to content

Commit

Permalink
PR tensorflow#7029: [ROCm] add fpus for rocm and enable xla_compile_l…
Browse files Browse the repository at this point in the history
…ib_test

Imported from GitHub PR openxla/xla#7029

1. We add `fpus_per_core` for the cost model and fusion analysis.
2. enable ROCm for new xla_compile_lib_test openxla/xla@a129c9b#diff-19845b853b7cf43ac606aca22262c15fdf2d56e5eb196a98f8d982e947123f1d

Please review, @xla-rotation — thanks in advance!
Copybara import of the project:

--
e9bcf2439d1384dc11e57f8e045b2e959b67dc62 by Chao Chen <cchen104@amd.com>:

add fpus for rocm and enable xla_compile_lib_test

Merging this change closes tensorflow#7029

PiperOrigin-RevId: 583947247
  • Loading branch information
i-chaochen authored and tensorflower-gardener committed Nov 20, 2023
1 parent 0641f01 commit 6589436
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ stream_executor::DeviceDescription TestGpuDeviceInfo::AMDMI210DeviceInfo() {
b.set_shared_memory_per_core(64 * 1024);
b.set_threads_per_core_limit(2048);
b.set_core_count(104);
b.set_fpus_per_core(0);
b.set_fpus_per_core(128);
b.set_block_dim_limit_x(2'147'483'647);
b.set_block_dim_limit_y(2'147'483'647);
b.set_block_dim_limit_z(2'147'483'647);
Expand Down
4 changes: 2 additions & 2 deletions third_party/xla/xla/service/gpu/gpu_device_info_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ TEST(DeviceInfoTest, DeviceInfoIsCorrect) {
/*shared_memory_per_block_optin=*/0,
/*shared_memory_per_core=*/64 * 1024,
/*threads_per_core_limit=*/2560, /*core_count=*/120,
/*fpus_per_core=*/0, /*block_dim_limit_x=*/2'147'483'647,
/*fpus_per_core=*/128, /*block_dim_limit_x=*/2'147'483'647,
/*block_dim_limit_y=*/2'147'483'647,
/*block_dim_limit_z=*/2'147'483'647,
/*memory_bandwidth=*/1228800000000,
Expand All @@ -136,7 +136,7 @@ TEST(DeviceInfoTest, DeviceInfoIsCorrect) {
/*shared_memory_per_block_optin=*/0,
/*shared_memory_per_core=*/64 * 1024,
/*threads_per_core_limit=*/2560, /*core_count=*/60,
/*fpus_per_core=*/0, /*block_dim_limit_x=*/2'147'483'647,
/*fpus_per_core=*/64, /*block_dim_limit_x=*/2'147'483'647,
/*block_dim_limit_y=*/2'147'483'647,
/*block_dim_limit_z=*/2'147'483'647,
/*memory_bandwidth=*/256000000000,
Expand Down
13 changes: 13 additions & 0 deletions third_party/xla/xla/stream_executor/rocm/rocm_gpu_executor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,18 @@ bool GpuExecutor::UnloadModule(ModuleHandle module_handle) {
return UnloadGpuBinary(gpu_binary);
}

namespace {
// Returns the number of FP32 units per compute unit (CU) for the given AMD
// GCN architecture string (e.g. "gfx90a:sramecc+:xnack-").
//
// Source:
// https://www.amd.com/content/dam/amd/en/documents/instinct-business-docs/white-papers/amd-cdna2-white-paper.pdf
//
// gfx906 (Vega 20 / MI50-MI60) has 64 FP32 FLOPs per clock per CU; gfx908
// (MI100) and gfx90a (MI200 series) have 128. Unknown architectures fall
// back to 128, matching the original behavior.
int fpus_per_core(const std::string& gcn_arch_name) {
  // Prefix compare without allocating a temporary (unlike substr()).
  if (gcn_arch_name.compare(0, 6, "gfx906") == 0) {
    return 64;
  }
  return 128;  // gfx90a and gfx908 -> 128
}
}  // namespace

tsl::StatusOr<std::shared_ptr<DeviceMemoryBase>>
GpuExecutor::CreateOrShareConstant(Stream* stream,
const std::vector<uint8_t>& content) {
Expand Down Expand Up @@ -893,6 +905,7 @@ GpuExecutor::CreateDeviceDescription(int device_ordinal) {
GpuDriver::GetMaxSharedMemoryPerBlock(device).value());
int core_count = GpuDriver::GetMultiprocessorCount(device).value();
builder.set_core_count(core_count);
builder.set_fpus_per_core(fpus_per_core(gcn_arch_name));
builder.set_threads_per_core_limit(
GpuDriver::GetMaxThreadsPerMultiprocessor(device).value());
builder.set_registers_per_block_limit(
Expand Down
4 changes: 3 additions & 1 deletion third_party/xla/xla/tools/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -689,7 +689,9 @@ xla_test(
":data/add.hlo",
"//xla/service:xla_aot_compile_test_gpu_target_config.prototxt",
],
local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]),
local_defines = if_cuda_is_configured(["GOOGLE_CUDA=1"]) + if_rocm_is_configured([
"TENSORFLOW_USE_ROCM=1",
]),
deps = [
":xla_compile_lib",
"//xla:util",
Expand Down
7 changes: 6 additions & 1 deletion third_party/xla/xla/tools/xla_compile_lib_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,12 @@ using ::tsl::testing::StatusIs;
#if XLA_TEST_BACKEND_CPU
static constexpr absl::string_view kPlatformName = "Host";
#elif XLA_TEST_BACKEND_GPU
static constexpr absl::string_view kPlatformName = "CUDA";
static constexpr absl::string_view kPlatformName =
#if TENSORFLOW_USE_ROCM
"ROCM";
#else
"CUDA";
#endif
#endif // XLA_TEST_BACKEND_CPU

class XlaCompileLibTest : public HloTestBase {
Expand Down

0 comments on commit 6589436

Please sign in to comment.